package org.sleuthkit.autopsy.keywordsearch;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import org.openide.util.NbBundle;
enum UpdateFrequency {

    NONE(Integer.MAX_VALUE);

    private final int time;

    UpdateFrequency(int time) {
        this.time = time;
    }

    int getTime() {
        return time;
    }
}
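// The time value is the period between periodic keyword searches of the index;
// NONE maps to Integer.MAX_VALUE, which effectively disables periodic searching.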
private final KeywordSearchJobSettings settings;

private static final AtomicInteger instanceCount = new AtomicInteger(0);
private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>();

static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
    synchronized (ingestStatus) { // HashMap is not thread-safe; ingest runs on multiple threads
        Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
        if (ingestStatusForJob == null) {
            ingestStatusForJob = new HashMap<>();
            ingestStatus.put(ingestJobId, ingestStatusForJob);
        }
        ingestStatusForJob.put(fileId, status);
    }
}
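// ingestStatus maps an ingest job ID to a per-file map (file ID -> IngestStatus);
// postIndexSummary() tallies these values into the job's indexing report, and the
// job's entry is removed (ingestStatus.remove(jobId)) once the job finishes.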
instanceNum = instanceCount.getAndIncrement();

logger.log(Level.INFO, "Initializing instance {0}", instanceNum);
throw new IngestModuleException(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.startUp.fileTypeDetectorInitializationException.msg"));
try {
    if (!server.isRunning()) {
        String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
        logger.log(Level.SEVERE, msg);
        String details = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.tryStopSolrMsg", msg);
        throw new IngestModuleException(msg + "<br />" + details);
    }
} catch (KeywordSearchModuleException ex) {
    logger.log(Level.WARNING, "Error checking if Solr server is running while initializing ingest", ex);
    String msg = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.badInitMsg");
    String details = NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.tryStopSolrMsg", msg);
    throw new IngestModuleException(msg + "<br />" + details);
}
throw new IngestModuleException(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.init.exception.errConnToSolr.msg", ex.getMessage()));
List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
boolean hasKeywordsForSearch = false;
for (KeywordList keywordList : keywordLists) {
    if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
        hasKeywordsForSearch = true;
        break;
    }
}
if (!hasKeywordsForSearch) {
    services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(),
            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
}
stringExtractor = new StringsTextExtractor(this);
stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());

final StringBuilder sbScripts = new StringBuilder();
for (SCRIPT s : KeywordSearchSettings.getStringExtractScripts()) {
    sbScripts.append(s.name()).append(" ");
}
logger.log(Level.INFO, "Using string extract scripts: {0}", sbScripts.toString());
textExtractors = new ArrayList<>();
textExtractors.add(new HtmlTextExtractor(this));
textExtractors.add(new TikaTextExtractor(this));
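// Registration order matters: extractTextAndIndex() picks the first extractor
// whose isSupported() returns true, so the more specific HTML extractor must
// come before the general-purpose Tika extractor.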
if (!initialized) { //error initializing indexing/Solr
    logger.log(Level.WARNING, "Skipping processing, module not initialized, file: {0}", abstractFile.getName());
    putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
    return ProcessResult.OK;
}
if (!startedSearching) {
    List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
    SearchRunner.getInstance().startJob(jobId, dataSourceId, keywordListNames);
    startedSearching = true;
}
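// The search job is started lazily, on the first file processed, and only once
// per ingest job; the startedSearching flag guards against re-entry.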
logger.log(Level.INFO, "Instance {0}", instanceNum);

if (!initialized) {
    return;
}
ingestStatus.remove(jobId);
try {
    final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
    final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
    logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles);
    logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks);
} catch (NoOpenCoreException | KeywordSearchModuleException ex) {
    logger.log(Level.WARNING, "Error executing Solr query to check number of indexed files/chunks: ", ex);
}
logger.log(Level.INFO, "stop()");

textExtractors.clear();
textExtractors = null;
stringExtractor = null;
int text_ingested = 0;
int metadata_ingested = 0;
int strings_ingested = 0;
int error_text = 0;
int error_index = 0;
int error_io = 0;
Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
if (ingestStatusForJob == null) {
    return; // nothing was ingested for this job
}

for (IngestStatus status : ingestStatusForJob.values()) {
    switch (status) {
        case TEXT_INGESTED:
            text_ingested++;
            break;
        case METADATA_INGESTED:
            metadata_ingested++;
            break;
        case STRINGS_INGESTED:
            strings_ingested++;
            break;
        case SKIPPED_ERROR_TEXTEXTRACT:
            error_text++;
            break;
        case SKIPPED_ERROR_INDEXING:
            error_index++;
            break;
        case SKIPPED_ERROR_IO:
            error_io++;
            break;
    }
}
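// The tallies are rendered below as a small HTML table because the report is
// meant for display in the ingest messages UI, which accepts HTML details.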
StringBuilder msg = new StringBuilder();
msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>");
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(),
        "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>");
msg.append("</table>");
String indexStats = msg.toString();
logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats);
if (error_index > 0) {
    MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(),
            "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
            NbBundle.getMessage(this.getClass(),
                    "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
} else if (error_io + error_text > 0) {
    MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(),
            "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
            NbBundle.getMessage(this.getClass(),
                    "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
}
TextExtractor fileExtract = null;

//go over the available text extractors in order and pick the first (most specific) one that supports the file
for (TextExtractor fe : textExtractors) {
    if (fe.isSupported(aFile, detectedFormat)) {
        fileExtract = fe;
        break;
    }
}

if (fileExtract == null) {
    logger.log(Level.INFO, "No text extractor found for file id:{0}, name: {1}, detected format: {2}",
            new Object[]{aFile.getId(), aFile.getName(), detectedFormat});
    return false;
}

return fileExtract.index(aFile);
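// Extractors index content in chunks, which is why shutDown() reports both an
// indexed-files count and an indexed-chunks count via queryNumIndexedFiles()
// and queryNumIndexedChunks().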
private boolean extractStringsAndIndex(AbstractFile aFile) {
    try {
        if (stringExtractor.index(aFile)) {
            putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
            return true;
        } else {
            logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
                    new Object[]{aFile.getName(), aFile.getId()});
            putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
        }
    } catch (IngesterException ex) {
        logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName()
                + "' (id: " + aFile.getId() + ").", ex);
        putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_IO);
    }
    return false;
}
private boolean isTextExtractSupported(AbstractFile aFile, String detectedFormat) {
    for (TextExtractor extractor : textExtractors) {
        if (extractor.isContentTypeSpecific()
                && extractor.isSupported(aFile, detectedFormat)) {
            return true;
        }
    }
    return false;
}
final long size = aFile.getSize();

//if we are not to index content, or the file is a dir or has no content, index metadata only
if (!indexContent || aFile.isDir() || size == 0) {
    try {
        ingester.ingest(aFile, false); //meta-data only
        putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
    } catch (IngesterException ex) {
        putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
        logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
    }
    return;
}
String detectedFormat;
try {
    detectedFormat = fileTypeDetector.getFileType(aFile);
} catch (TskCoreException ex) {
    logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex);
    return;
}
// Archive formats are skipped here (the archive module opens them); only their metadata is indexed.
if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
    try {
        ingester.ingest(aFile, false); //meta-data only
        putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
    } catch (IngesterException ex) {
        putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
        logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex);
    }
    return;
}
boolean wasTextAdded = false;
if (isTextExtractSupported(aFile, detectedFormat)) {
    //extract text with one of the extractors, divide into chunks, and index with Solr
    try {
        if (extractTextAndIndex(aFile, detectedFormat)) {
            putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
            wasTextAdded = true;
        } else {
            logger.log(Level.WARNING, "Failed to extract text and ingest, file ''{0}'' (id: {1}).",
                    new Object[]{aFile.getName(), aFile.getId()});
            putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
        }
    } catch (IngesterException e) {
        logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", "
                + aFile.getName(), e);
        putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
    } catch (Exception e) {
        logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", "
                + aFile.getName(), e);
        putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
    }
}
if (!wasTextAdded) {
    extractStringsAndIndex(aFile);
}
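// Strings extraction is the fallback of last resort: it runs only when no
// content-specific extractor was supported or text extraction failed above.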
/**
 * Per-file ingest status, tallied by postIndexSummary() into the job's
 * indexing report.
 */
private enum IngestStatus {

    TEXT_INGESTED, ///< Text was extracted by knowing the file type, and indexed
    STRINGS_INGESTED, ///< Strings were extracted from the file and indexed
    METADATA_INGESTED, ///< No content, so only the metadata was indexed
    SKIPPED_ERROR_INDEXING, ///< File was skipped because the index engine had problems
    SKIPPED_ERROR_TEXTEXTRACT, ///< File was skipped because of text extraction issues
    SKIPPED_ERROR_IO ///< File was skipped because of an error reading it
}