19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.collect.ImmutableList;
22 import java.io.Reader;
23 import java.util.HashMap;
24 import java.util.List;
26 import java.util.concurrent.atomic.AtomicInteger;
27 import java.util.logging.Level;
28 import org.openide.util.Lookup;
29 import org.openide.util.NbBundle;
30 import org.openide.util.NbBundle.Messages;
31 import org.openide.util.lookup.Lookups;
65 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
66 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
67 "SolrConnectionCheck.Port=Invalid port number.",
68 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
69 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
70 "CannotRunFileTypeDetection=Unable to run file type detection."
78 private static final List<String> ARCHIVE_MIME_TYPES
81 "application/x-7z-compressed",
82 "application/x-ace-compressed",
83 "application/x-alz-compressed",
85 "application/vnd.ms-cab-compressed",
86 "application/x-cfs-compressed",
87 "application/x-dgc-compressed",
88 "application/x-apple-diskimage",
89 "application/x-gca-compressed",
93 "application/x-rar-compressed",
94 "application/x-stuffit",
95 "application/x-stuffitx",
97 "application/x-archive",
98 "application/x-executable",
102 "application/x-cpio",
103 "application/x-shar",
105 "application/x-bzip",
106 "application/x-bzip2",
107 "application/x-lzip",
108 "application/x-lzma",
109 "application/x-lzop",
111 "application/x-compress");
116 enum StringsExtractOptions {
121 enum UpdateFrequency {
127 NONE(Integer.MAX_VALUE),
129 private final int time;
131 UpdateFrequency(
int time) {
141 private Ingester ingester = null;
147 private boolean startedSearching =
false;
150 private boolean initialized =
false;
152 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
153 private int instanceNum = 0;
166 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
177 synchronized (ingestStatus) {
178 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
179 if (ingestStatusForJob == null) {
180 ingestStatusForJob =
new HashMap<>();
181 ingestStatus.put(ingestJobId, ingestStatusForJob);
183 ingestStatusForJob.put(fileId, status);
184 ingestStatus.put(ingestJobId, ingestStatusForJob);
189 this.settings = settings;
190 instanceNum = instanceCount.getAndIncrement();
199 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
200 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
201 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
202 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
210 if (server.coreIsOpen() ==
false) {
215 Index indexInfo = server.getIndexInfo();
216 if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
217 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
219 if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
220 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
223 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
232 ingester = Ingester.getDefault();
233 this.context = context;
250 port = Integer.parseInt(properties.getPort());
251 }
catch (NumberFormatException ex) {
253 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
256 kwsService.
tryConnect(properties.getHost(), port);
263 if (!server.isRunning()) {
264 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
268 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
275 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
279 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
280 boolean hasKeywordsForSearch =
false;
282 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
283 hasKeywordsForSearch =
true;
287 if (!hasKeywordsForSearch) {
289 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
295 Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
296 stringsConfig.
setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
297 stringsConfig.
setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
300 stringsExtractionContext = Lookups.fixed(stringsConfig);
308 if (initialized ==
false)
310 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
315 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
320 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
336 if (!startedSearching) {
340 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
341 IngestSearchRunner.getInstance().startJob(context, keywordListNames);
342 startedSearching =
true;
354 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
356 if ((initialized ==
false) || (context == null)) {
361 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum);
362 IngestSearchRunner.getInstance().stopJob(jobId);
368 IngestSearchRunner.getInstance().endJob(jobId);
374 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
376 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
378 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
381 synchronized (ingestStatus) {
382 ingestStatus.remove(jobId);
393 stringsExtractionContext = null;
401 int text_ingested = 0;
402 int metadata_ingested = 0;
403 int strings_ingested = 0;
408 synchronized (ingestStatus) {
409 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
410 if (ingestStatusForJob == null) {
418 case METADATA_INGESTED:
421 case STRINGS_INGESTED:
424 case SKIPPED_ERROR_TEXTEXTRACT:
427 case SKIPPED_ERROR_INDEXING:
430 case SKIPPED_ERROR_IO:
439 StringBuilder msg =
new StringBuilder();
440 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
441 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
442 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
443 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
444 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
445 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
446 msg.append(
"</table>");
447 String indexStats = msg.toString();
448 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
450 if (error_index > 0) {
452 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
453 }
else if (error_io + error_text > 0) {
454 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
455 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
482 imageConfig.
setOCREnabled(KeywordSearchSettings.getOcrOption());
484 Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
488 Reader extractedTextReader = extractor.
getReader();
490 return Ingester.getDefault().indexText(extractedTextReader, aFile.getId(), aFile.getName(), aFile, context);
511 Reader extractedTextReader = stringsExtractor.
getReader();
516 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
521 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
534 private void indexFile(AbstractFile aFile,
boolean indexContent) {
537 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
540 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
544 extractStringsAndIndex(aFile);
548 final long size = aFile.getSize();
551 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
556 ingester.indexMetaDataOnly(aFile);
558 }
catch (IngesterException ex) {
560 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
568 String fileType = fileTypeDetector.
getMIMEType(aFile);
572 if (ARCHIVE_MIME_TYPES.contains(fileType)) {
577 ingester.indexMetaDataOnly(aFile);
579 }
catch (IngesterException ex) {
581 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
586 boolean wasTextAdded =
false;
594 if (fileType.equals(
"application/octet-stream")) {
595 extractStringsAndIndex(aFile);
598 if (!extractTextAndIndex(aFile, fileType)) {
606 }
catch (IngesterException e) {
607 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
608 + aFile.getName(), e);
610 }
catch (Exception e) {
611 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
612 + aFile.getName(), e);
616 if ((wasTextAdded ==
false) && (aFile.getNameExtension().equalsIgnoreCase(
"txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
620 TextFileExtractor textFileExtractor =
new TextFileExtractor();
621 Reader textReader = textFileExtractor.getReader(aFile);
622 if (textReader == null) {
623 logger.log(Level.INFO,
"Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
624 }
else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
628 }
catch (IngesterException ex) {
629 logger.log(Level.WARNING,
"Unable to index as unicode", ex);
631 logger.log(Level.INFO,
"Could not extract text with TextFileExtractor", ex);
636 if (wasTextAdded ==
false) {
637 extractStringsAndIndex(aFile);
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
int queryNumIndexedChunks()
void tryConnect(String host, int port)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
METADATA_INGESTED
The file had no content, so only its metadata was indexed.
String getCaseDirectory()
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
void startUp(IngestJobContext context)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because the indexing engine encountered problems.
boolean extractStringsAndIndex(AbstractFile aFile)
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
Lookup stringsExtractionContext
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted using the detected file type and was indexed.