19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.io.CharSource;
23 import java.io.IOException;
24 import java.io.Reader;
25 import java.util.HashMap;
26 import java.util.List;
28 import java.util.concurrent.atomic.AtomicInteger;
29 import java.util.logging.Level;
30 import java.util.stream.Collectors;
31 import org.apache.tika.mime.MimeTypes;
32 import org.openide.util.Lookup;
33 import org.openide.util.NbBundle;
34 import org.openide.util.NbBundle.Messages;
35 import org.openide.util.lookup.Lookups;
69 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
70 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
71 "SolrConnectionCheck.Port=Invalid port number.",
72 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
73 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
74 "CannotRunFileTypeDetection=Unable to run file type detection."
82 private static final List<String> ARCHIVE_MIME_TYPES
85 "application/x-7z-compressed",
86 "application/x-ace-compressed",
87 "application/x-alz-compressed",
89 "application/vnd.ms-cab-compressed",
90 "application/x-cfs-compressed",
91 "application/x-dgc-compressed",
92 "application/x-apple-diskimage",
93 "application/x-gca-compressed",
97 "application/x-rar-compressed",
98 "application/x-stuffit",
99 "application/x-stuffitx",
100 "application/x-gtar",
101 "application/x-archive",
102 "application/x-executable",
103 "application/x-gzip",
106 "application/x-cpio",
107 "application/x-shar",
109 "application/x-bzip",
110 "application/x-bzip2",
111 "application/x-lzip",
112 "application/x-lzma",
113 "application/x-lzop",
115 "application/x-compress");
120 enum StringsExtractOptions {
125 enum UpdateFrequency {
131 NONE(Integer.MAX_VALUE),
133 private final int time;
135 UpdateFrequency(
int time) {
145 private Ingester ingester = null;
151 private boolean startedSearching =
false;
154 private boolean initialized =
false;
156 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
157 private int instanceNum = 0;
170 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
181 synchronized (ingestStatus) {
182 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
183 if (ingestStatusForJob == null) {
184 ingestStatusForJob =
new HashMap<>();
185 ingestStatus.put(ingestJobId, ingestStatusForJob);
187 ingestStatusForJob.put(fileId, status);
188 ingestStatus.put(ingestJobId, ingestStatusForJob);
193 this.settings = settings;
194 instanceNum = instanceCount.getAndIncrement();
203 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
204 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
205 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
206 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
214 if (server.coreIsOpen() ==
false) {
219 Index indexInfo = server.getIndexInfo();
220 if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
221 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
223 if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
224 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
227 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
236 ingester = Ingester.getDefault();
237 this.context = context;
254 port = Integer.parseInt(properties.getPort());
255 }
catch (NumberFormatException ex) {
257 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
260 kwsService.
tryConnect(properties.getHost(), port);
267 if (!server.isRunning()) {
268 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
272 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
279 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
283 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
284 boolean hasKeywordsForSearch =
false;
286 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
287 hasKeywordsForSearch =
true;
291 if (!hasKeywordsForSearch) {
293 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
299 Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
300 stringsConfig.
setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
301 stringsConfig.
setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
304 stringsExtractionContext = Lookups.fixed(stringsConfig);
312 if (initialized ==
false)
314 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
319 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
324 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
340 if (!startedSearching) {
344 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
345 IngestSearchRunner.getInstance().startJob(context, keywordListNames);
346 startedSearching =
true;
358 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
360 if ((initialized ==
false) || (context == null)) {
365 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum);
366 IngestSearchRunner.getInstance().stopJob(jobId);
372 IngestSearchRunner.getInstance().endJob(jobId);
378 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
380 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
382 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
385 synchronized (ingestStatus) {
386 ingestStatus.remove(jobId);
397 stringsExtractionContext = null;
405 int text_ingested = 0;
406 int metadata_ingested = 0;
407 int strings_ingested = 0;
412 synchronized (ingestStatus) {
413 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
414 if (ingestStatusForJob == null) {
422 case METADATA_INGESTED:
425 case STRINGS_INGESTED:
428 case SKIPPED_ERROR_TEXTEXTRACT:
431 case SKIPPED_ERROR_INDEXING:
434 case SKIPPED_ERROR_IO:
443 StringBuilder msg =
new StringBuilder();
444 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
445 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
446 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
447 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
448 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
449 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
450 msg.append(
"</table>");
451 String indexStats = msg.toString();
452 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
454 if (error_index > 0) {
456 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
457 }
else if (error_io + error_text > 0) {
458 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
459 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
485 imageConfig.
setOCREnabled(KeywordSearchSettings.getOcrOption());
487 Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
495 Map<String, String> metadata = extractor.
getMetadata();
496 CharSource formattedMetadata = getMetaDataCharSource(metadata);
498 finalReader = CharSource.concat(
new CharSource() {
501 public Reader openStream()
throws IOException {
504 }, formattedMetadata).openStream();
505 }
catch (IOException ex) {
506 logger.log(Level.WARNING, String.format(
"Could not format extracted metadata for file %s [id=%d]",
507 aFile.getName(), aFile.getId()), ex);
509 finalReader = fileText;
512 return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
527 "KeywordSearchIngestModule.metadataTitle=METADATA"
530 return CharSource.wrap(
new StringBuilder(
531 String.format(
"\n\n------------------------------%s------------------------------\n\n",
532 Bundle.KeywordSearchIngestModule_metadataTitle()))
533 .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
534 .map(entry -> entry.getKey() +
": " + entry.getValue())
535 .collect(Collectors.joining(
"\n"))
553 Reader extractedTextReader = stringsExtractor.
getReader();
558 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
563 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
576 private void indexFile(AbstractFile aFile,
boolean indexContent) {
579 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
588 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
589 || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
590 || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase(
"txt"))) {
594 extractStringsAndIndex(aFile);
598 final long size = aFile.getSize();
601 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
606 ingester.indexMetaDataOnly(aFile);
608 }
catch (IngesterException ex) {
610 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
618 String fileType = fileTypeDetector.
getMIMEType(aFile);
622 if (ARCHIVE_MIME_TYPES.contains(fileType)) {
627 ingester.indexMetaDataOnly(aFile);
629 }
catch (IngesterException ex) {
631 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
636 boolean wasTextAdded =
false;
644 if (fileType.equals(MimeTypes.OCTET_STREAM)) {
645 extractStringsAndIndex(aFile);
648 if (!extractTextAndIndex(aFile)) {
656 }
catch (IngesterException e) {
657 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
658 + aFile.getName(), e);
660 }
catch (Exception e) {
661 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
662 + aFile.getName(), e);
666 if ((wasTextAdded ==
false) && (aFile.getNameExtension().equalsIgnoreCase(
"txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
669 wasTextAdded = indexTextFile(aFile);
673 if (wasTextAdded ==
false) {
674 extractStringsAndIndex(aFile);
687 Reader textReader = textFileExtractor.
getReader();
688 if (textReader == null) {
689 logger.log(Level.INFO,
"Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
690 }
else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
695 }
catch (IngesterException | IOException ex) {
696 logger.log(Level.WARNING,
"Unable to index " + aFile.getName(), ex);
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
CharSource getMetaDataCharSource(Map< String, String > metadata)
int queryNumIndexedChunks()
void tryConnect(String host, int port)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
METADATA_INGESTED
No content, so we only ingest the file's metadata.
String getCaseDirectory()
void startUp(IngestJobContext context)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because the indexing engine had problems.
boolean extractStringsAndIndex(AbstractFile aFile)
boolean indexTextFile(AbstractFile aFile)
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
Lookup stringsExtractionContext
static void warn(String title, String message)
boolean extractTextAndIndex(AbstractFile aFile)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted based on the detected file type and then indexed.