19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.io.CharSource;
23 import java.io.IOException;
24 import java.io.Reader;
25 import java.util.HashMap;
26 import java.util.List;
28 import java.util.concurrent.atomic.AtomicInteger;
29 import java.util.logging.Level;
30 import java.util.stream.Collectors;
31 import org.openide.util.Lookup;
32 import org.openide.util.NbBundle;
33 import org.openide.util.NbBundle.Messages;
34 import org.openide.util.lookup.Lookups;
68 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
69 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
70 "SolrConnectionCheck.Port=Invalid port number.",
71 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
72 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
73 "CannotRunFileTypeDetection=Unable to run file type detection."
// MIME types treated as archive/compressed containers. Files matching one of
// these types are indexed metadata-only by this module (see indexFile), since
// their embedded contents are handled by a separate extraction path.
// NOTE(review): this extraction is missing several original entries (elided
// source lines) — do not treat the visible list as exhaustive.
81 private static final List<String> ARCHIVE_MIME_TYPES
84 "application/x-7z-compressed",
85 "application/x-ace-compressed",
86 "application/x-alz-compressed",
88 "application/vnd.ms-cab-compressed",
89 "application/x-cfs-compressed",
90 "application/x-dgc-compressed",
91 "application/x-apple-diskimage",
92 "application/x-gca-compressed",
96 "application/x-rar-compressed",
97 "application/x-stuffit",
98 "application/x-stuffitx",
100 "application/x-archive",
101 "application/x-executable",
102 "application/x-gzip",
105 "application/x-cpio",
106 "application/x-shar",
108 "application/x-bzip",
109 "application/x-bzip2",
110 "application/x-lzip",
111 "application/x-lzma",
112 "application/x-lzop",
114 "application/x-compress");
119 enum StringsExtractOptions {
124 enum UpdateFrequency {
130 NONE(Integer.MAX_VALUE),
132 private final int time;
134 UpdateFrequency(
int time) {
144 private Ingester ingester = null;
150 private boolean startedSearching =
false;
153 private boolean initialized =
false;
155 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
156 private int instanceNum = 0;
169 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
// Record the ingest status of one file within one ingest job.
// The per-job map is created lazily; access is serialized on the shared
// ingestStatus map, which other readers (e.g. the summary code) also lock on.
synchronized (ingestStatus) {
    // computeIfAbsent replaces the original get/null-check/put sequence and
    // drops the redundant second put: the map reference was already stored
    // under ingestJobId, so re-putting it was a no-op.
    ingestStatus.computeIfAbsent(ingestJobId, k -> new HashMap<>())
            .put(fileId, status);
}
192 this.settings = settings;
193 instanceNum = instanceCount.getAndIncrement();
202 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
203 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
204 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
205 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
213 if (server.coreIsOpen() ==
false) {
218 Index indexInfo = server.getIndexInfo();
219 if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
220 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
222 if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
223 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
226 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
235 ingester = Ingester.getDefault();
236 this.context = context;
253 port = Integer.parseInt(properties.getPort());
254 }
catch (NumberFormatException ex) {
256 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
259 kwsService.
tryConnect(properties.getHost(), port);
266 if (!server.isRunning()) {
267 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
271 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
278 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
282 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
283 boolean hasKeywordsForSearch =
false;
285 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
286 hasKeywordsForSearch =
true;
290 if (!hasKeywordsForSearch) {
292 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
298 Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
299 stringsConfig.
setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
300 stringsConfig.
setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
303 stringsExtractionContext = Lookups.fixed(stringsConfig);
311 if (initialized ==
false)
313 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
318 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
323 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
339 if (!startedSearching) {
343 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
344 IngestSearchRunner.getInstance().startJob(context, keywordListNames);
345 startedSearching =
true;
357 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
359 if ((initialized ==
false) || (context == null)) {
364 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum);
365 IngestSearchRunner.getInstance().stopJob(jobId);
371 IngestSearchRunner.getInstance().endJob(jobId);
377 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
379 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
381 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
384 synchronized (ingestStatus) {
385 ingestStatus.remove(jobId);
396 stringsExtractionContext = null;
404 int text_ingested = 0;
405 int metadata_ingested = 0;
406 int strings_ingested = 0;
411 synchronized (ingestStatus) {
412 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
413 if (ingestStatusForJob == null) {
421 case METADATA_INGESTED:
424 case STRINGS_INGESTED:
427 case SKIPPED_ERROR_TEXTEXTRACT:
430 case SKIPPED_ERROR_INDEXING:
433 case SKIPPED_ERROR_IO:
442 StringBuilder msg =
new StringBuilder();
443 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
444 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
445 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
446 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
447 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
448 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
449 msg.append(
"</table>");
450 String indexStats = msg.toString();
451 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
453 if (error_index > 0) {
455 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
456 }
else if (error_io + error_text > 0) {
457 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
458 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
484 imageConfig.
setOCREnabled(KeywordSearchSettings.getOcrOption());
486 Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
494 Map<String, String> metadata = extractor.
getMetadata();
495 CharSource formattedMetadata = getMetaDataCharSource(metadata);
497 finalReader = CharSource.concat(
new CharSource() {
500 public Reader openStream()
throws IOException {
503 }, formattedMetadata).openStream();
504 }
catch (IOException ex) {
505 logger.log(Level.WARNING, String.format(
"Could not format extracted metadata for file %s [id=%d]",
506 aFile.getName(), aFile.getId()), ex);
508 finalReader = fileText;
511 return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
526 "KeywordSearchIngestModule.metadataTitle=METADATA"
529 return CharSource.wrap(
new StringBuilder(
530 String.format(
"\n\n------------------------------%s------------------------------\n\n",
531 Bundle.KeywordSearchIngestModule_metadataTitle()))
532 .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
533 .map(entry -> entry.getKey() +
": " + entry.getValue())
534 .collect(Collectors.joining(
"\n"))
552 Reader extractedTextReader = stringsExtractor.
getReader();
557 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
562 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
575 private void indexFile(AbstractFile aFile,
boolean indexContent) {
578 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
581 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
585 extractStringsAndIndex(aFile);
589 final long size = aFile.getSize();
592 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
597 ingester.indexMetaDataOnly(aFile);
599 }
catch (IngesterException ex) {
601 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
609 String fileType = fileTypeDetector.
getMIMEType(aFile);
613 if (ARCHIVE_MIME_TYPES.contains(fileType)) {
618 ingester.indexMetaDataOnly(aFile);
620 }
catch (IngesterException ex) {
622 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
627 boolean wasTextAdded =
false;
635 if (fileType.equals(
"application/octet-stream")) {
636 extractStringsAndIndex(aFile);
639 if (!extractTextAndIndex(aFile)) {
647 }
catch (IngesterException e) {
648 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
649 + aFile.getName(), e);
651 }
catch (Exception e) {
652 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
653 + aFile.getName(), e);
657 if ((wasTextAdded ==
false) && (aFile.getNameExtension().equalsIgnoreCase(
"txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
661 TextFileExtractor textFileExtractor =
new TextFileExtractor();
662 Reader textReader = textFileExtractor.getReader(aFile);
663 if (textReader == null) {
664 logger.log(Level.INFO,
"Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
665 }
else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
669 }
catch (IngesterException ex) {
670 logger.log(Level.WARNING,
"Unable to index as unicode", ex);
672 logger.log(Level.INFO,
"Could not extract text with TextFileExtractor", ex);
677 if (wasTextAdded ==
false) {
678 extractStringsAndIndex(aFile);
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
CharSource getMetaDataCharSource(Map< String, String > metadata)
int queryNumIndexedChunks()
void tryConnect(String host, int port)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
METADATA_INGESTED
No content was extracted, so only the file's metadata was indexed.
String getCaseDirectory()
void startUp(IngestJobContext context)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because index engine had problems.
boolean extractStringsAndIndex(AbstractFile aFile)
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
Lookup stringsExtractionContext
static void warn(String title, String message)
boolean extractTextAndIndex(AbstractFile aFile)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted based on the known file type, and that text was indexed.