19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.collect.ImmutableMap;
23 import com.google.common.io.CharSource;
24 import java.io.IOException;
25 import java.io.Reader;
26 import java.text.ParseException;
27 import java.text.SimpleDateFormat;
28 import java.util.ArrayList;
29 import java.util.Collection;
30 import java.util.Date;
31 import java.util.HashMap;
32 import java.util.List;
33 import static java.util.Locale.US;
35 import java.util.concurrent.atomic.AtomicInteger;
36 import java.util.logging.Level;
37 import java.util.stream.Collectors;
38 import org.apache.tika.mime.MimeTypes;
39 import org.openide.util.Lookup;
40 import org.openide.util.NbBundle;
41 import org.openide.util.NbBundle.Messages;
42 import org.openide.util.lookup.Lookups;
80 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
81 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
82 "SolrConnectionCheck.Port=Invalid port number.",
83 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
84 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
85 "CannotRunFileTypeDetection=Unable to run file type detection."
93 private static final List<String> ARCHIVE_MIME_TYPES
96 "application/x-7z-compressed",
97 "application/x-ace-compressed",
98 "application/x-alz-compressed",
100 "application/vnd.ms-cab-compressed",
101 "application/x-cfs-compressed",
102 "application/x-dgc-compressed",
103 "application/x-apple-diskimage",
104 "application/x-gca-compressed",
108 "application/x-rar-compressed",
109 "application/x-stuffit",
110 "application/x-stuffitx",
111 "application/x-gtar",
112 "application/x-archive",
113 "application/x-executable",
114 "application/x-gzip",
117 "application/x-cpio",
118 "application/x-shar",
120 "application/x-bzip",
121 "application/x-bzip2",
122 "application/x-lzip",
123 "application/x-lzma",
124 "application/x-lzop",
126 "application/x-compress");
128 private static final List<String> METADATA_DATE_TYPES
134 private static final Map<String, BlackboardAttribute.ATTRIBUTE_TYPE> METADATA_TYPES_MAP = ImmutableMap.<String, BlackboardAttribute.ATTRIBUTE_TYPE>builder()
135 .put(
"Last-Save-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED)
136 .put(
"Last-Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID)
137 .put(
"Creation-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED)
138 .put(
"Company", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION)
139 .put(
"Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER)
140 .put(
"Application-Name", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
141 .put(
"Last-Printed", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME)
142 .put(
"Producer", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
143 .put(
"Title", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION)
144 .put(
"pdf:PDFVersion", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION)
151 enum StringsExtractOptions {
156 enum UpdateFrequency {
162 NONE(Integer.MAX_VALUE),
164 private final int time;
166 UpdateFrequency(
int time) {
176 private Ingester ingester = null;
182 private boolean startedSearching =
false;
185 private boolean initialized =
false;
187 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
188 private int instanceNum = 0;
201 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
212 synchronized (ingestStatus) {
213 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
214 if (ingestStatusForJob == null) {
215 ingestStatusForJob =
new HashMap<>();
216 ingestStatus.put(ingestJobId, ingestStatusForJob);
218 ingestStatusForJob.put(fileId, status);
219 ingestStatus.put(ingestJobId, ingestStatusForJob);
224 this.settings = settings;
225 instanceNum = instanceCount.getAndIncrement();
234 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
235 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
236 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
237 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
245 if (server.coreIsOpen() ==
false) {
250 Index indexInfo = server.getIndexInfo();
251 if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
252 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
254 if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
255 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
258 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
267 ingester = Ingester.getDefault();
268 this.context = context;
285 port = Integer.parseInt(properties.getPort());
286 }
catch (NumberFormatException ex) {
288 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
291 kwsService.
tryConnect(properties.getHost(), port);
298 if (!server.isRunning()) {
299 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
303 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
310 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
314 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
315 boolean hasKeywordsForSearch =
false;
317 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
318 hasKeywordsForSearch =
true;
322 if (!hasKeywordsForSearch) {
324 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
330 Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
331 stringsConfig.
setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
332 stringsConfig.
setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
335 stringsExtractionContext = Lookups.fixed(stringsConfig);
343 if (initialized ==
false)
345 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
350 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
355 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
371 if (!startedSearching) {
375 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
376 IngestSearchRunner.getInstance().startJob(context, keywordListNames);
377 startedSearching =
true;
389 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
391 if ((initialized ==
false) || (context == null)) {
396 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum);
397 IngestSearchRunner.getInstance().stopJob(jobId);
403 IngestSearchRunner.getInstance().endJob(jobId);
409 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
411 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
413 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
416 synchronized (ingestStatus) {
417 ingestStatus.remove(jobId);
428 stringsExtractionContext = null;
436 int text_ingested = 0;
437 int metadata_ingested = 0;
438 int strings_ingested = 0;
443 synchronized (ingestStatus) {
444 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
445 if (ingestStatusForJob == null) {
453 case METADATA_INGESTED:
456 case STRINGS_INGESTED:
459 case SKIPPED_ERROR_TEXTEXTRACT:
462 case SKIPPED_ERROR_INDEXING:
465 case SKIPPED_ERROR_IO:
474 StringBuilder msg =
new StringBuilder();
475 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
476 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
477 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
478 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
479 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
480 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
481 msg.append(
"</table>");
482 String indexStats = msg.toString();
483 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
485 if (error_index > 0) {
487 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
488 }
else if (error_io + error_text > 0) {
489 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
490 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
515 private boolean extractTextAndIndex(AbstractFile aFile, Map<String, String> extractedMetadata)
throws IngesterException {
517 imageConfig.
setOCREnabled(KeywordSearchSettings.getOcrOption());
519 Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
527 Map<String, String> metadata = extractor.
getMetadata();
528 if (!metadata.isEmpty()) {
532 extractedMetadata.putAll(metadata);
534 CharSource formattedMetadata = getMetaDataCharSource(metadata);
536 finalReader = CharSource.concat(
new CharSource() {
539 public Reader openStream()
throws IOException {
542 }, formattedMetadata).openStream();
543 }
catch (IOException ex) {
544 logger.log(Level.WARNING, String.format(
"Could not format extracted metadata for file %s [id=%d]",
545 aFile.getName(), aFile.getId()), ex);
547 finalReader = fileText;
550 return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
561 Collection<BlackboardAttribute> attributes =
new ArrayList<>();
562 Collection<BlackboardArtifact> bbartifacts =
new ArrayList<>();
563 for (Map.Entry<String, String> entry : metadata.entrySet()) {
564 if (METADATA_TYPES_MAP.containsKey(entry.getKey())) {
565 BlackboardAttribute bba = checkAttribute(entry.getKey(), entry.getValue());
571 if (!attributes.isEmpty()) {
573 BlackboardArtifact bbart = aFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_METADATA);
574 bbart.addAttributes(attributes);
575 bbartifacts.add(bbart);
576 }
catch (TskCoreException ex) {
578 logger.log(Level.WARNING, String.format(
"Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex);
581 if (!bbartifacts.isEmpty()) {
586 logger.log(Level.WARNING, String.format(
"Unable to post blackboard artifacts for file $s.", aFile.getParentPath() + aFile.getName()) , ex);
596 if (!value.isEmpty() && value.charAt(0) !=
' ') {
597 if (METADATA_DATE_TYPES.contains(key)) {
598 SimpleDateFormat metadataDateFormat =
new SimpleDateFormat(
"yyyy-MM-dd HH:mm:ss", US);
599 Long metadataDateTime = Long.valueOf(0);
601 String metadataDate = value.replaceAll(
"T",
" ").replaceAll(
"Z",
"");
602 Date usedDate = metadataDateFormat.parse(metadataDate);
603 metadataDateTime = usedDate.getTime()/1000;
604 return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, metadataDateTime);
605 }
catch (ParseException ex) {
607 logger.log(Level.WARNING, String.format(
"Failed to parse date/time %s for metadata attribute %s.", value, key), ex);
611 return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, value);
628 "KeywordSearchIngestModule.metadataTitle=METADATA"
631 return CharSource.wrap(
new StringBuilder(
632 String.format(
"\n\n------------------------------%s------------------------------\n\n",
633 Bundle.KeywordSearchIngestModule_metadataTitle()))
634 .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
635 .map(entry -> entry.getKey() +
": " + entry.getValue())
636 .collect(Collectors.joining(
"\n"))
654 Reader extractedTextReader = stringsExtractor.
getReader();
659 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
664 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
677 private void indexFile(AbstractFile aFile,
boolean indexContent) {
680 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
689 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
690 || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
691 || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase(
"txt"))) {
695 extractStringsAndIndex(aFile);
699 final long size = aFile.getSize();
702 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
707 ingester.indexMetaDataOnly(aFile);
709 }
catch (IngesterException ex) {
711 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
719 String fileType = fileTypeDetector.
getMIMEType(aFile);
723 if (ARCHIVE_MIME_TYPES.contains(fileType)) {
728 ingester.indexMetaDataOnly(aFile);
730 }
catch (IngesterException ex) {
732 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
737 boolean wasTextAdded =
false;
738 Map<String, String> extractedMetadata =
new HashMap<>();
746 if (fileType.equals(MimeTypes.OCTET_STREAM)) {
747 extractStringsAndIndex(aFile);
750 if (!extractTextAndIndex(aFile, extractedMetadata)) {
758 }
catch (IngesterException e) {
759 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
760 + aFile.getName(), e);
762 }
catch (Exception e) {
763 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
764 + aFile.getName(), e);
768 if ((wasTextAdded ==
false) && (aFile.getNameExtension().equalsIgnoreCase(
"txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
771 wasTextAdded = indexTextFile(aFile);
775 if (wasTextAdded ==
false) {
776 extractStringsAndIndex(aFile);
782 if (!extractedMetadata.isEmpty()) {
783 createMetadataArtifact(aFile, extractedMetadata);
796 Reader textReader = textFileExtractor.
getReader();
797 if (textReader == null) {
798 logger.log(Level.INFO,
"Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
799 }
else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
805 logger.log(Level.WARNING,
"Unable to index " + aFile.getName(), ex);
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
CharSource getMetaDataCharSource(Map< String, String > metadata)
int queryNumIndexedChunks()
void tryConnect(String host, int port)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
METADATA_INGESTED
No content, so only the file's metadata was indexed.
String getCaseDirectory()
boolean extractTextAndIndex(AbstractFile aFile, Map< String, String > extractedMetadata)
void startUp(IngestJobContext context)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because the indexing engine had problems.
boolean extractStringsAndIndex(AbstractFile aFile)
boolean indexTextFile(AbstractFile aFile)
void createMetadataArtifact(AbstractFile aFile, Map< String, String > metadata)
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
SleuthkitCase getSleuthkitCase()
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
BlackboardAttribute checkAttribute(String key, String value)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
Lookup stringsExtractionContext
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted based on the known file type and then indexed.