19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.collect.ImmutableMap;
23 import com.google.common.collect.ImmutableSet;
24 import com.google.common.io.CharSource;
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.text.ParseException;
28 import java.text.SimpleDateFormat;
29 import java.util.ArrayList;
30 import java.util.Collection;
31 import java.util.Date;
32 import java.util.HashMap;
33 import java.util.List;
34 import static java.util.Locale.US;
36 import java.util.Optional;
37 import java.util.concurrent.atomic.AtomicInteger;
38 import java.util.logging.Level;
39 import java.util.stream.Collectors;
40 import org.apache.tika.mime.MimeTypes;
41 import org.openide.util.Lookup;
42 import org.openide.util.NbBundle;
43 import org.openide.util.NbBundle.Messages;
44 import org.openide.util.lookup.Lookups;
83 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
84 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
85 "SolrConnectionCheck.Port=Invalid port number.",
86 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
87 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
88 "CannotRunFileTypeDetection=Unable to run file type detection."
// Size threshold (100 * 1024) compared against aFile.getSize() when limited OCR
// is enabled: an image file must be strictly larger than this, or be a DERIVED
// file, to qualify for OCR. Presumably expressed in bytes - confirm against
// AbstractFile.getSize().
92 private static final int LIMITED_OCR_SIZE_MIN = 100 * 1024;
98 static final List<String> ARCHIVE_MIME_TYPES
101 "application/x-7z-compressed",
102 "application/x-ace-compressed",
103 "application/x-alz-compressed",
105 "application/vnd.ms-cab-compressed",
106 "application/x-cfs-compressed",
107 "application/x-dgc-compressed",
108 "application/x-apple-diskimage",
109 "application/x-gca-compressed",
113 "application/x-rar-compressed",
114 "application/x-stuffit",
115 "application/x-stuffitx",
116 "application/x-gtar",
117 "application/x-archive",
118 "application/x-executable",
119 "application/x-gzip",
122 "application/x-cpio",
123 "application/x-shar",
125 "application/x-bzip",
126 "application/x-bzip2",
127 "application/x-lzip",
128 "application/x-lzma",
129 "application/x-lzop",
131 "application/x-compress");
133 private static final List<String> METADATA_DATE_TYPES
139 private static final Map<String, BlackboardAttribute.ATTRIBUTE_TYPE> METADATA_TYPES_MAP = ImmutableMap.<String, BlackboardAttribute.ATTRIBUTE_TYPE>builder()
140 .put(
"Last-Save-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED)
141 .put(
"Last-Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID)
142 .put(
"Creation-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED)
143 .put(
"Company", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION)
144 .put(
"Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER)
145 .put(
"Application-Name", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
146 .put(
"Last-Printed", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME)
147 .put(
"Producer", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
148 .put(
"Title", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION)
149 .put(
"pdf:PDFVersion", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION)
// MIME type prefix tested with String.startsWith() to recognize image files
// when deciding whether a file qualifies for limited OCR.
152 private static final String IMAGE_MIME_TYPE_PREFIX =
"image/";
155 private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
157 "application/msword",
158 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
159 "application/vnd.ms-powerpoint",
160 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
161 "application/vnd.ms-excel",
162 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
168 enum StringsExtractOptions {
175 private Ingester ingester = null;
182 private boolean initialized =
false;
184 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
185 private int instanceNum = 0;
198 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
209 synchronized (ingestStatus) {
210 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
211 if (ingestStatusForJob == null) {
212 ingestStatusForJob =
new HashMap<>();
213 ingestStatus.put(ingestJobId, ingestStatusForJob);
215 ingestStatusForJob.put(fileId, status);
216 ingestStatus.put(ingestJobId, ingestStatusForJob);
221 this.settings = settings;
222 instanceNum = instanceCount.getAndIncrement();
231 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
232 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
233 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
234 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
242 if (settings.isIndexToSolrEnabled()) {
244 if (server.coreIsOpen() ==
false) {
249 Index indexInfo = server.getIndexInfo();
250 if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
251 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
254 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
264 ingester = Ingester.getDefault();
265 this.context = context;
282 port = Integer.parseInt(properties.getPort());
283 }
catch (NumberFormatException ex) {
285 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
288 kwsService.
tryConnect(properties.getHost(), port);
295 if (server != null) {
297 if (!server.isLocalSolrRunning()) {
298 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
302 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
309 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
313 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
314 boolean hasKeywordsForSearch =
false;
316 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
317 hasKeywordsForSearch =
true;
322 if (!settings.isIndexToSolrEnabled()) {
324 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.indexingDisabled")));
326 if (!hasKeywordsForSearch) {
328 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
335 Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
336 stringsConfig.
setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
337 stringsConfig.
setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
340 stringsExtractionContext = Lookups.fixed(stringsConfig);
347 if (initialized ==
false)
349 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
354 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
360 Optional<TextExtractor> extractorOpt = getExtractor(abstractFile);
362 String mimeType = fileTypeDetector.
getMIMEType(abstractFile).trim().toLowerCase();
366 if (settings.isOCROnly() && (!extractorOpt.isPresent() || !extractorOpt.get().willUseOCR())) {
372 if (settings.isLimitedOCREnabled() && extractorOpt.isPresent()
373 && extractorOpt.get().willUseOCR() && !isLimitedOCRFile(abstractFile, mimeType)) {
378 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
383 searchFile(extractorOpt, abstractFile, mimeType,
false);
391 searchFile(extractorOpt, abstractFile, mimeType,
true);
402 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
404 if ((initialized ==
false) || (context == null)) {
409 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping due to ingest cancellation", instanceNum);
418 InlineSearcher.makeArtifacts(context);
419 InlineSearcher.cleanup(context);
420 Ingester.getDefault().commit();
421 }
catch (TskException ex) {
422 logger.log(Level.SEVERE, String.format(
"Failed to create search ingest artifacts for job %d", context.
getJobId()), ex);
427 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
429 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
431 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
434 synchronized (ingestStatus) {
435 ingestStatus.remove(jobId);
446 stringsExtractionContext = null;
461 if (OCR_DOCUMENTS.contains(mimeType)) {
465 if (mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
466 return aFile.getSize() > LIMITED_OCR_SIZE_MIN
467 || aFile.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.DERIVED;
477 int text_ingested = 0;
478 int metadata_ingested = 0;
479 int strings_ingested = 0;
484 synchronized (ingestStatus) {
485 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
486 if (ingestStatusForJob == null) {
494 case METADATA_INGESTED:
497 case STRINGS_INGESTED:
500 case SKIPPED_ERROR_TEXTEXTRACT:
503 case SKIPPED_ERROR_INDEXING:
506 case SKIPPED_ERROR_IO:
515 StringBuilder msg =
new StringBuilder();
516 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
517 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
518 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
519 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
520 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
521 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
522 msg.append(
"</table>");
523 String indexStats = msg.toString();
524 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
526 if (error_index > 0) {
528 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
529 }
else if (error_io + error_text > 0) {
530 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
531 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
535 private Optional<TextExtractor>
getExtractor(AbstractFile abstractFile) {
539 Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
543 return Optional.empty();
567 Map<String, String> extractedMetadata)
throws IngesterException {
570 if (!extractorOptional.isPresent()) {
574 Ingester.getDefault().search(getTikaOrTextExtractor(extractorOptional, aFile, extractedMetadata), aFile.getId(), aFile.getName(), aFile, context,
true,settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
578 }
catch(Exception ex) {
579 logger.log(Level.WARNING, String.format(
"Failed to search file %s [id=%d]",
580 aFile.getName(), aFile.getId()), ex);
594 Map<String, String> metadata = extractor.
getMetadata();
595 if (!metadata.isEmpty()) {
599 extractedMetadata.putAll(metadata);
601 CharSource formattedMetadata = getMetaDataCharSource(metadata);
603 finalReader = CharSource.concat(
new CharSource() {
606 public Reader openStream()
throws IOException {
609 }, formattedMetadata).openStream();
610 }
catch (IOException ex) {
611 logger.log(Level.WARNING, String.format(
"Could not format extracted metadata for file %s [id=%d]",
612 aFile.getName(), aFile.getId()), ex);
614 finalReader = fileText;
625 Collection<BlackboardAttribute> attributes =
new ArrayList<>();
626 Collection<BlackboardArtifact> bbartifacts =
new ArrayList<>();
627 for (Map.Entry<String, String> entry : metadata.entrySet()) {
628 if (METADATA_TYPES_MAP.containsKey(entry.getKey())) {
629 BlackboardAttribute bba = checkAttribute(entry.getKey(), entry.getValue());
635 if (!attributes.isEmpty()) {
637 BlackboardArtifact bbart = aFile.newDataArtifact(
new BlackboardArtifact.Type(BlackboardArtifact.ARTIFACT_TYPE.TSK_METADATA), attributes);
638 bbartifacts.add(bbart);
639 }
catch (TskCoreException ex) {
641 logger.log(Level.WARNING, String.format(
"Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex);
644 if (!bbartifacts.isEmpty()) {
// Log the full path (parent path + name) of the file whose artifacts could not be posted.
649 logger.log(Level.WARNING, String.format(
"Unable to post blackboard artifacts for file %s.", aFile.getParentPath() + aFile.getName()), ex);
658 if (!value.isEmpty() && value.charAt(0) !=
' ') {
659 if (METADATA_DATE_TYPES.contains(key)) {
660 SimpleDateFormat metadataDateFormat =
new SimpleDateFormat(
"yyyy-MM-dd HH:mm:ss", US);
661 Long metadataDateTime = Long.valueOf(0);
663 String metadataDate = value.replaceAll(
"T",
" ").replaceAll(
"Z",
"");
664 Date usedDate = metadataDateFormat.parse(metadataDate);
665 metadataDateTime = usedDate.getTime() / 1000;
666 return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, metadataDateTime);
667 }
catch (ParseException ex) {
669 logger.log(Level.WARNING, String.format(
"Failed to parse date/time %s for metadata attribute %s.", value, key), ex);
673 return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, value);
689 "KeywordSearchIngestModule.metadataTitle=METADATA"
691 static CharSource getMetaDataCharSource(Map<String, String> metadata) {
692 return CharSource.wrap(
new StringBuilder(
693 String.format(
"\n\n------------------------------%s------------------------------\n\n",
694 Bundle.KeywordSearchIngestModule_metadataTitle()))
695 .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
696 .map(entry -> entry.getKey() +
": " + entry.getValue())
697 .collect(Collectors.joining(
"\n"))
713 Reader extractedTextReader = KeywordSearchUtil.getReader(aFile, stringsExtractionContext);
714 Ingester.getDefault().search(extractedTextReader, aFile.getId(), aFile.getName(), aFile,
KeywordSearchIngestModule.this.
context,
false, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
716 }
catch (Exception ex) {
717 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
734 private void searchFile(Optional<TextExtractor> extractor, AbstractFile aFile, String mimeType,
boolean indexContent) {
737 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
745 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
746 || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
747 || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase(
"txt"))) {
751 extractStringsAndIndex(aFile);
755 final long size = aFile.getSize();
758 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
763 ingester.indexMetaDataOnly(aFile);
765 }
catch (IngesterException ex) {
767 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
778 if (ARCHIVE_MIME_TYPES.contains(mimeType)) {
783 ingester.indexMetaDataOnly(aFile);
785 }
catch (IngesterException ex) {
787 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
792 boolean wasTextAdded =
false;
793 Map<String, String> extractedMetadata =
new HashMap<>();
801 if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
802 extractStringsAndIndex(aFile);
805 if (!extractTextAndSearch(extractor, aFile, extractedMetadata)) {
813 }
catch (IngesterException e) {
814 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
815 + aFile.getName(), e);
817 }
catch (Exception e) {
818 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
819 + aFile.getName(), e);
823 if ((wasTextAdded ==
false) && (aFile.getNameExtension().equalsIgnoreCase(
"txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
826 wasTextAdded = searchTextFile(aFile);
830 if (wasTextAdded ==
false) {
831 extractStringsAndIndex(aFile);
837 if (!extractedMetadata.isEmpty()) {
838 createMetadataArtifact(aFile, extractedMetadata);
851 Reader textReader = textFileExtractor.
getReader();
852 if (textReader == null) {
853 logger.log(Level.INFO,
"Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
855 Ingester.getDefault().search(textReader, aFile.getId(), aFile.getName(), aFile, context,
true, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
860 }
catch (Exception ex) {
861 logger.log(Level.WARNING,
"Unable to index " + aFile.getName(), ex);
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
int queryNumIndexedChunks()
Reader getTikaOrTextExtractor(Optional< TextExtractor > extractorOptional, AbstractFile aFile, Map< String, String > extractedMetadata)
void tryConnect(String host, int port)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
METADATA_INGESTED
No content, so only the metadata was ingested.
String getCaseDirectory()
boolean searchTextFile(AbstractFile aFile)
void startUp(IngestJobContext context)
static synchronized Server getServer()
void searchFile(Optional< TextExtractor > extractor, AbstractFile aFile, String mimeType, boolean indexContent)
void createMetadataArtifact(AbstractFile aFile, Map< String, String > metadata)
synchronized long incrementAndGet(long jobId)
BlackboardAttribute checkAttribute(String key, String value)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because index engine had problems.
boolean extractTextAndSearch(Optional< TextExtractor > extractorOptional, AbstractFile aFile, Map< String, String > extractedMetadata)
boolean isLimitedOCRFile(AbstractFile aFile, String mimeType)
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
SleuthkitCase getSleuthkitCase()
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
Optional< TextExtractor > getExtractor(AbstractFile abstractFile)
Lookup stringsExtractionContext
static void warn(String title, String message)
boolean extractStringsAndIndex(AbstractFile aFile)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing the file type and was ingested.