19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.collect.ImmutableMap;
23 import com.google.common.collect.ImmutableSet;
24 import com.google.common.io.CharSource;
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.text.ParseException;
28 import java.text.SimpleDateFormat;
29 import java.util.ArrayList;
30 import java.util.Collection;
31 import java.util.Date;
32 import java.util.HashMap;
33 import java.util.List;
34 import static java.util.Locale.US;
36 import java.util.Optional;
37 import java.util.concurrent.atomic.AtomicInteger;
38 import java.util.logging.Level;
39 import java.util.stream.Collectors;
40 import org.apache.tika.mime.MimeTypes;
41 import org.openide.util.Lookup;
42 import org.openide.util.NbBundle;
43 import org.openide.util.NbBundle.Messages;
44 import org.openide.util.lookup.Lookups;
82 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
83 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
84 "SolrConnectionCheck.Port=Invalid port number.",
85 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
86 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
87 "CannotRunFileTypeDetection=Unable to run file type detection."
// Size threshold (100 * 1024) used by the limited-OCR check: an image file
// qualifies when its getSize() value is strictly greater than this, or when
// the file is of type DERIVED. Presumably a byte count (i.e. 100 KiB) --
// confirm against AbstractFile.getSize().
91 private static final int LIMITED_OCR_SIZE_MIN = 100 * 1024;
97 private static final List<String> ARCHIVE_MIME_TYPES
100 "application/x-7z-compressed",
101 "application/x-ace-compressed",
102 "application/x-alz-compressed",
104 "application/vnd.ms-cab-compressed",
105 "application/x-cfs-compressed",
106 "application/x-dgc-compressed",
107 "application/x-apple-diskimage",
108 "application/x-gca-compressed",
112 "application/x-rar-compressed",
113 "application/x-stuffit",
114 "application/x-stuffitx",
115 "application/x-gtar",
116 "application/x-archive",
117 "application/x-executable",
118 "application/x-gzip",
121 "application/x-cpio",
122 "application/x-shar",
124 "application/x-bzip",
125 "application/x-bzip2",
126 "application/x-lzip",
127 "application/x-lzma",
128 "application/x-lzop",
130 "application/x-compress");
132 private static final List<String> METADATA_DATE_TYPES
138 private static final Map<String, BlackboardAttribute.ATTRIBUTE_TYPE> METADATA_TYPES_MAP = ImmutableMap.<String, BlackboardAttribute.ATTRIBUTE_TYPE>builder()
139 .put(
"Last-Save-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED)
140 .put(
"Last-Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID)
141 .put(
"Creation-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED)
142 .put(
"Company", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION)
143 .put(
"Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER)
144 .put(
"Application-Name", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
145 .put(
"Last-Printed", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME)
146 .put(
"Producer", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
147 .put(
"Title", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION)
148 .put(
"pdf:PDFVersion", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION)
151 private static final String IMAGE_MIME_TYPE_PREFIX =
"image/";
154 private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
156 "application/msword",
157 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
158 "application/vnd.ms-powerpoint",
159 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
160 "application/vnd.ms-excel",
161 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
167 enum StringsExtractOptions {
172 enum UpdateFrequency {
178 NONE(Integer.MAX_VALUE),
180 private final int time;
182 UpdateFrequency(
int time) {
192 private Ingester ingester = null;
198 private boolean startedSearching =
false;
201 private boolean initialized =
false;
// Counter shared by all instances of this module; each instance takes its
// number from it via getAndIncrement().
203 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
// This instance's number, assigned from instanceCount and included in the
// module's log messages.
204 private int instanceNum = 0;
// Per-file ingest status, keyed first by ingest job ID and then by file ID.
// A plain HashMap: the visible readers and writers all access it inside
// synchronized (ingestStatus) blocks, and a job's entry is removed when the
// job is cleaned up.
217 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
228 synchronized (ingestStatus) {
229 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
230 if (ingestStatusForJob == null) {
231 ingestStatusForJob =
new HashMap<>();
232 ingestStatus.put(ingestJobId, ingestStatusForJob);
234 ingestStatusForJob.put(fileId, status);
235 ingestStatus.put(ingestJobId, ingestStatusForJob);
240 this.settings = settings;
241 instanceNum = instanceCount.getAndIncrement();
250 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
251 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
252 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
253 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
261 if (server.coreIsOpen() ==
false) {
266 Index indexInfo = server.getIndexInfo();
267 if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
268 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
271 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
280 ingester = Ingester.getDefault();
281 this.context = context;
298 port = Integer.parseInt(properties.getPort());
299 }
catch (NumberFormatException ex) {
301 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
304 kwsService.
tryConnect(properties.getHost(), port);
311 if (!server.isLocalSolrRunning()) {
312 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
316 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
323 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
327 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
328 boolean hasKeywordsForSearch =
false;
330 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
331 hasKeywordsForSearch =
true;
335 if (!hasKeywordsForSearch) {
337 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
343 Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
344 stringsConfig.
setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
345 stringsConfig.
setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
348 stringsExtractionContext = Lookups.fixed(stringsConfig);
356 if (initialized ==
false)
358 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
363 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
369 Optional<TextExtractor> extractorOpt = getExtractor(abstractFile);
371 String mimeType = fileTypeDetector.
getMIMEType(abstractFile).trim().toLowerCase();
375 if (settings.isOCROnly() && (!extractorOpt.isPresent() || !extractorOpt.get().willUseOCR())) {
381 if (settings.isLimitedOCREnabled() && extractorOpt.isPresent()
382 && extractorOpt.get().willUseOCR() && !isLimitedOCRFile(abstractFile, mimeType)) {
387 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
392 indexer.
indexFile(extractorOpt, abstractFile, mimeType,
false);
400 indexer.
indexFile(extractorOpt, abstractFile, mimeType,
true);
403 if (!startedSearching) {
407 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
408 IngestSearchRunner.getInstance().startJob(context, keywordListNames);
409 startedSearching =
true;
421 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
423 if ((initialized ==
false) || (context == null)) {
428 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum);
429 IngestSearchRunner.getInstance().stopJob(jobId);
435 IngestSearchRunner.getInstance().endJob(jobId);
441 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
443 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
445 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
448 synchronized (ingestStatus) {
449 ingestStatus.remove(jobId);
460 stringsExtractionContext = null;
475 if (OCR_DOCUMENTS.contains(mimeType)) {
479 if (mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
480 return aFile.getSize() > LIMITED_OCR_SIZE_MIN
481 || aFile.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.DERIVED;
491 int text_ingested = 0;
492 int metadata_ingested = 0;
493 int strings_ingested = 0;
498 synchronized (ingestStatus) {
499 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
500 if (ingestStatusForJob == null) {
508 case METADATA_INGESTED:
511 case STRINGS_INGESTED:
514 case SKIPPED_ERROR_TEXTEXTRACT:
517 case SKIPPED_ERROR_INDEXING:
520 case SKIPPED_ERROR_IO:
529 StringBuilder msg =
new StringBuilder();
530 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
531 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
532 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
533 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
534 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
535 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
536 msg.append(
"</table>");
537 String indexStats = msg.toString();
538 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
540 if (error_index > 0) {
542 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
543 }
else if (error_io + error_text > 0) {
544 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
545 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
549 private Optional<TextExtractor>
getExtractor(AbstractFile abstractFile) {
553 Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
557 return Optional.empty();
586 Map<String, String> extractedMetadata)
throws IngesterException {
589 if (!extractorOptional.isPresent()) {
596 Map<String, String> metadata = extractor.
getMetadata();
597 if (!metadata.isEmpty()) {
601 extractedMetadata.putAll(metadata);
603 CharSource formattedMetadata = getMetaDataCharSource(metadata);
605 finalReader = CharSource.concat(
new CharSource() {
608 public Reader openStream()
throws IOException {
611 }, formattedMetadata).openStream();
612 }
catch (IOException ex) {
613 logger.log(Level.WARNING, String.format(
"Could not format extracted metadata for file %s [id=%d]",
614 aFile.getName(), aFile.getId()), ex);
616 finalReader = fileText;
619 return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
630 Collection<BlackboardAttribute> attributes =
new ArrayList<>();
631 Collection<BlackboardArtifact> bbartifacts =
new ArrayList<>();
632 for (Map.Entry<String, String> entry : metadata.entrySet()) {
633 if (METADATA_TYPES_MAP.containsKey(entry.getKey())) {
634 BlackboardAttribute bba = checkAttribute(entry.getKey(), entry.getValue());
640 if (!attributes.isEmpty()) {
642 BlackboardArtifact bbart = aFile.newDataArtifact(
new BlackboardArtifact.Type(BlackboardArtifact.ARTIFACT_TYPE.TSK_METADATA), attributes);
643 bbartifacts.add(bbart);
644 }
catch (TskCoreException ex) {
646 logger.log(Level.WARNING, String.format(
"Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex);
649 if (!bbartifacts.isEmpty()) {
// Log the failure together with the file's full path. The placeholder must
// be "%s": "$s" is not a format specifier, so String.format would emit it
// literally and silently ignore the path argument.
logger.log(Level.WARNING, String.format(
        "Unable to post blackboard artifacts for file %s.", aFile.getParentPath() + aFile.getName()), ex);
663 if (!value.isEmpty() && value.charAt(0) !=
' ') {
664 if (METADATA_DATE_TYPES.contains(key)) {
665 SimpleDateFormat metadataDateFormat =
new SimpleDateFormat(
"yyyy-MM-dd HH:mm:ss", US);
666 Long metadataDateTime = Long.valueOf(0);
668 String metadataDate = value.replaceAll(
"T",
" ").replaceAll(
"Z",
"");
669 Date usedDate = metadataDateFormat.parse(metadataDate);
670 metadataDateTime = usedDate.getTime() / 1000;
671 return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, metadataDateTime);
672 }
catch (ParseException ex) {
674 logger.log(Level.WARNING, String.format(
"Failed to parse date/time %s for metadata attribute %s.", value, key), ex);
678 return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, value);
694 "KeywordSearchIngestModule.metadataTitle=METADATA"
697 return CharSource.wrap(
new StringBuilder(
698 String.format(
"\n\n------------------------------%s------------------------------\n\n",
699 Bundle.KeywordSearchIngestModule_metadataTitle()))
700 .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
701 .map(entry -> entry.getKey() +
": " + entry.getValue())
702 .collect(Collectors.joining(
"\n"))
720 Reader extractedTextReader = stringsExtractor.
getReader();
725 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
730 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
746 private void indexFile(Optional<TextExtractor> extractor, AbstractFile aFile, String mimeType,
boolean indexContent) {
749 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
758 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
759 || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
760 || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase(
"txt"))) {
764 extractStringsAndIndex(aFile);
768 final long size = aFile.getSize();
771 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
776 ingester.indexMetaDataOnly(aFile);
778 }
catch (IngesterException ex) {
780 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
791 if (ARCHIVE_MIME_TYPES.contains(mimeType)) {
796 ingester.indexMetaDataOnly(aFile);
798 }
catch (IngesterException ex) {
800 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
805 boolean wasTextAdded =
false;
806 Map<String, String> extractedMetadata =
new HashMap<>();
814 if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
815 extractStringsAndIndex(aFile);
818 if (!extractTextAndIndex(extractor, aFile, extractedMetadata)) {
826 }
catch (IngesterException e) {
827 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
828 + aFile.getName(), e);
830 }
catch (Exception e) {
831 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
832 + aFile.getName(), e);
836 if ((wasTextAdded ==
false) && (aFile.getNameExtension().equalsIgnoreCase(
"txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
839 wasTextAdded = indexTextFile(aFile);
843 if (wasTextAdded ==
false) {
844 extractStringsAndIndex(aFile);
850 if (!extractedMetadata.isEmpty()) {
851 createMetadataArtifact(aFile, extractedMetadata);
864 Reader textReader = textFileExtractor.
getReader();
865 if (textReader == null) {
866 logger.log(Level.INFO,
"Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
867 }
else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
873 logger.log(Level.WARNING,
"Unable to index " + aFile.getName(), ex);
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
CharSource getMetaDataCharSource(Map< String, String > metadata)
int queryNumIndexedChunks()
void tryConnect(String host, int port)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
METADATA_INGESTED
No content, so only the file's metadata was ingested.
String getCaseDirectory()
void startUp(IngestJobContext context)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
boolean extractTextAndIndex(Optional< TextExtractor > extractorOptional, AbstractFile aFile, Map< String, String > extractedMetadata)
void indexFile(Optional< TextExtractor > extractor, AbstractFile aFile, String mimeType, boolean indexContent)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because index engine had problems.
boolean extractStringsAndIndex(AbstractFile aFile)
boolean indexTextFile(AbstractFile aFile)
boolean isLimitedOCRFile(AbstractFile aFile, String mimeType)
void createMetadataArtifact(AbstractFile aFile, Map< String, String > metadata)
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
SleuthkitCase getSleuthkitCase()
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
BlackboardAttribute checkAttribute(String key, String value)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
Optional< TextExtractor > getExtractor(AbstractFile abstractFile)
Lookup stringsExtractionContext
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing the file type and was then ingested.