19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.collect.ImmutableSet;
23 import com.google.common.io.CharSource;
24 import java.io.IOException;
25 import java.io.Reader;
26 import java.text.ParseException;
27 import java.text.SimpleDateFormat;
28 import java.util.ArrayList;
29 import java.util.Collection;
30 import java.util.Date;
31 import java.util.HashMap;
32 import java.util.List;
33 import static java.util.Locale.US;
35 import java.util.Map.Entry;
36 import java.util.Optional;
37 import java.util.concurrent.atomic.AtomicInteger;
38 import java.util.logging.Level;
39 import java.util.stream.Collectors;
40 import java.util.stream.IntStream;
41 import java.util.stream.Stream;
42 import org.apache.commons.lang3.tuple.Pair;
43 import org.apache.commons.lang3.tuple.Triple;
44 import org.apache.tika.metadata.DublinCore;
45 import org.apache.tika.metadata.FileSystem;
46 import org.apache.tika.metadata.IPTC;
47 import org.apache.tika.metadata.Office;
48 import org.apache.tika.metadata.OfficeOpenXMLCore;
49 import org.apache.tika.metadata.OfficeOpenXMLExtended;
50 import org.apache.tika.metadata.PDF;
51 import org.apache.tika.metadata.Photoshop;
52 import org.apache.tika.metadata.TikaCoreProperties;
53 import org.apache.tika.metadata.XMP;
54 import org.apache.tika.metadata.XMPDM;
55 import org.apache.tika.mime.MimeTypes;
56 import org.openide.util.Lookup;
57 import org.openide.util.NbBundle;
58 import org.openide.util.NbBundle.Messages;
59 import org.openide.util.lookup.Lookups;
98 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
99 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
100 "SolrConnectionCheck.Port=Invalid port number.",
101 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
102 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
103 "CannotRunFileTypeDetection=Unable to run file type detection."
// Size threshold in bytes (100 KiB) for the "limited OCR" heuristic: image files
// are compared against this bound in isLimitedOCRFile (see the
// aFile.getSize() > LIMITED_OCR_SIZE_MIN check) to decide whether OCR is worthwhile.
107 private static final int LIMITED_OCR_SIZE_MIN = 100 * 1024;
113 static final List<String> ARCHIVE_MIME_TYPES
116 "application/x-7z-compressed",
117 "application/x-ace-compressed",
118 "application/x-alz-compressed",
120 "application/vnd.ms-cab-compressed",
121 "application/x-cfs-compressed",
122 "application/x-dgc-compressed",
123 "application/x-apple-diskimage",
124 "application/x-gca-compressed",
128 "application/x-rar-compressed",
129 "application/x-stuffit",
130 "application/x-stuffitx",
131 "application/x-gtar",
132 "application/x-archive",
133 "application/x-executable",
134 "application/x-gzip",
137 "application/x-cpio",
138 "application/x-shar",
140 "application/x-bzip",
141 "application/x-bzip2",
142 "application/x-lzip",
143 "application/x-lzma",
144 "application/x-lzop",
146 "application/x-compress");
156 TikaCoreProperties.MODIFIED.getName(),
157 FileSystem.MODIFIED.getName(),
158 DublinCore.MODIFIED.getName(),
159 PDF.DOC_INFO_MODIFICATION_DATE.getName(),
160 PDF.PDFVT_MODIFIED.getName(),
161 XMP.MODIFY_DATE.getName(),
162 XMPDM.AUDIO_MOD_DATE.getName(),
163 XMPDM.METADATA_MOD_DATE.getName(),
164 XMPDM.VIDEO_MOD_DATE.getName())),
167 Office.LAST_AUTHOR.getName(),
168 TikaCoreProperties.MODIFIER.getName())),
171 TikaCoreProperties.CREATED.getName(),
172 FileSystem.CREATED.getName(),
173 DublinCore.CREATED.getName(),
174 IPTC.DATE_CREATED.getName(),
175 Office.CREATION_DATE.getName(),
176 PDF.DOC_INFO_CREATED.getName(),
177 Photoshop.DATE_CREATED.getName(),
178 XMP.CREATE_DATE.getName())),
181 DublinCore.PUBLISHER.getName(),
182 IPTC.ORGANISATION_NAME.getName(),
183 OfficeOpenXMLExtended.COMPANY.getName())),
186 TikaCoreProperties.CREATOR.getName(),
187 DublinCore.CREATOR.getName(),
188 Office.INITIAL_AUTHOR.getName(),
189 Office.AUTHOR.getName(),
190 Photoshop.AUTHORS_POSITION.getName(),
191 PDF.DOC_INFO_CREATOR.getName())),
195 OfficeOpenXMLExtended.APPLICATION.getName(),
196 org.apache.tika.metadata.RTFMetadata.EMB_APP_VERSION.getName())),
199 OfficeOpenXMLCore.LAST_PRINTED.getName())),
202 DublinCore.TITLE.getName(),
203 IPTC.TITLE.getName(),
204 PDF.DOC_INFO_TITLE.getName())),
206 PDF.PDF_VERSION.getName(),
207 OfficeOpenXMLCore.VERSION.getName())))
210 List<String> keys = pr.getValue();
211 return IntStream.range(0, keys.size())
212 .mapToObj(idx -> Triple.of(keys.get(idx), attrType, idx));
214 .collect(Collectors.toMap(Triple::getLeft, trip -> Pair.of(trip.getMiddle(), trip.getRight()), (v1, v2) -> v1.getRight() < v2.getRight() ? v1 : v2));
// Prefix identifying image MIME types ("image/*"); used by isLimitedOCRFile
// (mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) to decide whether the
// LIMITED_OCR_SIZE_MIN size threshold applies.
217 private static final String IMAGE_MIME_TYPE_PREFIX =
"image/";
// MIME types of document formats that can embed images. Membership in this set
// is checked by isLimitedOCRFile (OCR_DOCUMENTS.contains(mimeType)) before the
// image-size heuristic is consulted. -- NOTE(review): list may contain additional
// entries (e.g. PDF) not shown here; confirm against the full declaration.
220 private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
222 "application/msword",
223 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
224 "application/vnd.ms-powerpoint",
225 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
226 "application/vnd.ms-excel",
227 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
233 enum StringsExtractOptions {
// Ingester used to push extracted text/metadata into the Solr index; assigned
// from Ingester.getDefault() during startUp().
240 private Ingester ingester = null;
// Set to true only after startUp() completes successfully; process() logs a
// SEVERE message and skips the file when this is still false.
247 private boolean initialized =
false;
// Count of module instances created so far; each new instance takes its
// instanceNum from getAndIncrement() in the constructor.
249 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
// This instance's sequence number (from instanceCount); used in shutdown/log
// messages to distinguish concurrent module instances.
250 private int instanceNum = 0;
// Per-ingest-job status map: job id -> (file id -> IngestStatus). All access is
// guarded by synchronizing on the map itself (see the synchronized (ingestStatus)
// blocks in putIngestStatus, shutDown, and postIndexSummary).
263 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
274 synchronized (ingestStatus) {
275 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
276 if (ingestStatusForJob == null) {
277 ingestStatusForJob =
new HashMap<>();
278 ingestStatus.put(ingestJobId, ingestStatusForJob);
280 ingestStatusForJob.put(fileId, status);
281 ingestStatus.put(ingestJobId, ingestStatusForJob);
286 this.settings = settings;
287 instanceNum = instanceCount.getAndIncrement();
296 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
297 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
298 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
299 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
307 if (settings.isIndexToSolrEnabled()) {
309 if (server.coreIsOpen() ==
false) {
314 Index indexInfo = server.getIndexInfo();
315 if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
316 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
319 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
329 ingester = Ingester.getDefault();
330 this.context = context;
347 port = Integer.parseInt(properties.getPort());
348 }
catch (NumberFormatException ex) {
350 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
353 kwsService.
tryConnect(properties.getHost(), port);
360 if (server != null) {
362 if (!server.isLocalSolrRunning()) {
363 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
367 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
374 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
378 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
379 boolean hasKeywordsForSearch =
false;
381 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
382 hasKeywordsForSearch =
true;
387 if (!settings.isIndexToSolrEnabled()) {
389 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.indexingDisabled")));
391 if (!hasKeywordsForSearch) {
393 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
400 Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
401 stringsConfig.
setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
402 stringsConfig.
setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
405 stringsExtractionContext = Lookups.fixed(stringsConfig);
412 if (initialized ==
false)
414 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.
getName());
425 Optional<TextExtractor> extractorOpt = getExtractor(abstractFile);
427 String mimeType = fileTypeDetector.
getMIMEType(abstractFile).trim().toLowerCase();
431 if (settings.isOCROnly() && (!extractorOpt.isPresent() || !extractorOpt.get().willUseOCR())) {
437 if (settings.isLimitedOCREnabled() && extractorOpt.isPresent()
438 && extractorOpt.get().willUseOCR() && !isLimitedOCRFile(abstractFile, mimeType)) {
448 searchFile(extractorOpt, abstractFile, mimeType,
false);
456 searchFile(extractorOpt, abstractFile, mimeType,
true);
467 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
469 if ((initialized ==
false) || (context == null)) {
474 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping due to ingest cancellation", instanceNum);
483 InlineSearcher.makeArtifacts(context);
484 InlineSearcher.cleanup(context);
485 Ingester.getDefault().commit();
487 logger.log(Level.SEVERE, String.format(
"Failed to create search ingest artifacts for job %d", context.
getJobId()), ex);
492 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
494 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
496 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
499 synchronized (ingestStatus) {
500 ingestStatus.remove(jobId);
511 stringsExtractionContext = null;
526 if (OCR_DOCUMENTS.contains(mimeType)) {
530 if (mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
531 return aFile.
getSize() > LIMITED_OCR_SIZE_MIN
542 int text_ingested = 0;
543 int metadata_ingested = 0;
544 int strings_ingested = 0;
549 synchronized (ingestStatus) {
550 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
551 if (ingestStatusForJob == null) {
559 case METADATA_INGESTED:
562 case STRINGS_INGESTED:
565 case SKIPPED_ERROR_TEXTEXTRACT:
568 case SKIPPED_ERROR_INDEXING:
571 case SKIPPED_ERROR_IO:
580 StringBuilder msg =
new StringBuilder();
581 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
582 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
583 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
584 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
585 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
586 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
587 msg.append(
"</table>");
588 String indexStats = msg.toString();
589 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
591 if (error_index > 0) {
593 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
594 }
else if (error_io + error_text > 0) {
595 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
596 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
604 Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
608 return Optional.empty();
632 Map<String, String> extractedMetadata)
throws IngesterException {
635 if (!extractorOptional.isPresent()) {
639 Ingester.getDefault().search(getTikaOrTextExtractor(extractorOptional, aFile, extractedMetadata), aFile.getId(), aFile.getName(), aFile, context,
true,settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
643 }
catch(Exception ex) {
644 logger.log(Level.WARNING, String.format(
"Failed to search file %s [id=%d]",
645 aFile.getName(), aFile.getId()), ex);
659 Map<String, String> metadata = extractor.
getMetadata();
660 if (!metadata.isEmpty()) {
664 extractedMetadata.putAll(metadata);
666 CharSource formattedMetadata = getMetaDataCharSource(metadata);
668 finalReader = CharSource.concat(
new CharSource() {
671 public Reader openStream()
throws IOException {
674 }, formattedMetadata).openStream();
675 }
catch (IOException ex) {
676 logger.log(Level.WARNING, String.format(
"Could not format extracted metadata for file %s [id=%d]",
677 aFile.getName(), aFile.getId()), ex);
679 finalReader = fileText;
690 Collection<BlackboardAttribute> attributes =
new ArrayList<>();
691 Collection<BlackboardArtifact> bbartifacts =
new ArrayList<>();
702 for (Map.Entry<String, String> entry : metadata.entrySet()) {
703 if (entry.getValue() != null) {
705 if (attrPair != null && attrPair.getKey() != null && attrPair.getValue() != null) {
706 intermediateMapping.compute(attrPair.getKey(), (k, v) -> {
707 if (v == null || v.getKey() > attrPair.getValue()) {
708 return Pair.of(attrPair.getValue(), entry.getValue());
718 BlackboardAttribute attribute = checkAttribute(interEntry.getKey(), interEntry.getValue().getValue());
719 if (attribute != null) {
720 attributes.add(attribute);
724 if (!attributes.isEmpty()) {
727 bbartifacts.add(bbart);
730 logger.log(Level.WARNING, String.format(
"Error creating or adding metadata artifact for file %s.", aFile.
getParentPath() + aFile.
getName()), ex);
// Post the collected metadata artifacts to the blackboard, if any were created.
733 if (!bbartifacts.isEmpty()) {
// BUG(review): the format pattern below uses "$s" instead of "%s", so
// String.format will never substitute the file path/name argument into the
// logged message (the argument is silently ignored). Should read "...file %s.".
738 logger.log(Level.WARNING, String.format(
"Unable to post blackboard artifacts for file $s.", aFile.
getParentPath() + aFile.
getName()), ex);
754 if (attrType != null && !value.isEmpty() && value.charAt(0) !=
' ') {
756 SimpleDateFormat metadataDateFormat =
new SimpleDateFormat(
"yyyy-MM-dd HH:mm:ss", US);
757 Long metadataDateTime = Long.valueOf(0);
759 String metadataDate = value.replaceAll(
"T",
" ").replaceAll(
"Z",
"");
760 Date usedDate = metadataDateFormat.parse(metadataDate);
761 metadataDateTime = usedDate.getTime() / 1000;
763 }
catch (ParseException ex) {
765 logger.log(Level.WARNING, String.format(
"Failed to parse date/time %s for metadata attribute %s.", value, attrType == null ?
"<null>" : attrType.name()), ex);
785 "KeywordSearchIngestModule.metadataTitle=METADATA"
787 static CharSource getMetaDataCharSource(Map<String, String> metadata) {
788 return CharSource.wrap(
new StringBuilder(
789 String.format(
"\n\n------------------------------%s------------------------------\n\n",
790 Bundle.KeywordSearchIngestModule_metadataTitle()))
791 .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
792 .map(entry -> entry.getKey() +
": " + entry.getValue())
793 .collect(Collectors.joining(
"\n"))
809 Reader extractedTextReader = KeywordSearchUtil.getReader(aFile, stringsExtractionContext);
812 }
catch (Exception ex) {
813 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.
getName() +
"' (id: " + aFile.
getId() +
").", ex);
847 extractStringsAndIndex(aFile);
851 final long size = aFile.
getSize();
854 if ((indexContent ==
false || aFile.
isDir() || size == 0)) {
859 ingester.indexMetaDataOnly(aFile);
861 }
catch (IngesterException ex) {
863 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.
getId(), ex);
874 if (ARCHIVE_MIME_TYPES.contains(mimeType)) {
879 ingester.indexMetaDataOnly(aFile);
881 }
catch (IngesterException ex) {
883 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.
getId(), ex);
888 boolean wasTextAdded =
false;
889 Map<String, String> extractedMetadata =
new HashMap<>();
897 if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
898 extractStringsAndIndex(aFile);
901 if (!extractTextAndSearch(extractor, aFile, extractedMetadata)) {
909 }
catch (IngesterException e) {
910 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.
getId() +
", "
913 }
catch (Exception e) {
914 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.
getId() +
", "
922 wasTextAdded = searchTextFile(aFile);
926 if (wasTextAdded ==
false) {
927 extractStringsAndIndex(aFile);
933 if (!extractedMetadata.isEmpty()) {
934 createMetadataArtifact(aFile, extractedMetadata);
947 Reader textReader = textFileExtractor.
getReader();
948 if (textReader == null) {
949 logger.log(Level.INFO,
"Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.
getName());
951 Ingester.getDefault().search(textReader, aFile.
getId(), aFile.
getName(), aFile, context,
true, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
956 }
catch (Exception ex) {
957 logger.log(Level.WARNING,
"Unable to index " + aFile.
getName(), ex);
BlackboardAttribute checkAttribute(BlackboardAttribute.ATTRIBUTE_TYPE attrType, String value)
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
int queryNumIndexedChunks()
TSK_LAST_PRINTED_DATETIME
Blackboard getBlackboard()
void postArtifacts(Collection< BlackboardArtifact > artifacts, String moduleName)
Reader getTikaOrTextExtractor(Optional< TextExtractor > extractorOptional, AbstractFile aFile, Map< String, String > extractedMetadata)
void tryConnect(String host, int port)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
TskData.TSK_DB_FILES_TYPE_ENUM getType()
METADATA_INGESTED
No content, so we just text_ingested metadata.
String getCaseDirectory()
boolean searchTextFile(AbstractFile aFile)
void startUp(IngestJobContext context)
String getNameExtension()
static synchronized Server getServer()
void searchFile(Optional< TextExtractor > extractor, AbstractFile aFile, String mimeType, boolean indexContent)
void createMetadataArtifact(AbstractFile aFile, Map< String, String > metadata)
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because index engine had problems.
boolean extractTextAndSearch(Optional< TextExtractor > extractorOptional, AbstractFile aFile, Map< String, String > extractedMetadata)
DataArtifact newDataArtifact(BlackboardArtifact.Type artifactType, Collection< BlackboardAttribute > attributesList)
boolean isLimitedOCRFile(AbstractFile aFile, String mimeType)
TskData.FileKnown getKnown()
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
SleuthkitCase getSleuthkitCase()
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
Optional< TextExtractor > getExtractor(AbstractFile abstractFile)
Lookup stringsExtractionContext
static void warn(String title, String message)
boolean extractStringsAndIndex(AbstractFile aFile)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing file type and text_ingested.