19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.collect.ImmutableMap;
23 import com.google.common.io.CharSource;
24 import java.io.IOException;
25 import java.io.Reader;
26 import java.text.ParseException;
27 import java.text.SimpleDateFormat;
28 import java.util.ArrayList;
29 import java.util.Collection;
30 import java.util.Date;
31 import java.util.HashMap;
32 import java.util.List;
33 import static java.util.Locale.US;
35 import java.util.concurrent.atomic.AtomicInteger;
36 import java.util.logging.Level;
37 import java.util.stream.Collectors;
38 import org.apache.tika.mime.MimeTypes;
39 import org.openide.util.Lookup;
40 import org.openide.util.NbBundle;
41 import org.openide.util.NbBundle.Messages;
42 import org.openide.util.lookup.Lookups;
81 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
82 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
83 "SolrConnectionCheck.Port=Invalid port number.",
84 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
85 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
86 "CannotRunFileTypeDetection=Unable to run file type detection."
// MIME types treated as archives/containers; files matching these get metadata-only
// indexing (see indexFile) — presumably embedded-content extraction is handled by a
// separate archive-extraction module. TODO confirm against the full file.
// NOTE(review): extraction artifact — the "= ImmutableList.of(" line and several
// entries (original lines 95-96, 100, 106-108, 116-117, 120, 126) are missing from
// this chunk; visible code kept byte-identical.
94 private static final List<String> ARCHIVE_MIME_TYPES
97 "application/x-7z-compressed",
98 "application/x-ace-compressed",
99 "application/x-alz-compressed",
101 "application/vnd.ms-cab-compressed",
102 "application/x-cfs-compressed",
103 "application/x-dgc-compressed",
104 "application/x-apple-diskimage",
105 "application/x-gca-compressed",
109 "application/x-rar-compressed",
110 "application/x-stuffit",
111 "application/x-stuffitx",
112 "application/x-gtar",
113 "application/x-archive",
114 "application/x-executable",
115 "application/x-gzip",
118 "application/x-cpio",
119 "application/x-shar",
121 "application/x-bzip",
122 "application/x-bzip2",
123 "application/x-lzip",
124 "application/x-lzma",
125 "application/x-lzop",
127 "application/x-compress");
// Tika metadata keys whose values are date strings needing parsing in checkAttribute.
// NOTE(review): initializer lines are missing from this chunk (original 130-134).
129 private static final List<String> METADATA_DATE_TYPES
// Maps Tika metadata keys to the blackboard attribute type used when creating the
// TSK_METADATA artifact (see createMetadataArtifact). Note both "Application-Name"
// and "Producer" map to TSK_PROG_NAME — an ImmutableMap.Builder throws on duplicate
// KEYS only, so that is fine, but the later value may overwrite in the artifact;
// TODO confirm intended precedence. The trailing .build() call is outside this chunk.
135 private static final Map<String, BlackboardAttribute.ATTRIBUTE_TYPE> METADATA_TYPES_MAP = ImmutableMap.<String, BlackboardAttribute.ATTRIBUTE_TYPE>builder()
136 .put(
"Last-Save-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED)
137 .put(
"Last-Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID)
138 .put(
"Creation-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED)
139 .put(
"Company", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION)
140 .put(
"Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER)
141 .put(
"Application-Name", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
142 .put(
"Last-Printed", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME)
143 .put(
"Producer", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
144 .put(
"Title", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION)
145 .put(
"pdf:PDFVersion", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION)
// Option keys for the strings extractor (EXTRACT_UTF8 / EXTRACT_UTF16 — bodies
// missing from this chunk).
152 enum StringsExtractOptions {
// How often the periodic keyword search runs during ingest; NONE disables periodic
// searches (Integer.MAX_VALUE minutes). Other constants missing from this chunk.
157 enum UpdateFrequency {
163 NONE(Integer.MAX_VALUE),
165 private final int time;
167 UpdateFrequency(
int time) {
// Handle to the Solr ingester; assigned in startUp.
177 private Ingester ingester = null;
// True once the per-job search has been kicked off (see process).
183 private boolean startedSearching =
false;
// Set by startUp on success; process/shutDown bail out when false.
186 private boolean initialized =
false;
// Distinguishes concurrent module instances in log messages (see instanceNum).
188 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
189 private int instanceNum = 0;
// Per-ingest-job map of file id -> IngestStatus; all access is guarded by
// synchronized(ingestStatus) blocks below.
202 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
// Records the ingest status of one file under its job, creating the per-job map on
// first use. Locks the shared ingestStatus map for the whole update.
213 synchronized (ingestStatus) {
214 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
215 if (ingestStatusForJob == null) {
216 ingestStatusForJob =
new HashMap<>();
217 ingestStatus.put(ingestJobId, ingestStatusForJob);
219 ingestStatusForJob.put(fileId, status);
// NOTE(review): this re-put looks redundant — the same map reference was either
// fetched from, or just inserted into, ingestStatus above. computeIfAbsent would
// simplify this; confirm against the full file (lines 218 missing here).
220 ingestStatus.put(ingestJobId, ingestStatusForJob);
// Constructor fragment: store job settings and assign a unique instance number for
// log correlation.
225 this.settings = settings;
226 instanceNum = instanceCount.getAndIncrement();
// Localized messages used by startUp's failure paths (annotation start/end outside
// this chunk).
235 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
236 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
237 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
238 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
// startUp fragment: verifies the Solr core/index is open and schema-compatible,
// checks server connectivity (multi-user: tryConnect; single-user: local Solr
// running), warns if no keyword lists are enabled, and configures strings extraction.
// Many surrounding lines are missing from this chunk — control flow is partial.
246 if (server.coreIsOpen() ==
false) {
251 Index indexInfo = server.getIndexInfo();
252 if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
253 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
256 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
265 ingester = Ingester.getDefault();
266 this.context = context;
// Multi-user path: validate the configured port then probe the remote Solr server.
283 port = Integer.parseInt(properties.getPort());
284 }
catch (NumberFormatException ex) {
286 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
289 kwsService.
tryConnect(properties.getHost(), port);
// Single-user path: the embedded/local Solr process must already be running.
296 if (!server.isLocalSolrRunning()) {
297 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
301 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
308 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
// Warn (not fail) when no enabled keyword list has keywords: indexing still runs,
// searching is skipped.
312 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
313 boolean hasKeywordsForSearch =
false;
315 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
316 hasKeywordsForSearch =
true;
320 if (!hasKeywordsForSearch) {
322 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
// Configure UTF-8/UTF-16 strings extraction from persisted settings.
328 Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
329 stringsConfig.
setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
330 stringsConfig.
setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
333 stringsExtractionContext = Lookups.fixed(stringsConfig);
// process fragment: guards against uninitialized module, skips virtual dirs and
// (optionally) known files, then lazily starts the per-job keyword search on the
// first file processed. Missing lines hide the indexFile call and return values.
341 if (initialized ==
false)
343 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
// Virtual directories have no content worth indexing.
348 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
353 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
// Start the background search job once per module instance, on first file.
369 if (!startedSearching) {
373 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
374 IngestSearchRunner.getInstance().startJob(context, keywordListNames);
375 startedSearching =
true;
// shutDown fragment: stops or ends the search job depending on cancellation, logs
// index statistics, clears per-job status, and releases the extraction context.
387 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
389 if ((initialized ==
false) || (context == null)) {
// Cancelled ingest: abort the search job rather than letting it finish.
394 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum);
395 IngestSearchRunner.getInstance().stopJob(jobId);
401 IngestSearchRunner.getInstance().endJob(jobId);
407 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
409 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
411 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
// Drop this job's per-file status map under the shared lock.
414 synchronized (ingestStatus) {
415 ingestStatus.remove(jobId);
426 stringsExtractionContext = null;
// postIndexSummary fragment: tallies per-file IngestStatus counters for this job,
// builds an HTML summary table, logs it, and pops an error/warn notification when
// indexing errors occurred. The switch increments (and the error_* declarations)
// are partly missing from this chunk.
434 int text_ingested = 0;
435 int metadata_ingested = 0;
436 int strings_ingested = 0;
441 synchronized (ingestStatus) {
442 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
443 if (ingestStatusForJob == null) {
451 case METADATA_INGESTED:
454 case STRINGS_INGESTED:
457 case SKIPPED_ERROR_TEXTEXTRACT:
460 case SKIPPED_ERROR_INDEXING:
463 case SKIPPED_ERROR_IO:
// Build the HTML summary table, one row per counter.
472 StringBuilder msg =
new StringBuilder();
473 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
474 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
475 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
476 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
477 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
478 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
479 msg.append(
"</table>");
480 String indexStats = msg.toString();
481 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
// Indexing errors get an error dialog; IO/text-extraction errors only a warning.
483 if (error_index > 0) {
485 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
486 }
else if (error_io + error_text > 0) {
487 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
488 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
// Extracts text (with optional OCR) from a file, appends a formatted metadata
// section to the text stream, and sends it to Solr. Copies extracted metadata into
// the caller-supplied map so an artifact can be created later. Returns the
// ingester's success flag. Several lines (extractor setup, fileText reader) are
// missing from this chunk.
513 private boolean extractTextAndIndex(AbstractFile aFile, Map<String, String> extractedMetadata)
throws IngesterException {
515 imageConfig.
setOCREnabled(KeywordSearchSettings.getOcrOption())
517 Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
525 Map<String, String> metadata = extractor.
getMetadata();
526 if (!metadata.isEmpty()) {
// Side effect: expose metadata to the caller for createMetadataArtifact.
530 extractedMetadata.putAll(metadata);
// Concatenate the file text with a human-readable metadata footer so the
// metadata itself becomes keyword-searchable.
532 CharSource formattedMetadata = getMetaDataCharSource(metadata);
534 finalReader = CharSource.concat(
new CharSource() {
537 public Reader openStream()
throws IOException {
540 }, formattedMetadata).openStream();
541 }
catch (IOException ex) {
// Best effort: fall back to the raw file text if metadata formatting fails.
542 logger.log(Level.WARNING, String.format(
"Could not format extracted metadata for file %s [id=%d]",
543 aFile.getName(), aFile.getId()), ex);
545 finalReader = fileText;
548 return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
// createMetadataArtifact fragment: converts recognized Tika metadata entries into
// blackboard attributes (via checkAttribute), attaches them to a single
// TSK_METADATA artifact, and posts the artifact(s). Failures are logged and
// swallowed deliberately — metadata artifacts are best-effort.
// FIX(review): the final log format string used "$s" instead of "%s", so
// String.format never substituted the file path into the message.
559 Collection<BlackboardAttribute> attributes =
new ArrayList<>();
560 Collection<BlackboardArtifact> bbartifacts =
new ArrayList<>();
561 for (Map.Entry<String, String> entry : metadata.entrySet()) {
562 if (METADATA_TYPES_MAP.containsKey(entry.getKey())) {
563 BlackboardAttribute bba = checkAttribute(entry.getKey(), entry.getValue());
569 if (!attributes.isEmpty()) {
571 BlackboardArtifact bbart = aFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_METADATA);
572 bbart.addAttributes(attributes);
573 bbartifacts.add(bbart);
574 }
catch (TskCoreException ex) {
576 logger.log(Level.WARNING, String.format(
"Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex);
579 if (!bbartifacts.isEmpty()) {
584 logger.log(Level.WARNING, String.format(
"Unable to post blackboard artifacts for file %s.", aFile.getParentPath() + aFile.getName()) , ex);
// checkAttribute fragment: builds a BlackboardAttribute for a metadata key/value.
// Date-typed keys are parsed from an ISO-ish "yyyy-MM-dd HH:mm:ss" form into epoch
// seconds; all other keys become string attributes. Blank/space-prefixed values are
// rejected (branch body missing from this chunk).
594 if (!value.isEmpty() && value.charAt(0) !=
' ') {
595 if (METADATA_DATE_TYPES.contains(key)) {
// NOTE(review): SimpleDateFormat is not thread-safe and is allocated per call;
// java.time.DateTimeFormatter would be safer/cheaper. Also, stripping "Z"
// discards the UTC designator — the parse then presumably uses the local
// default time zone; confirm that is intended.
596 SimpleDateFormat metadataDateFormat =
new SimpleDateFormat(
"yyyy-MM-dd HH:mm:ss", US);
597 Long metadataDateTime = Long.valueOf(0);
// Normalize ISO 8601 "T"/"Z" markers so the pattern above matches.
599 String metadataDate = value.replaceAll(
"T",
" ").replaceAll(
"Z",
"");
600 Date usedDate = metadataDateFormat.parse(metadataDate);
// Blackboard datetime attributes are epoch seconds, not millis.
601 metadataDateTime = usedDate.getTime()/1000;
602 return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, metadataDateTime);
603 }
catch (ParseException ex) {
604 logger.log(Level.WARNING, String.format(
"Failed to parse date/time %s for metadata attribute %s.", value, key), ex);
// Non-date keys: store the raw string value.
609 return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, value);
// Renders the metadata map as a banner-headed, key-sorted "key: value" text block,
// wrapped in a CharSource so it can be concatenated after the file text for
// indexing (see extractTextAndIndex).
626 "KeywordSearchIngestModule.metadataTitle=METADATA"
629 return CharSource.wrap(
new StringBuilder(
630 String.format(
"\n\n------------------------------%s------------------------------\n\n",
631 Bundle.KeywordSearchIngestModule_metadataTitle()))
632 .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
633 .map(entry -> entry.getKey() +
": " + entry.getValue())
634 .collect(Collectors.joining(
"\n"))
// extractStringsAndIndex fragment: fallback path — pull raw strings from the file
// and index them when proper text extraction is unavailable or failed.
652 Reader extractedTextReader = stringsExtractor.
getReader();
// NOTE(review): this WARNING logs only the parameterized message — the triggering
// exception (if any) is not attached; the catch below does attach it.
657 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
662 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
// Dispatches a file to the appropriate indexing strategy:
//  - unallocated/unused blocks and carved .txt -> strings extraction only;
//  - no content requested, directories, empty files, archives -> metadata only;
//  - octet-stream -> strings; otherwise Tika text extraction with fallbacks to a
//    plain text-file reader (.txt) and finally strings extraction.
// Many guard/else lines are missing from this chunk; flow below is partial.
675 private void indexFile(AbstractFile aFile,
boolean indexContent) {
678 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
687 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
688 || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
689 || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase(
"txt"))) {
693 extractStringsAndIndex(aFile);
697 final long size = aFile.getSize();
// Metadata-only path: caller opted out of content, or there is no content.
700 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
705 ingester.indexMetaDataOnly(aFile);
707 }
catch (IngesterException ex) {
709 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
717 String fileType = fileTypeDetector.
getMIMEType(aFile);
// Archives get metadata-only here; their contents are presumably expanded and
// indexed by a separate module — TODO confirm.
721 if (ARCHIVE_MIME_TYPES.contains(fileType)) {
726 ingester.indexMetaDataOnly(aFile);
728 }
catch (IngesterException ex) {
730 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
735 boolean wasTextAdded =
false;
736 Map<String, String> extractedMetadata =
new HashMap<>();
// Unknown binary type: no point running Tika, go straight to strings.
744 if (fileType.equals(MimeTypes.OCTET_STREAM)) {
745 extractStringsAndIndex(aFile);
748 if (!extractTextAndIndex(aFile, extractedMetadata)) {
756 }
catch (IngesterException e) {
757 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
758 + aFile.getName(), e);
760 }
catch (Exception e) {
761 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
762 + aFile.getName(), e);
// Fallback 1: non-carved .txt files get the dedicated text-file reader.
766 if ((wasTextAdded ==
false) && (aFile.getNameExtension().equalsIgnoreCase(
"txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
769 wasTextAdded = indexTextFile(aFile);
// Fallback 2: last resort, index raw strings.
773 if (wasTextAdded ==
false) {
774 extractStringsAndIndex(aFile);
// Tika metadata was captured even if text indexing failed; record it.
780 if (!extractedMetadata.isEmpty()) {
781 createMetadataArtifact(aFile, extractedMetadata);
// indexTextFile fragment: reads a plain text file via TextFileExtractor and sends
// it to Solr; returns (outside this chunk) whether indexing succeeded.
794 Reader textReader = textFileExtractor.
getReader();
795 if (textReader == null) {
796 logger.log(Level.INFO,
"Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
797 }
else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
803 logger.log(Level.WARNING,
"Unable to index " + aFile.getName(), ex);
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
CharSource getMetaDataCharSource(Map< String, String > metadata)
int queryNumIndexedChunks()
void tryConnect(String host, int port)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
METADATA_INGESTED
The file had no content, so only its metadata was ingested.
String getCaseDirectory()
boolean extractTextAndIndex(AbstractFile aFile, Map< String, String > extractedMetadata)
void startUp(IngestJobContext context)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because the indexing engine had problems.
boolean extractStringsAndIndex(AbstractFile aFile)
boolean indexTextFile(AbstractFile aFile)
void createMetadataArtifact(AbstractFile aFile, Map< String, String > metadata)
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
SleuthkitCase getSleuthkitCase()
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
BlackboardAttribute checkAttribute(String key, String value)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
Lookup stringsExtractionContext
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted based on the detected file type, and the extracted text was ingested.