19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import com.google.common.collect.ImmutableList;
 
   22 import com.google.common.io.CharSource;
 
   23 import java.io.IOException;
 
   24 import java.io.Reader;
 
   25 import java.util.HashMap;
 
   26 import java.util.List;
 
   28 import java.util.concurrent.atomic.AtomicInteger;
 
   29 import java.util.logging.Level;
 
   30 import java.util.stream.Collectors;
 
   31 import org.openide.util.Lookup;
 
   32 import org.openide.util.NbBundle;
 
   33 import org.openide.util.NbBundle.Messages;
 
   34 import org.openide.util.lookup.Lookups;
 
   68     "# {0} - Reason for not starting Solr", 
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
 
   69     "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
 
   70     "SolrConnectionCheck.Port=Invalid port number.",
 
   71     "# {0} - Reason for not connecting to Solr", 
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
 
   72     "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
 
   73     "CannotRunFileTypeDetection=Unable to run file type detection." 
   81     private static final List<String> ARCHIVE_MIME_TYPES
 
   84                     "application/x-7z-compressed", 
 
   85                     "application/x-ace-compressed", 
 
   86                     "application/x-alz-compressed", 
 
   88                     "application/vnd.ms-cab-compressed", 
 
   89                     "application/x-cfs-compressed", 
 
   90                     "application/x-dgc-compressed", 
 
   91                     "application/x-apple-diskimage", 
 
   92                     "application/x-gca-compressed", 
 
   96                     "application/x-rar-compressed", 
 
   97                     "application/x-stuffit", 
 
   98                     "application/x-stuffitx", 
 
  100                     "application/x-archive", 
 
  101                     "application/x-executable", 
 
  102                     "application/x-gzip", 
 
  105                     "application/x-cpio", 
 
  106                     "application/x-shar", 
 
  108                     "application/x-bzip", 
 
  109                     "application/x-bzip2", 
 
  110                     "application/x-lzip", 
 
  111                     "application/x-lzma", 
 
  112                     "application/x-lzop", 
 
  114                     "application/x-compress"); 
 
  119     enum StringsExtractOptions {
 
  124     enum UpdateFrequency {
 
  130         NONE(Integer.MAX_VALUE),
 
  132         private final int time;
 
  134         UpdateFrequency(
int time) {
 
  144     private Ingester ingester = null;
 
  150     private boolean startedSearching = 
false;
 
  153     private boolean initialized = 
false;
 
  155     private static final AtomicInteger instanceCount = 
new AtomicInteger(0); 
 
  156     private int instanceNum = 0;
 
  169     private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = 
new HashMap<>(); 
 
  180         synchronized (ingestStatus) {
 
  181             Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
 
  182             if (ingestStatusForJob == null) {
 
  183                 ingestStatusForJob = 
new HashMap<>();
 
  184                 ingestStatus.put(ingestJobId, ingestStatusForJob);
 
  186             ingestStatusForJob.put(fileId, status);
 
  187             ingestStatus.put(ingestJobId, ingestStatusForJob);
 
  192         this.settings = settings;
 
  193         instanceNum = instanceCount.getAndIncrement();
 
  202         "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
 
  203         "# {0} - Solr version number", 
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
 
  204         "# {0} - schema version number", 
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
 
  205         "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available." 
  213         if (server.coreIsOpen() == 
false) {
 
  218             Index indexInfo = server.getIndexInfo();
 
  219             if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
 
  220                 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
 
  222             if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
 
  223                 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
 
  226             throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
 
  235         ingester = Ingester.getDefault();
 
  236         this.context = context;
 
  253                     port = Integer.parseInt(properties.getPort());
 
  254                 } 
catch (NumberFormatException ex) {
 
  256                     throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + 
" " + Bundle.SolrConnectionCheck_Port(), ex);
 
  259                     kwsService.
tryConnect(properties.getHost(), port);
 
  266                     if (!server.isRunning()) {
 
  267                         throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
 
  271                     throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
 
  278                     throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
 
  282                 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
 
  283                 boolean hasKeywordsForSearch = 
false;
 
  285                     if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
 
  286                         hasKeywordsForSearch = 
true;
 
  290                 if (!hasKeywordsForSearch) {
 
  292                             NbBundle.getMessage(this.getClass(), 
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
 
  298         Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
 
  299         stringsConfig.
setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
 
  300         stringsConfig.
setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
 
  303         stringsExtractionContext = Lookups.fixed(stringsConfig);
 
  311         if (initialized == 
false) 
 
  313             logger.log(Level.SEVERE, 
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());  
 
  318         if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
 
  323         if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
 
  339         if (!startedSearching) {
 
  343             List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
 
  344             IngestSearchRunner.getInstance().startJob(context, keywordListNames);
 
  345             startedSearching = 
true;
 
  357         logger.log(Level.INFO, 
"Keyword search ingest module instance {0} shutting down", instanceNum); 
 
  359         if ((initialized == 
false) || (context == null)) {
 
  364             logger.log(Level.INFO, 
"Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum); 
 
  365             IngestSearchRunner.getInstance().stopJob(jobId);
 
  371         IngestSearchRunner.getInstance().endJob(jobId);
 
  377                 logger.log(Level.INFO, 
"Indexed files count: {0}", numIndexedFiles); 
 
  379                 logger.log(Level.INFO, 
"Indexed file chunks count: {0}", numIndexedChunks); 
 
  381                 logger.log(Level.SEVERE, 
"Error executing Solr queries to check number of indexed files and file chunks", ex); 
 
  384             synchronized (ingestStatus) {
 
  385                 ingestStatus.remove(jobId);
 
  396         stringsExtractionContext = null;
 
  404         int text_ingested = 0;
 
  405         int metadata_ingested = 0;
 
  406         int strings_ingested = 0;
 
  411         synchronized (ingestStatus) {
 
  412             Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
 
  413             if (ingestStatusForJob == null) {
 
  421                     case METADATA_INGESTED:
 
  424                     case STRINGS_INGESTED:
 
  427                     case SKIPPED_ERROR_TEXTEXTRACT:
 
  430                     case SKIPPED_ERROR_INDEXING:
 
  433                     case SKIPPED_ERROR_IO:
 
  442         StringBuilder msg = 
new StringBuilder();
 
  443         msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>"); 
 
  444         msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>"); 
 
  445         msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>"); 
 
  446         msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>"); 
 
  447         msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>"); 
 
  448         msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>"); 
 
  449         msg.append(
"</table>"); 
 
  450         String indexStats = msg.toString();
 
  451         logger.log(Level.INFO, 
"Keyword Indexing Completed: {0}", indexStats); 
 
  453         if (error_index > 0) {
 
  455                     NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
 
  456         } 
else if (error_io + error_text > 0) {
 
  457             MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
 
  458                     NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
 
  484             imageConfig.
setOCREnabled(KeywordSearchSettings.getOcrOption());
 
  486             Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
 
  494                     Map<String, String> metadata = extractor.
getMetadata();
 
  495                     CharSource formattedMetadata = getMetaDataCharSource(metadata);
 
  497                     finalReader = CharSource.concat(
new CharSource() {
 
  500                         public Reader openStream() 
throws IOException {
 
  503                     }, formattedMetadata).openStream();
 
  504                 } 
catch (IOException ex) {
 
  505                     logger.log(Level.WARNING, String.format(
"Could not format extracted metadata for file %s [id=%d]",
 
  506                             aFile.getName(), aFile.getId()), ex);
 
  508                     finalReader = fileText;
 
  511                 return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
 
  526             "KeywordSearchIngestModule.metadataTitle=METADATA" 
  529             return CharSource.wrap(
new StringBuilder(
 
  530                     String.format(
"\n\n------------------------------%s------------------------------\n\n",
 
  531                             Bundle.KeywordSearchIngestModule_metadataTitle()))
 
  532                     .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
 
  533                             .map(entry -> entry.getKey() + 
": " + entry.getValue())
 
  534                             .collect(Collectors.joining(
"\n"))
 
  552                 Reader extractedTextReader = stringsExtractor.
getReader();
 
  557                     logger.log(Level.WARNING, 
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).", 
new Object[]{aFile.getName(), aFile.getId()});  
 
  562                 logger.log(Level.WARNING, 
"Failed to extract strings and ingest, file '" + aFile.getName() + 
"' (id: " + aFile.getId() + 
").", ex);  
 
  575         private void indexFile(AbstractFile aFile, 
boolean indexContent) {
 
  578             TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
 
  581             if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
 
  585                 extractStringsAndIndex(aFile);
 
  589             final long size = aFile.getSize();
 
  592             if ((indexContent == 
false || aFile.isDir() || size == 0)) {
 
  597                     ingester.indexMetaDataOnly(aFile);
 
  599                 } 
catch (IngesterException ex) {
 
  601                     logger.log(Level.WARNING, 
"Unable to index meta-data for file: " + aFile.getId(), ex); 
 
  609             String fileType = fileTypeDetector.
getMIMEType(aFile);
 
  613             if (ARCHIVE_MIME_TYPES.contains(fileType)) {
 
  618                     ingester.indexMetaDataOnly(aFile);
 
  620                 } 
catch (IngesterException ex) {
 
  622                     logger.log(Level.WARNING, 
"Unable to index meta-data for file: " + aFile.getId(), ex); 
 
  627             boolean wasTextAdded = 
false;
 
  635                 if (fileType.equals(
"application/octet-stream")) {
 
  636                     extractStringsAndIndex(aFile);
 
  639                 if (!extractTextAndIndex(aFile)) {
 
  647             } 
catch (IngesterException e) {
 
  648                 logger.log(Level.INFO, 
"Could not extract text with Tika, " + aFile.getId() + 
", "  
  649                         + aFile.getName(), e);
 
  651             } 
catch (Exception e) {
 
  652                 logger.log(Level.WARNING, 
"Error extracting text with Tika, " + aFile.getId() + 
", "  
  653                         + aFile.getName(), e);
 
  657             if ((wasTextAdded == 
false) && (aFile.getNameExtension().equalsIgnoreCase(
"txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
 
  661                     TextFileExtractor textFileExtractor = 
new TextFileExtractor();
 
  662                     Reader textReader = textFileExtractor.getReader(aFile);
 
  663                     if (textReader == null) {
 
  664                         logger.log(Level.INFO, 
"Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
 
  665                     } 
else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
 
  669                 } 
catch (IngesterException ex) {
 
  670                     logger.log(Level.WARNING, 
"Unable to index as unicode", ex);
 
  672                     logger.log(Level.INFO, 
"Could not extract text with TextFileExtractor", ex);
 
  677             if (wasTextAdded == 
false) {
 
  678                 extractStringsAndIndex(aFile);
 
int queryNumIndexedFiles()
 
FileTypeDetector fileTypeDetector
 
synchronized long decrementAndGet(long jobId)
 
CharSource getMetaDataCharSource(Map< String, String > metadata)
 
int queryNumIndexedChunks()
 
void tryConnect(String host, int port)
 
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
 
METADATA_INGESTED
No content, so only the file's metadata was ingested.
 
String getCaseDirectory()
 
void startUp(IngestJobContext context)
 
static synchronized Server getServer()
 
synchronized long incrementAndGet(long jobId)
 
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
 
String getMIMEType(AbstractFile file)
 
final KeywordSearchJobSettings settings
 
SKIPPED_ERROR_INDEXING
File was skipped because the index engine had problems.
 
boolean extractStringsAndIndex(AbstractFile aFile)
 
int queryNumIndexedDocuments()
 
void postMessage(final IngestMessage message)
 
boolean fileIngestIsCancelled()
 
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues. 
 
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
 
ProcessResult process(AbstractFile abstractFile)
 
static void error(String title, String message)
 
void indexFile(AbstractFile aFile, boolean indexContent)
 
synchronized static Logger getLogger(String name)
 
static Case getCurrentCaseThrows()
 
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
 
Lookup stringsExtractionContext
 
static void warn(String title, String message)
 
boolean extractTextAndIndex(AbstractFile aFile)
 
static synchronized IngestServices getInstance()
 
STRINGS_INGESTED
Text was extracted by knowing the file type and was ingested.