19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import java.util.ArrayList;
 
   22 import java.util.HashMap;
 
   23 import java.util.List;
 
   25 import java.util.concurrent.atomic.AtomicInteger;
 
   26 import java.util.logging.Level;
 
   27 import org.openide.util.NbBundle;
 
   28 import org.openide.util.NbBundle.Messages;
 
   57     "# {0} - Reason for not starting Solr", 
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
 
   58     "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
 
   59     "SolrConnectionCheck.Port=Invalid port number.",
 
   60     "# {0} - Reason for not connecting to Solr", 
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
 
   61     "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
 
   62     "CannotRunFileTypeDetection=Unable to run file type detection." 
   66     enum UpdateFrequency {
 
   72         NONE(Integer.MAX_VALUE),
 
   74         private final int time;
 
   76         UpdateFrequency(
int time) {
 
   86     private Ingester ingester = null;
 
   92     private boolean startedSearching = 
false;
 
   95     private final KeywordSearchJobSettings 
settings;
 
   96     private boolean initialized = 
false;
 
   99     private static final AtomicInteger instanceCount = 
new AtomicInteger(0); 
 
  100     private int instanceNum = 0;
 
  113     private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = 
new HashMap<>(); 
 
  124         synchronized (ingestStatus) {
 
  125             Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
 
  126             if (ingestStatusForJob == null) {
 
  127                 ingestStatusForJob = 
new HashMap<>();
 
  128                 ingestStatus.put(ingestJobId, ingestStatusForJob);
 
  130             ingestStatusForJob.put(fileId, status);
 
  131             ingestStatus.put(ingestJobId, ingestStatusForJob);
 
  136         this.settings = settings;
 
  137         instanceNum = instanceCount.getAndIncrement();
 
  146         "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
 
  147         "# {0} - Solr version number", 
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
 
  148         "# {0} - schema version number", 
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index." 
  157         if (server.coreIsOpen() == 
false) {
 
  162             Index indexInfo = server.getIndexInfo();
 
  163             if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
 
  164                 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));                                
 
  166             if (!IndexFinder.getCurrentSchemaVersion().equals(indexInfo.getSchemaVersion())) {
 
  167                 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));                
 
  170             throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
 
  179         ingester = Ingester.getDefault();
 
  180         this.context = context;
 
  191                 } 
catch (NumberFormatException ex) {
 
  193                     throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + 
" " + Bundle.SolrConnectionCheck_Port(), ex);
 
  203                     if (!server.isRunning()) {
 
  204                         throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
 
  208                     throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
 
  215                     throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
 
  219                 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
 
  220                 boolean hasKeywordsForSearch = 
false;
 
  222                     if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
 
  223                         hasKeywordsForSearch = 
true;
 
  227                 if (!hasKeywordsForSearch) {
 
  229                             NbBundle.getMessage(this.getClass(), 
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
 
  235         stringExtractor = 
new StringsTextExtractor();
 
  236         stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
 
  237         stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
 
  239         textExtractors = 
new ArrayList<>();
 
  241         textExtractors.add(
new HtmlTextExtractor());
 
  242         textExtractors.add(
new TikaTextExtractor());
 
  250         if (initialized == 
false) 
 
  252             logger.log(Level.WARNING, 
"Skipping processing, module not initialized, file: {0}", abstractFile.
getName());  
 
  278         if (!startedSearching) {
 
  282             List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
 
  284             startedSearching = 
true;
 
  296         logger.log(Level.INFO, 
"Instance {0}", instanceNum); 
 
  298         if ((initialized == 
false) || (context == null)) {
 
  313             synchronized (ingestStatus) {
 
  314                 ingestStatus.remove(jobId);
 
  323             logger.log(Level.INFO, 
"Indexed files count: {0}", numIndexedFiles); 
 
  324             logger.log(Level.INFO, 
"Indexed file chunks count: {0}", numIndexedChunks); 
 
  326             logger.log(Level.WARNING, 
"Error executing Solr query to check number of indexed files/chunks: ", ex); 
 
  336         logger.log(Level.INFO, 
"stop()"); 
 
  347         textExtractors.clear();
 
  348         textExtractors = null;
 
  349         stringExtractor = null;
 
  358         int text_ingested = 0;
 
  359         int metadata_ingested = 0;
 
  360         int strings_ingested = 0;
 
  365         synchronized (ingestStatus) {
 
  366             Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
 
  367             if (ingestStatusForJob == null) {
 
  375                     case METADATA_INGESTED:
 
  378                     case STRINGS_INGESTED:
 
  381                     case SKIPPED_ERROR_TEXTEXTRACT:
 
  384                     case SKIPPED_ERROR_INDEXING:
 
  387                     case SKIPPED_ERROR_IO:
 
  396         StringBuilder msg = 
new StringBuilder();
 
  397         msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>"); 
 
  398         msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>"); 
 
  399         msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>"); 
 
  400         msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>"); 
 
  401         msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>"); 
 
  402         msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>"); 
 
  403         msg.append(
"</table>"); 
 
  404         String indexStats = msg.toString();
 
  405         logger.log(Level.INFO, 
"Keyword Indexing Completed: {0}", indexStats); 
 
  407         if (error_index > 0) {
 
  409                     NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
 
  410         } 
else if (error_io + error_text > 0) {
 
  411             MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
 
  412                     NbBundle.getMessage(
this.getClass(), 
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
 
  438             FileTextExtractor extractor = null;
 
  441             for (FileTextExtractor fe : textExtractors) {
 
  442                 if (fe.isSupported(aFile, detectedFormat)) {
 
  448             if (extractor == null) {
 
  449                 logger.log(Level.INFO, 
"No text extractor found for file id:{0}, name: {1}, detected format: {2}", 
new Object[]{aFile.getId(), aFile.getName(), detectedFormat}); 
 
  455             return Ingester.getDefault().indexText(extractor, aFile, context);
 
  475                     logger.log(Level.WARNING, 
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).", 
new Object[]{aFile.getName(), aFile.getId()});  
 
  479             } 
catch (IngesterException ex) {
 
  480                 logger.log(Level.WARNING, 
"Failed to extract strings and ingest, file '" + aFile.
getName() + 
"' (id: " + aFile.
getId() + 
").", ex);  
 
  503                 extractStringsAndIndex(aFile);
 
  507             final long size = aFile.
getSize();
 
  510             if ((indexContent == 
false || aFile.
isDir() || size == 0)) {
 
  515                     ingester.indexMetaDataOnly(aFile);
 
  517                 } 
catch (IngesterException ex) {
 
  519                     logger.log(Level.WARNING, 
"Unable to index meta-data for file: " + aFile.
getId(), ex); 
 
  531                 logger.log(Level.SEVERE, String.format(
"Could not detect format using fileTypeDetector for file: %s", aFile), ex); 
 
  537             if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
 
  542                     ingester.indexMetaDataOnly(aFile);
 
  544                 } 
catch (IngesterException ex) {
 
  546                     logger.log(Level.WARNING, 
"Unable to index meta-data for file: " + aFile.
getId(), ex); 
 
  551             boolean wasTextAdded = 
false;
 
  559                 if (fileType.equals(
"application/octet-stream")) {
 
  560                     extractStringsAndIndex(aFile);
 
  563                 if (!extractTextAndIndex(aFile, fileType)) {
 
  564                     logger.log(Level.WARNING, 
"Text extractor not found for file. Extracting strings only. File: ''{0}'' (id:{1}).", 
new Object[]{aFile.getName(), aFile.getId()}); 
 
  571             } 
catch (IngesterException e) {
 
  572                 logger.log(Level.INFO, 
"Could not extract text with Tika, " + aFile.
getId() + 
", "  
  575             } 
catch (Exception e) {
 
  576                 logger.log(Level.WARNING, 
"Error extracting text with Tika, " + aFile.
getId() + 
", "  
  582             if (wasTextAdded == 
false) {
 
  583                 extractStringsAndIndex(aFile);
 
int queryNumIndexedFiles()
 
FileTypeDetector fileTypeDetector
 
synchronized long decrementAndGet(long jobId)
 
int queryNumIndexedChunks()
 
List< FileTextExtractor > textExtractors
 
void tryConnect(String host, int port)
 
static String getIndexingServerPort()
 
TskData.TSK_DB_FILES_TYPE_ENUM getType()
 
METADATA_INGESTED
The file had no content, so only its metadata was ingested. 
 
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
 
StringsTextExtractor stringExtractor
 
void startUp(IngestJobContext context)
 
synchronized void startJob(long jobId, long dataSourceId, List< String > keywordListNames)
 
static synchronized Server getServer()
 
synchronized long incrementAndGet(long jobId)
 
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
 
final KeywordSearchJobSettings settings
 
SKIPPED_ERROR_INDEXING
The file was skipped because the indexing engine encountered problems. 
 
boolean extractStringsAndIndex(AbstractFile aFile)
 
TskData.FileKnown getKnown()
 
static synchronized SearchRunner getInstance()
 
int queryNumIndexedDocuments()
 
void postMessage(final IngestMessage message)
 
boolean fileIngestIsCancelled()
 
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues. 
 
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
 
ProcessResult process(AbstractFile abstractFile)
 
static void error(String title, String message)
 
void indexFile(AbstractFile aFile, boolean indexContent)
 
static Case getCurrentCase()
 
synchronized static Logger getLogger(String name)
 
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
 
String getFileType(AbstractFile file)
 
static String getIndexingServerHost()
 
static void warn(String title, String message)
 
static synchronized IngestServices getInstance()
 
STRINGS_INGESTED
Text was extracted based on the detected file type and ingested.