19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.util.ArrayList;
22 import java.util.HashMap;
23 import java.util.List;
25 import java.util.concurrent.atomic.AtomicInteger;
26 import java.util.logging.Level;
27 import org.openide.util.NbBundle;
28 import org.openide.util.NbBundle.Messages;
57 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
58 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
59 "SolrConnectionCheck.Port=Invalid port number.",
60 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
61 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
62 "CannotRunFileTypeDetection=Unable to run file type detection."
66 enum UpdateFrequency {
72 NONE(Integer.MAX_VALUE),
74 private final int time;
76 UpdateFrequency(
int time) {
86 private Ingester ingester = null;
92 private boolean startedSearching =
false;
95 private final KeywordSearchJobSettings
settings;
96 private boolean initialized =
false;
99 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
100 private int instanceNum = 0;
113 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
124 synchronized (ingestStatus) {
125 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
126 if (ingestStatusForJob == null) {
127 ingestStatusForJob =
new HashMap<>();
128 ingestStatus.put(ingestJobId, ingestStatusForJob);
130 ingestStatusForJob.put(fileId, status);
131 ingestStatus.put(ingestJobId, ingestStatusForJob);
136 this.settings = settings;
137 instanceNum = instanceCount.getAndIncrement();
146 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
147 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
148 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index."
157 if (server.coreIsOpen() ==
false) {
162 Index indexInfo = server.getIndexInfo();
163 if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
164 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
166 if (!IndexFinder.getCurrentSchemaVersion().equals(indexInfo.getSchemaVersion())) {
167 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
170 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
179 ingester = Ingester.getDefault();
180 this.context = context;
191 }
catch (NumberFormatException ex) {
193 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
203 if (!server.isRunning()) {
204 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
208 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
215 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
219 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
220 boolean hasKeywordsForSearch =
false;
222 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
223 hasKeywordsForSearch =
true;
227 if (!hasKeywordsForSearch) {
229 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
235 stringExtractor =
new StringsTextExtractor();
236 stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
237 stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
239 textExtractors =
new ArrayList<>();
241 textExtractors.add(
new HtmlTextExtractor());
242 textExtractors.add(
new TikaTextExtractor());
250 if (initialized ==
false)
252 logger.log(Level.WARNING,
"Skipping processing, module not initialized, file: {0}", abstractFile.
getName());
278 if (!startedSearching) {
282 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
284 startedSearching =
true;
296 logger.log(Level.INFO,
"Instance {0}", instanceNum);
298 if ((initialized ==
false) || (context == null)) {
313 synchronized (ingestStatus) {
314 ingestStatus.remove(jobId);
323 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
324 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
326 logger.log(Level.WARNING,
"Error executing Solr query to check number of indexed files/chunks: ", ex);
336 logger.log(Level.INFO,
"stop()");
347 textExtractors.clear();
348 textExtractors = null;
349 stringExtractor = null;
358 int text_ingested = 0;
359 int metadata_ingested = 0;
360 int strings_ingested = 0;
365 synchronized (ingestStatus) {
366 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
367 if (ingestStatusForJob == null) {
375 case METADATA_INGESTED:
378 case STRINGS_INGESTED:
381 case SKIPPED_ERROR_TEXTEXTRACT:
384 case SKIPPED_ERROR_INDEXING:
387 case SKIPPED_ERROR_IO:
396 StringBuilder msg =
new StringBuilder();
397 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
398 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
399 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
400 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
401 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
402 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
403 msg.append(
"</table>");
404 String indexStats = msg.toString();
405 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
407 if (error_index > 0) {
409 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
410 }
else if (error_io + error_text > 0) {
411 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
412 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
438 FileTextExtractor extractor = null;
441 for (FileTextExtractor fe : textExtractors) {
442 if (fe.isSupported(aFile, detectedFormat)) {
448 if (extractor == null) {
449 logger.log(Level.INFO,
"No text extractor found for file id:{0}, name: {1}, detected format: {2}",
new Object[]{aFile.getId(), aFile.getName(), detectedFormat});
455 return Ingester.getDefault().indexText(extractor, aFile, context);
475 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
479 }
catch (IngesterException ex) {
480 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.
getName() +
"' (id: " + aFile.
getId() +
").", ex);
503 extractStringsAndIndex(aFile);
507 final long size = aFile.
getSize();
510 if ((indexContent ==
false || aFile.
isDir() || size == 0)) {
515 ingester.indexMetaDataOnly(aFile);
517 }
catch (IngesterException ex) {
519 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.
getId(), ex);
531 logger.log(Level.SEVERE, String.format(
"Could not detect format using fileTypeDetector for file: %s", aFile), ex);
537 if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
542 ingester.indexMetaDataOnly(aFile);
544 }
catch (IngesterException ex) {
546 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.
getId(), ex);
551 boolean wasTextAdded =
false;
559 if (fileType.equals(
"application/octet-stream")) {
560 extractStringsAndIndex(aFile);
563 if (!extractTextAndIndex(aFile, fileType)) {
564 logger.log(Level.WARNING,
"Text extractor not found for file. Extracting strings only. File: ''{0}'' (id:{1}).",
new Object[]{aFile.getName(), aFile.getId()});
571 }
catch (IngesterException e) {
572 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.
getId() +
", "
575 }
catch (Exception e) {
576 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.
getId() +
", "
582 if (wasTextAdded ==
false) {
583 extractStringsAndIndex(aFile);
int queryNumIndexedFiles()
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
int queryNumIndexedChunks()
List< FileTextExtractor > textExtractors
void tryConnect(String host, int port)
static String getIndexingServerPort()
TskData.TSK_DB_FILES_TYPE_ENUM getType()
METADATA_INGESTED
File had no content, so only its metadata was indexed.
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
StringsTextExtractor stringExtractor
void startUp(IngestJobContext context)
synchronized void startJob(long jobId, long dataSourceId, List< String > keywordListNames)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because the indexing engine had problems.
boolean extractStringsAndIndex(AbstractFile aFile)
TskData.FileKnown getKnown()
static synchronized SearchRunner getInstance()
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
static Case getCurrentCase()
synchronized static Logger getLogger(String name)
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
String getFileType(AbstractFile file)
static String getIndexingServerHost()
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted using the detected file type and then indexed.