19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.util.ArrayList;
22 import java.util.HashMap;
23 import java.util.List;
25 import java.util.concurrent.atomic.AtomicInteger;
26 import java.util.logging.Level;
27 import org.openide.util.NbBundle;
28 import org.openide.util.NbBundle.Messages;
// Localized NbBundle message keys used during module initialization/startup.
// Entries of the form "# {0} - ..." document the {0} placeholder for the
// message that follows them. Referenced below via the generated Bundle class.
57 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
58 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
59 "SolrConnectionCheck.Port=Invalid port number.",
60 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
61 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
62 "CannotRunFileTypeDetection=Unable to run file type detection."
// Preset intervals controlling how often periodic keyword-search updates run.
// NONE uses Integer.MAX_VALUE to effectively disable periodic updates.
// NOTE(review): units are presumably minutes — confirm against callers;
// other enum constants and the rest of the body are not visible in this view.
66 enum UpdateFrequency {
72 NONE(Integer.MAX_VALUE),
// Interval value carried by each constant (see NOTE above about units).
74 private final int time;
76 UpdateFrequency(
int time) {
// Handle used to push extracted text/metadata into the Solr index; assigned in startUp().
86 private Ingester ingester = null;
// True once this instance has kicked off its IngestSearchRunner search job (see process()).
92 private boolean startedSearching =
false;
// Per-job settings (enabled keyword lists, etc.) supplied at construction.
95 private final KeywordSearchJobSettings
settings;
// Guards process(): set true only after startUp() completes successfully.
96 private boolean initialized =
false;
// Monotonic counter handing out instanceNum values across all module instances.
99 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
private int instanceNum = 0;
// Per-ingest-job map of fileId -> IngestStatus; all access is synchronized
// on this map itself (multiple ingest threads report status concurrently).
113 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
// Records the indexing status of one file under its ingest job, lazily
// creating the per-job map. Synchronizes on ingestStatus because several
// ingest threads may report status at once.
124 synchronized (ingestStatus) {
125 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
126 if (ingestStatusForJob == null) {
127 ingestStatusForJob =
new HashMap<>();
128 ingestStatus.put(ingestJobId, ingestStatusForJob);
130 ingestStatusForJob.put(fileId, status);
// NOTE(review): this second put() is redundant when the per-job map was
// already present (the reference is unchanged), but it is harmless.
131 ingestStatus.put(ingestJobId, ingestStatusForJob);
// Constructor body fragment: stash the job settings and take a unique
// instance number (used in log messages to tell instances apart).
136 this.settings = settings;
137 instanceNum = instanceCount.getAndIncrement();
// Additional localized messages for startUp(): index schema/version
// incompatibility and the no-open-case error.
146 "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
147 "# {0} - Solr version number",
"KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
148 "# {0} - schema version number",
"KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
149 "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
// startUp() fragment: validate the open Solr core, then (single-user case,
// presumably — enclosing branches are not visible here) parse the configured
// port and verify the Solr server is reachable before allowing ingest.
158 if (server.coreIsOpen() ==
false) {
// Reject indexes whose Solr or schema version does not match the current
// supported versions; adding text to older indexes is unsupported.
163 Index indexInfo = server.getIndexInfo();
164 if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
165 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
167 if (!IndexFinder.getCurrentSchemaVersion().equals(indexInfo.getSchemaVersion())) {
168 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
// Reached from a catch block not visible here: could not read index info.
171 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
180 ingester = Ingester.getDefault();
181 this.context = context;
// Parse the configured Solr port; a malformed value aborts module startup
// with a localized error rather than failing later on connect.
198 port = Integer.parseInt(properties.getPort());
199 }
catch (NumberFormatException ex) {
201 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
// Probe connectivity to the (multi-user, presumably) Solr host:port.
204 kwsService.
tryConnect(properties.getHost(), port);
// Local server path: refuse to start ingest if Solr is not running; the
// tryStopSolrMsg text tells the user how to recover from a stuck process.
211 if (!server.isRunning()) {
212 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
216 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
223 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
// startUp() continuation: determine whether any enabled keyword list actually
// contains keywords; if none do, only indexing (no searching) will occur and
// the user is warned via the onlyIdxKwSkipMsg message below.
227 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
228 boolean hasKeywordsForSearch =
false;
230 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
231 hasKeywordsForSearch =
true;
235 if (!hasKeywordsForSearch) {
237 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
// Set up the text extractors: a strings extractor (configured with the
// user-selected scripts/options) plus HTML and Tika extractors tried in
// order of addition (see the extractor-selection loop later in the file).
243 stringExtractor =
new StringsTextExtractor();
244 stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
245 stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
247 textExtractors =
new ArrayList<>();
249 textExtractors.add(
new HtmlTextExtractor());
250 textExtractors.add(
new TikaTextExtractor());
// process() fragment: bail out (with a SEVERE log) if startUp() never
// completed, skip virtual directories and — when configured — known files,
// then lazily start the keyword search job the first time a file arrives.
258 if (initialized ==
false)
260 logger.log(Level.SEVERE,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
// Virtual directories carry no content worth indexing.
265 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
// Optionally skip NSRL-known files per user settings.
270 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
// Start the periodic search job once per module instance, after the first
// file is queued, using the job's enabled keyword lists.
286 if (!startedSearching) {
290 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
291 IngestSearchRunner.getInstance().startJob(context, keywordListNames);
292 startedSearching =
true;
// shutDown() fragment: stop or end the search job depending on cancellation,
// log index statistics, then drop per-job state and extractor references.
304 logger.log(Level.INFO,
"Keyword search ingest module instance {0} shutting down", instanceNum);
// Nothing to tear down if startup never finished or there is no context.
306 if ((initialized ==
false) || (context == null)) {
// Cancelled ingest: stop the search job immediately rather than letting it
// finish its final search pass.
311 logger.log(Level.INFO,
"Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum);
312 IngestSearchRunner.getInstance().stopJob(jobId);
// Normal completion: endJob() lets the runner finish outstanding searches.
318 IngestSearchRunner.getInstance().endJob(jobId);
// Report index counts; failures here are logged but do not fail shutdown.
324 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
326 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
328 logger.log(Level.SEVERE,
"Error executing Solr queries to check number of indexed files and file chunks", ex);
// Discard this job's per-file status map (same lock as putIngestStatus).
331 synchronized (ingestStatus) {
332 ingestStatus.remove(jobId);
// Release extractor references so they can be garbage collected.
343 textExtractors.clear();
344 textExtractors = null;
345 stringExtractor = null;
// postIndexSummary() fragment: tally per-file outcomes for this job by
// walking the job's IngestStatus map (under the shared ingestStatus lock).
// The switch arms visible below increment the corresponding counters.
354 int text_ingested = 0;
355 int metadata_ingested = 0;
356 int strings_ingested = 0;
361 synchronized (ingestStatus) {
362 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
363 if (ingestStatusForJob == null) {
371 case METADATA_INGESTED:
374 case STRINGS_INGESTED:
377 case SKIPPED_ERROR_TEXTEXTRACT:
380 case SKIPPED_ERROR_INDEXING:
383 case SKIPPED_ERROR_IO:
// Build an HTML table summarizing the indexing results (one row per
// counter), log it, and surface error/warning notifications to the user
// when any files failed to index.
392 StringBuilder msg =
new StringBuilder();
// Row: files with text successfully indexed.
393 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
// Row: files indexed via string extraction only.
394 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
// Row: files with only metadata indexed.
395 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
// Row: indexing-engine errors.
396 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
// Row: text-extraction errors.
397 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
// Row: I/O errors while reading file content.
398 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
399 msg.append(
"</table>");
400 String indexStats = msg.toString();
401 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
// Index errors get a full error notification; extraction/IO problems only a warning.
403 if (error_index > 0) {
405 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
406 }
else if (error_io + error_text > 0) {
407 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
408 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
// extractTextAndIndex() fragment: pick the first registered extractor that
// supports this file's detected MIME type (extractors were added in priority
// order during startUp) and hand the file to the Ingester.
434 ContentTextExtractor extractor = null;
437 for (ContentTextExtractor fe : textExtractors) {
438 if (fe.isSupported(aFile, detectedFormat)) {
// No extractor claims this format; caller falls back elsewhere.
444 if (extractor == null) {
451 return Ingester.getDefault().indexText(extractor, aFile, context);
// extractStringsAndIndex() fragment: WARNING logs for string-extraction
// failures — first a parameterized no-exception form, then the
// IngesterException form with the cause attached.
471 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
475 }
catch (IngesterException ex) {
476 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
// Decide how to index one file: unallocated/unused blocks get string
// extraction only; empty files/directories (or indexContent == false) get
// metadata-only indexing; archives get metadata-only (their contents are,
// presumably, indexed separately after expansion — confirm with the embedded
// file extractor module); everything else attempts full text extraction,
// falling back to string extraction when text extraction fails or the type
// is an opaque octet-stream.
489 private void indexFile(AbstractFile aFile,
boolean indexContent) {
492 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
// Unallocated/unused blocks: raw data, so only string extraction applies.
495 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
499 extractStringsAndIndex(aFile);
503 final long size = aFile.getSize();
// Nothing to extract from directories or zero-byte files; index metadata only.
506 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
511 ingester.indexMetaDataOnly(aFile);
513 }
catch (IngesterException ex) {
515 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
// Detect the MIME type to drive extractor selection below.
523 String fileType = fileTypeDetector.
getMIMEType(aFile);
// Archives: index metadata only here (see note in the header comment).
527 if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
532 ingester.indexMetaDataOnly(aFile);
534 }
catch (IngesterException ex) {
536 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
541 boolean wasTextAdded =
false;
// Opaque binary data: skip text extraction, go straight to strings.
549 if (fileType.equals(
"application/octet-stream")) {
550 extractStringsAndIndex(aFile);
553 if (!extractTextAndIndex(aFile, fileType)) {
// Tika failures are expected for some files: INFO for IngesterException,
// WARNING for anything else unexpected.
561 }
catch (IngesterException e) {
562 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
563 + aFile.getName(), e);
565 }
catch (Exception e) {
566 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
567 + aFile.getName(), e);
// Text extraction produced nothing: fall back to string extraction.
572 if (wasTextAdded ==
false) {
573 extractStringsAndIndex(aFile);
int queryNumIndexedFiles()
List< ContentTextExtractor > textExtractors
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
int queryNumIndexedChunks()
void tryConnect(String host, int port)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
METADATA_INGESTED
No content, so we just text_ingested metadata.
String getCaseDirectory()
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
StringsTextExtractor stringExtractor
void startUp(IngestJobContext context)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
String getMIMEType(AbstractFile file)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because index engine had problems.
boolean extractStringsAndIndex(AbstractFile aFile)
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing file type and text_ingested.