19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.util.ArrayList;
22 import java.util.HashMap;
23 import java.util.List;
25 import java.util.concurrent.atomic.AtomicInteger;
26 import java.util.logging.Level;
27 import org.openide.util.NbBundle;
56 "# {0} - Reason for not starting Solr",
"KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
57 "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
58 "SolrConnectionCheck.Port=Invalid port number.",
59 "# {0} - Reason for not connecting to Solr",
"KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
60 "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
61 "CannotRunFileTypeDetection=Unable to run file type detection."
65 enum UpdateFrequency {
71 NONE(Integer.MAX_VALUE),
73 private final int time;
75 UpdateFrequency(
int time) {
85 private Ingester ingester = null;
91 private boolean startedSearching =
false;
94 private final KeywordSearchJobSettings
settings;
95 private boolean initialized =
false;
98 private static final AtomicInteger instanceCount =
new AtomicInteger(0);
99 private int instanceNum = 0;
112 private static final Map<Long, Map<Long, IngestStatus>> ingestStatus =
new HashMap<>();
122 synchronized (ingestStatus) {
123 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
124 if (ingestStatusForJob == null) {
125 ingestStatusForJob =
new HashMap<>();
126 ingestStatus.put(ingestJobId, ingestStatusForJob);
128 ingestStatusForJob.put(fileId, status);
129 ingestStatus.put(ingestJobId, ingestStatusForJob);
/**
 * Constructs an instance of the keyword search ingest module, capturing the
 * per-job settings and assigning this instance a unique sequence number
 * (used for logging/diagnostics).
 *
 * NOTE(review): the constructor frame was missing from this garbled span;
 * it was reconstructed around the two surviving body statements using the
 * declared field type (KeywordSearchJobSettings) — confirm against the
 * original source.
 *
 * @param settings keyword search settings for the ingest job
 */
KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
    this.instanceNum = instanceCount.getAndIncrement();
    this.settings = settings;
}
150 if (server.coreIsOpen() ==
false) {
160 this.context = context;
171 }
catch (NumberFormatException ex) {
173 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() +
" " + Bundle.SolrConnectionCheck_Port(), ex);
183 if (!server.isRunning()) {
184 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
188 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
195 throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
199 List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
200 boolean hasKeywordsForSearch =
false;
202 if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
203 hasKeywordsForSearch =
true;
207 if (!hasKeywordsForSearch) {
209 NbBundle.getMessage(this.getClass(),
"KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
215 stringExtractor =
new StringsTextExtractor();
216 stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
217 stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
219 textExtractors =
new ArrayList<>();
221 textExtractors.add(
new HtmlTextExtractor());
222 textExtractors.add(
new TikaTextExtractor());
230 if (initialized ==
false)
232 logger.log(Level.WARNING,
"Skipping processing, module not initialized, file: {0}", abstractFile.getName());
237 if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
242 if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
258 if (!startedSearching) {
262 List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
264 startedSearching =
true;
276 logger.log(Level.INFO,
"Instance {0}", instanceNum);
278 if ((initialized ==
false) || (context == null)) {
293 synchronized (ingestStatus) {
294 ingestStatus.remove(jobId);
303 logger.log(Level.INFO,
"Indexed files count: {0}", numIndexedFiles);
304 logger.log(Level.INFO,
"Indexed file chunks count: {0}", numIndexedChunks);
306 logger.log(Level.WARNING,
"Error executing Solr query to check number of indexed files/chunks: ", ex);
316 logger.log(Level.INFO,
"stop()");
327 textExtractors.clear();
328 textExtractors = null;
329 stringExtractor = null;
338 int text_ingested = 0;
339 int metadata_ingested = 0;
340 int strings_ingested = 0;
345 synchronized (ingestStatus) {
346 Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
347 if (ingestStatusForJob == null) {
355 case METADATA_INGESTED:
358 case STRINGS_INGESTED:
361 case SKIPPED_ERROR_TEXTEXTRACT:
364 case SKIPPED_ERROR_INDEXING:
367 case SKIPPED_ERROR_IO:
376 StringBuilder msg =
new StringBuilder();
377 msg.append(
"<table border=0><tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append(
"</td><td>").append(text_ingested).append(
"</td></tr>");
378 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append(
"</td><td>").append(strings_ingested).append(
"</td></tr>");
379 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append(
"</td><td>").append(metadata_ingested).append(
"</td></tr>");
380 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append(
"</td><td>").append(error_index).append(
"</td></tr>");
381 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append(
"</td><td>").append(error_text).append(
"</td></tr>");
382 msg.append(
"<tr><td>").append(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append(
"</td><td>").append(error_io).append(
"</td></tr>");
383 msg.append(
"</table>");
384 String indexStats = msg.toString();
385 logger.log(Level.INFO,
"Keyword Indexing Completed: {0}", indexStats);
387 if (error_index > 0) {
389 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
390 }
else if (error_io + error_text > 0) {
391 MessageNotifyUtil.
Notify.
warn(NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
392 NbBundle.getMessage(
this.getClass(),
"KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
418 TextExtractor fileExtract = null;
421 for (TextExtractor fe : textExtractors) {
422 if (fe.isSupported(aFile, detectedFormat)) {
428 if (fileExtract == null) {
429 logger.log(Level.INFO,
"No text extractor found for file id:{0}, name: {1}, detected format: {2}",
new Object[]{aFile.getId(), aFile.getName(), detectedFormat});
435 return fileExtract.index(aFile, context);
455 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file ''{0}'' (id: {1}).",
new Object[]{aFile.getName(), aFile.getId()});
459 }
catch (IngesterException ex) {
460 logger.log(Level.WARNING,
"Failed to extract strings and ingest, file '" + aFile.getName() +
"' (id: " + aFile.getId() +
").", ex);
477 for (TextExtractor extractor : textExtractors) {
478 if (extractor.isContentTypeSpecific() ==
true
479 && extractor.isSupported(aFile, detectedFormat)) {
493 private void indexFile(AbstractFile aFile,
boolean indexContent) {
496 TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
499 if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
503 extractStringsAndIndex(aFile);
507 final long size = aFile.getSize();
510 if ((indexContent ==
false || aFile.isDir() || size == 0)) {
515 ingester.ingest(aFile,
false);
517 }
catch (IngesterException ex) {
519 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
530 }
catch (TskCoreException ex) {
531 logger.log(Level.SEVERE, String.format(
"Could not detect format using fileTypeDetector for file: %s", aFile), ex);
537 if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
542 ingester.ingest(aFile,
false);
544 }
catch (IngesterException ex) {
546 logger.log(Level.WARNING,
"Unable to index meta-data for file: " + aFile.getId(), ex);
551 boolean wasTextAdded =
false;
559 if (fileType.equals(
"application/octet-stream")) {
560 extractStringsAndIndex(aFile);
563 if (!extractTextAndIndex(aFile, fileType)) {
564 logger.log(Level.WARNING,
"Text extractor not found for file. Extracting strings only. File: ''{0}'' (id:{1}).",
new Object[]{aFile.getName(), aFile.getId()});
571 }
catch (IngesterException e) {
572 logger.log(Level.INFO,
"Could not extract text with Tika, " + aFile.getId() +
", "
573 + aFile.getName(), e);
575 }
catch (Exception e) {
576 logger.log(Level.WARNING,
"Error extracting text with Tika, " + aFile.getId() +
", "
577 + aFile.getName(), e);
582 if (wasTextAdded ==
false) {
583 extractStringsAndIndex(aFile);
int queryNumIndexedFiles()
boolean isTextExtractSupported(AbstractFile aFile, String detectedFormat)
FileTypeDetector fileTypeDetector
synchronized long decrementAndGet(long jobId)
int queryNumIndexedChunks()
List< TextExtractor > textExtractors
void tryConnect(String host, int port)
static String getIndexingServerPort()
METADATA_INGESTED
No content, so only the file's metadata was ingested (recorded as text_ingested).
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
StringsTextExtractor stringExtractor
void startUp(IngestJobContext context)
synchronized void startJob(long jobId, long dataSourceId, List< String > keywordListNames)
static synchronized Server getServer()
synchronized long incrementAndGet(long jobId)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
final KeywordSearchJobSettings settings
SKIPPED_ERROR_INDEXING
File was skipped because index engine had problems.
boolean extractStringsAndIndex(AbstractFile aFile)
static synchronized SearchRunner getInstance()
int queryNumIndexedDocuments()
void postMessage(final IngestMessage message)
boolean fileIngestIsCancelled()
SKIPPED_ERROR_TEXTEXTRACT
File was skipped because of text extraction issues.
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
ProcessResult process(AbstractFile abstractFile)
static void error(String title, String message)
void indexFile(AbstractFile aFile, boolean indexContent)
static Case getCurrentCase()
synchronized static Logger getLogger(String name)
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
String getFileType(AbstractFile file)
static Ingester getIngester()
static String getIndexingServerHost()
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing file type and text_ingested.