Autopsy  4.21.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2023 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  *     http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
21 import;
22 import;
23 import;
24 import;
25 import;
26 import;
27 import java.text.ParseException;
28 import java.text.SimpleDateFormat;
29 import java.util.ArrayList;
30 import java.util.Collection;
31 import java.util.Date;
32 import java.util.HashMap;
33 import java.util.List;
34 import static java.util.Locale.US;
35 import java.util.Map;
36 import java.util.Optional;
37 import java.util.concurrent.atomic.AtomicInteger;
38 import java.util.logging.Level;
39 import;
40 import org.apache.tika.mime.MimeTypes;
41 import org.openide.util.Lookup;
42 import org.openide.util.NbBundle;
43 import org.openide.util.NbBundle.Messages;
44 import org.openide.util.lookup.Lookups;
65 import org.sleuthkit.datamodel.AbstractFile;
66 import org.sleuthkit.datamodel.Blackboard;
67 import org.sleuthkit.datamodel.BlackboardArtifact;
68 import org.sleuthkit.datamodel.BlackboardAttribute;
69 import org.sleuthkit.datamodel.TskCoreException;
70 import org.sleuthkit.datamodel.TskData;
71 import org.sleuthkit.datamodel.TskData.FileKnown;
72 import org.sleuthkit.datamodel.TskException;
82 @NbBundle.Messages({
83  "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
84  "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
85  "SolrConnectionCheck.Port=Invalid port number.",
86  "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
87  "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
88  "CannotRunFileTypeDetection=Unable to run file type detection."
89 })
90 public final class KeywordSearchIngestModule implements FileIngestModule {
92  private static final int LIMITED_OCR_SIZE_MIN = 100 * 1024;
98  static final List<String> ARCHIVE_MIME_TYPES
99  = ImmutableList.of(
100  //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
101  "application/x-7z-compressed", //NON-NLS
102  "application/x-ace-compressed", //NON-NLS
103  "application/x-alz-compressed", //NON-NLS
104  "application/x-arj", //NON-NLS
105  "application/", //NON-NLS
106  "application/x-cfs-compressed", //NON-NLS
107  "application/x-dgc-compressed", //NON-NLS
108  "application/x-apple-diskimage", //NON-NLS
109  "application/x-gca-compressed", //NON-NLS
110  "application/x-dar", //NON-NLS
111  "application/x-lzx", //NON-NLS
112  "application/x-lzh", //NON-NLS
113  "application/x-rar-compressed", //NON-NLS
114  "application/x-stuffit", //NON-NLS
115  "application/x-stuffitx", //NON-NLS
116  "application/x-gtar", //NON-NLS
117  "application/x-archive", //NON-NLS
118  "application/x-executable", //NON-NLS
119  "application/x-gzip", //NON-NLS
120  "application/zip", //NON-NLS
121  "application/x-zoo", //NON-NLS
122  "application/x-cpio", //NON-NLS
123  "application/x-shar", //NON-NLS
124  "application/x-tar", //NON-NLS
125  "application/x-bzip", //NON-NLS
126  "application/x-bzip2", //NON-NLS
127  "application/x-lzip", //NON-NLS
128  "application/x-lzma", //NON-NLS
129  "application/x-lzop", //NON-NLS
130  "application/x-z", //NON-NLS
131  "application/x-compress"); //NON-NLS
133  private static final List<String> METADATA_DATE_TYPES
134  = ImmutableList.of(
135  "Last-Save-Date", //NON-NLS
136  "Last-Printed", //NON-NLS
137  "Creation-Date"); //NON-NLS
139  private static final Map<String, BlackboardAttribute.ATTRIBUTE_TYPE> METADATA_TYPES_MAP = ImmutableMap.<String, BlackboardAttribute.ATTRIBUTE_TYPE>builder()
140  .put("Last-Save-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED)
141  .put("Last-Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID)
142  .put("Creation-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED)
143  .put("Company", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION)
144  .put("Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER)
145  .put("Application-Name", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
146  .put("Last-Printed", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME)
147  .put("Producer", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
148  .put("Title", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION)
149  .put("pdf:PDFVersion", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION)
150  .build();
152  private static final String IMAGE_MIME_TYPE_PREFIX = "image/";
154  // documents where OCR is performed
155  private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
156  "application/pdf",
157  "application/msword",
158  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
159  "application/",
160  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
161  "application/",
162  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
163  );
168  enum StringsExtractOptions {
171  };
173  private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
174  private final IngestServices services = IngestServices.getInstance();
175  private Ingester ingester = null;
177 //only search images from current ingest, not images previously ingested/indexed
178  //accessed read-only by searcher thread
180  private Lookup stringsExtractionContext;
182  private boolean initialized = false;
183  private long jobId;
184  private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
185  private int instanceNum = 0;
186  private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
189  private enum IngestStatus {
197  };
198  private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
208  private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
209  synchronized (ingestStatus) {
210  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
211  if (ingestStatusForJob == null) {
212  ingestStatusForJob = new HashMap<>();
213  ingestStatus.put(ingestJobId, ingestStatusForJob);
214  }
215  ingestStatusForJob.put(fileId, status);
216  ingestStatus.put(ingestJobId, ingestStatusForJob);
217  }
218  }
221  this.settings = settings;
222  instanceNum = instanceCount.getAndIncrement();
223  }
/**
 * Prepares this module instance for an ingest job.
 *
 * Visible steps: when Solr indexing is enabled, require an open core and a
 * compatible index schema; create the file type detector; obtain the
 * default ingester; for the first instance of a job (reference count == 1)
 * verify the Solr connection and warn if indexing is disabled or no enabled
 * keyword list has keywords; finally build the strings-extraction context
 * and mark the instance initialized.
 *
 * NOTE(review): this listing is missing a few source lines inside this
 * method (the catch header after the FileTypeDetector construction, the
 * declaration of 'properties', and the catch header after
 * queryNumIndexedDocuments()) - consult the real source before editing.
 *
 * @param context the ingest job context; its job id is stored in jobId
 * @throws IngestModuleException if any of the checks above fails
 */
230  @Messages({
231  "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
232  "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
233  "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
234  "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
235  })
236  @Override
237  public void startUp(IngestJobContext context) throws IngestModuleException {
238  initialized = false;
239  jobId = context.getJobId();
 // server stays null when indexing to Solr is disabled
241  Server server = null;
242  if (settings.isIndexToSolrEnabled()) {
243  server = KeywordSearch.getServer();
244  if (server.coreIsOpen() == false) {
245  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
246  }
248  try {
249  Index indexInfo = server.getIndexInfo();
250  if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
251  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
252  }
253  } catch (NoOpenCoreException ex) {
254  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
255  }
256  }
258  try {
259  fileTypeDetector = new FileTypeDetector();
261  throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
262  }
264  ingester = Ingester.getDefault();
265  this.context = context;
267  // increment the module reference count
268  // if first instance of this module for this job then check the server and existence of keywords
269  Case openCase;
270  try {
271  openCase = Case.getCurrentCaseThrows();
272  } catch (NoCurrentCaseException ex) {
273  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_noOpenCase_errMsg(), ex);
274  }
275  if (refCounter.incrementAndGet(jobId) == 1) {
276  if (openCase.getCaseType() == Case.CaseType.MULTI_USER_CASE) {
277  // for multi-user cases need to verify connection to remote SOLR server
278  KeywordSearchService kwsService = new SolrSearchService();
280  int port;
281  try {
282  port = Integer.parseInt(properties.getPort());
283  } catch (NumberFormatException ex) {
284  // if there is an error parsing the port number
285  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
286  }
287  try {
288  kwsService.tryConnect(properties.getHost(), port);
289  } catch (KeywordSearchServiceException ex) {
290  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
291  }
292  } else {
293  // for single-user cases need to verify connection to local SOLR service
294  // server will be null if indexing is disabled
295  if (server != null) {
296  try {
297  if (!server.isLocalSolrRunning()) {
298  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
299  }
300  } catch (KeywordSearchModuleException ex) {
301  //this means Solr is not properly initialized
302  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
303  }
304  try {
305  // make an actual query to verify that server is responding
306  // we had cases where getStatus was OK, but the connection resulted in a 404
307  server.queryNumIndexedDocuments();
309  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
310  }
311  }
312  // check if this job has any searchable keywords
313  List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
314  boolean hasKeywordsForSearch = false;
315  for (KeywordList keywordList : keywordLists) {
316  if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
317  hasKeywordsForSearch = true;
318  break;
319  }
320  }
 // warn the user: either indexing is off entirely, or indexing is on but nothing will be searched
322  if (!settings.isIndexToSolrEnabled()) {
323  services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.SolrIndexingDisabled"),
324  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.indexingDisabled")));
325  } else {
326  if (!hasKeywordsForSearch) {
327  services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
328  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
329  }
330  }
331  }
332  }
 // configuration for the strings extractor, read from the global keyword search settings
334  StringsConfig stringsConfig = new StringsConfig();
335  Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
336  stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
337  stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
338  stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());
340  stringsExtractionContext = Lookups.fixed(stringsConfig);
342  initialized = true;
343  }
345  @Override
346  public ProcessResult process(AbstractFile abstractFile) {
347  if (initialized == false) //error initializing indexing/Solr
348  {
349  logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
350  putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
351  return ProcessResult.OK;
352  }
354  if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
355  //skip indexing of virtual dirs (no content, no real name) - will index children files
356  return ProcessResult.OK;
357  }
359  // if ocr only is enabled and not an ocr file, return
360  Optional<TextExtractor> extractorOpt = getExtractor(abstractFile);
362  String mimeType = fileTypeDetector.getMIMEType(abstractFile).trim().toLowerCase();
364  if (settings.isOCREnabled()) {
365  // if ocr only and the extractor is not present or will not perform ocr on this file, continue
366  if (settings.isOCROnly() && (!extractorOpt.isPresent() || !extractorOpt.get().willUseOCR())) {
367  return ProcessResult.OK;
368  }
370  // if limited ocr is enabled, the extractor will use ocr, and
371  // the file would not be subject to limited ocr reading, continue
372  if (settings.isLimitedOCREnabled() && extractorOpt.isPresent()
373  && extractorOpt.get().willUseOCR() && !isLimitedOCRFile(abstractFile, mimeType)) {
374  return ProcessResult.OK;
375  }
376  }
378  if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
379  //index meta-data only
380  if (context.fileIngestIsCancelled()) {
381  return ProcessResult.OK;
382  }
383  searchFile(extractorOpt, abstractFile, mimeType, false);
384  return ProcessResult.OK;
385  }
387  //index the file and content (if the content is supported)
388  if (context.fileIngestIsCancelled()) {
389  return ProcessResult.OK;
390  }
391  searchFile(extractorOpt, abstractFile, mimeType, true);
393  return ProcessResult.OK;
394  }
/**
 * Shuts this module instance down at the end of an ingest job.
 *
 * Does nothing beyond logging if the instance was never initialized or has
 * no context. On cancellation it only calls cleanup(). Otherwise, the last
 * instance for the job (reference count reaches 0) creates the search
 * artifacts, commits the index, logs index counts, posts the summary and
 * drops the job's entry from ingestStatus; every instance then calls
 * cleanup().
 *
 * NOTE(review): the cancellation path returns before
 * refCounter.decrementAndGet(jobId) and before ingestStatus.remove(jobId)
 * are reached - confirm that is intended.
 * NOTE(review): the catch header belonging to the second try block is
 * missing from this listing.
 */
400  @Override
401  public void shutDown() {
402  logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS
404  if ((initialized == false) || (context == null)) {
405  return;
406  }
408  if (context.fileIngestIsCancelled()) {
409  logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping due to ingest cancellation", instanceNum); //NON-NLS
410  cleanup();
411  return;
412  }
414  // We only need to post the summary msg from the last module per job
415  if (refCounter.decrementAndGet(jobId) == 0) {
417  try {
418  InlineSearcher.makeArtifacts(context);
419  InlineSearcher.cleanup(context);
420  Ingester.getDefault().commit();
421  } catch (TskException ex) {
422  logger.log(Level.SEVERE, String.format("Failed to create search ingest artifacts for job %d", context.getJobId()), ex);
423  }
 // index counts are queried for logging only; a failure here is logged and does not stop the summary
425  try {
426  final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
427  logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
428  final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
429  logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
431  logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
432  }
433  postIndexSummary();
 // the per-file statuses of this job are no longer needed once the summary is posted
434  synchronized (ingestStatus) {
435  ingestStatus.remove(jobId);
436  }
437  }
439  cleanup();
440  }
445  private void cleanup() {
446  stringsExtractionContext = null;
447  initialized = false;
448  }
460  private boolean isLimitedOCRFile(AbstractFile aFile, String mimeType) {
461  if (OCR_DOCUMENTS.contains(mimeType)) {
462  return true;
463  }
465  if (mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
466  return aFile.getSize() > LIMITED_OCR_SIZE_MIN
467  || aFile.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.DERIVED;
468  }
470  return false;
471  }
/**
 * Posts a summary of this job's indexing results to the ingest inbox.
 *
 * Tallies the per-file statuses recorded for jobId, renders them as an HTML
 * table, logs and posts the table, and raises an error notification when
 * there were indexing errors, or a warning when there were only I/O or
 * text-extraction errors. Returns silently if nothing was recorded for the
 * job.
 *
 * NOTE(review): five case labels of the switch are missing from this
 * listing (only TEXT_INGESTED and default are visible), so which status
 * feeds which counter cannot be confirmed from here.
 */
476  private void postIndexSummary() {
477  int text_ingested = 0;
478  int metadata_ingested = 0;
479  int strings_ingested = 0;
480  int error_text = 0;
481  int error_index = 0;
482  int error_io = 0;
 // tally under the lock; the map is shared by all module instances
484  synchronized (ingestStatus) {
485  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
486  if (ingestStatusForJob == null) {
487  return;
488  }
489  for (IngestStatus s : ingestStatusForJob.values()) {
490  switch (s) {
491  case TEXT_INGESTED:
492  text_ingested++;
493  break;
495  metadata_ingested++;
496  break;
498  strings_ingested++;
499  break;
501  error_text++;
502  break;
504  error_index++;
505  break;
507  error_io++;
508  break;
509  default:
510  ;
511  }
512  }
513  }
 // one table row per counter; labels come from the resource bundle
515  StringBuilder msg = new StringBuilder();
516  msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
517  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
518  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
519  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
520  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
521  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
522  msg.append("</table>"); //NON-NLS
523  String indexStats = msg.toString();
524  logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
525  services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
 // indexing errors take precedence over read/extraction errors when notifying
526  if (error_index > 0) {
527  MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
528  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
529  } else if (error_io + error_text > 0) {
530  MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
531  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
532  }
533  }
/**
 * Looks up a text extractor for the given file.
 *
 * The extraction context carries an ImageConfig whose OCR flag mirrors the
 * job settings and a ProcessTerminator that reports ingest cancellation.
 *
 * NOTE(review): the catch header between the two return statements is
 * missing from this listing, so the exception type that leads to the empty
 * result cannot be confirmed from here.
 *
 * @param abstractFile the file to find an extractor for
 * @return the extractor returned by TextExtractorFactory, or an empty
 *         Optional if the factory returned null or the (not visible) catch
 *         clause was taken
 */
535  private Optional<TextExtractor> getExtractor(AbstractFile abstractFile) {
536  ImageConfig imageConfig = new ImageConfig();
537  imageConfig.setOCREnabled(settings.isOCREnabled());
538  ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
539  Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
540  try {
541  return Optional.ofNullable(TextExtractorFactory.getExtractor(abstractFile, extractionContext));
543  return Optional.empty();
544  }
545  }
566  private boolean extractTextAndSearch(Optional<TextExtractor> extractorOptional, AbstractFile aFile,
567  Map<String, String> extractedMetadata) throws IngesterException {
569  try {
570  if (!extractorOptional.isPresent()) {
571  return false;
572  }
573  //divide into chunks and index
574  Ingester.getDefault().search(getTikaOrTextExtractor(extractorOptional, aFile, extractedMetadata), aFile.getId(), aFile.getName(), aFile, context, true,settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
576  } catch (TextExtractor.InitReaderException ex) {
577  return false;
578  } catch(Exception ex) {
579  logger.log(Level.WARNING, String.format("Failed to search file %s [id=%d]",
580  aFile.getName(), aFile.getId()), ex);
581  return false;
582  }
584  return true;
585  }
587  private Reader getTikaOrTextExtractor(Optional<TextExtractor> extractorOptional, AbstractFile aFile,
588  Map<String, String> extractedMetadata) throws TextExtractor.InitReaderException {
590  TextExtractor extractor = extractorOptional.get();
591  Reader fileText = extractor.getReader();
592  Reader finalReader;
593  try {
594  Map<String, String> metadata = extractor.getMetadata();
595  if (!metadata.isEmpty()) {
596  // Creating the metadata artifact here causes occasional problems
597  // when indexing the text, so we save the metadata map to
598  // use after this method is complete.
599  extractedMetadata.putAll(metadata);
600  }
601  CharSource formattedMetadata = getMetaDataCharSource(metadata);
602  //Append the metadata to end of the file text
603  finalReader = CharSource.concat(new CharSource() {
604  //Wrap fileText reader for concatenation
605  @Override
606  public Reader openStream() throws IOException {
607  return fileText;
608  }
609  }, formattedMetadata).openStream();
610  } catch (IOException ex) {
611  logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
612  aFile.getName(), aFile.getId()), ex);
613  //Just send file text.
614  finalReader = fileText;
615  }
616  //divide into chunks and index
617  return finalReader;
619  }
621  private void createMetadataArtifact(AbstractFile aFile, Map<String, String> metadata) {
623  String moduleName = KeywordSearchIngestModule.class.getName();
625  Collection<BlackboardAttribute> attributes = new ArrayList<>();
626  Collection<BlackboardArtifact> bbartifacts = new ArrayList<>();
627  for (Map.Entry<String, String> entry : metadata.entrySet()) {
628  if (METADATA_TYPES_MAP.containsKey(entry.getKey())) {
629  BlackboardAttribute bba = checkAttribute(entry.getKey(), entry.getValue());
630  if (bba != null) {
631  attributes.add(bba);
632  }
633  }
634  }
635  if (!attributes.isEmpty()) {
636  try {
637  BlackboardArtifact bbart = aFile.newDataArtifact(new BlackboardArtifact.Type(BlackboardArtifact.ARTIFACT_TYPE.TSK_METADATA), attributes);
638  bbartifacts.add(bbart);
639  } catch (TskCoreException ex) {
640  // Log error and return to continue processing
641  logger.log(Level.WARNING, String.format("Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
642  return;
643  }
644  if (!bbartifacts.isEmpty()) {
645  try {
646  Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboard().postArtifacts(bbartifacts, moduleName, jobId);
647  } catch (NoCurrentCaseException | Blackboard.BlackboardException ex) {
648  // Log error and return to continue processing
649  logger.log(Level.WARNING, String.format("Unable to post blackboard artifacts for file $s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
650  return;
651  }
652  }
653  }
654  }
656  private BlackboardAttribute checkAttribute(String key, String value) {
657  String moduleName = KeywordSearchIngestModule.class.getName();
658  if (!value.isEmpty() && value.charAt(0) != ' ') {
659  if (METADATA_DATE_TYPES.contains(key)) {
660  SimpleDateFormat metadataDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", US);
661  Long metadataDateTime = Long.valueOf(0);
662  try {
663  String metadataDate = value.replaceAll("T", " ").replaceAll("Z", "");
664  Date usedDate = metadataDateFormat.parse(metadataDate);
665  metadataDateTime = usedDate.getTime() / 1000;
666  return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, metadataDateTime);
667  } catch (ParseException ex) {
668  // catching error and displaying date that could not be parsed then will continue on.
669  logger.log(Level.WARNING, String.format("Failed to parse date/time %s for metadata attribute %s.", value, key), ex); //NON-NLS
670  return null;
671  }
672  } else {
673  return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, value);
674  }
675  }
677  return null;
679  }
688  @NbBundle.Messages({
689  "KeywordSearchIngestModule.metadataTitle=METADATA"
690  })
691  static CharSource getMetaDataCharSource(Map<String, String> metadata) {
692  return CharSource.wrap(new StringBuilder(
693  String.format("\n\n------------------------------%s------------------------------\n\n",
694  Bundle.KeywordSearchIngestModule_metadataTitle()))
695  .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
696  .map(entry -> entry.getKey() + ": " + entry.getValue())
697  .collect(Collectors.joining("\n"))
698  ));
699  }
708  private boolean extractStringsAndIndex(AbstractFile aFile) {
709  try {
710  if (context.fileIngestIsCancelled()) {
711  return true;
712  }
713  Reader extractedTextReader = KeywordSearchUtil.getReader(aFile, stringsExtractionContext);
714  Ingester.getDefault().search(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context, false, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
715  putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
716  } catch (Exception ex) {
717  logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
718  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
719  return false;
720  }
721  return true;
722  }
734  private void searchFile(Optional<TextExtractor> extractor, AbstractFile aFile, String mimeType, boolean indexContent) {
735  //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
737  TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
745  if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
746  || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
747  || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase("txt"))) {
748  if (context.fileIngestIsCancelled()) {
749  return;
750  }
751  extractStringsAndIndex(aFile);
752  return;
753  }
755  final long size = aFile.getSize();
756  //if not to index content, or a dir, or 0 content, index meta data only
758  if ((indexContent == false || aFile.isDir() || size == 0)) {
759  try {
760  if (context.fileIngestIsCancelled()) {
761  return;
762  }
763  ingester.indexMetaDataOnly(aFile);
764  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
765  } catch (IngesterException ex) {
766  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
767  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
768  }
769  return;
770  }
772  if (context.fileIngestIsCancelled()) {
773  return;
774  }
776  // we skip archive formats that are opened by the archive module.
777  // @@@ We could have a check here to see if the archive module was enabled though...
778  if (ARCHIVE_MIME_TYPES.contains(mimeType)) {
779  try {
780  if (context.fileIngestIsCancelled()) {
781  return;
782  }
783  ingester.indexMetaDataOnly(aFile);
784  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
785  } catch (IngesterException ex) {
786  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
787  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
788  }
789  return;
790  }
792  boolean wasTextAdded = false;
793  Map<String, String> extractedMetadata = new HashMap<>();
795  //extract text with one of the extractors, divide into chunks and index with Solr
796  try {
797  //logger.log(Level.INFO, "indexing: " + aFile.getName());
798  if (context.fileIngestIsCancelled()) {
799  return;
800  }
801  if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
802  extractStringsAndIndex(aFile);
803  return;
804  }
805  if (!extractTextAndSearch(extractor, aFile, extractedMetadata)) {
806  // Text extractor not found for file. Extract string only.
807  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
808  } else {
809  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
810  wasTextAdded = true;
811  }
813  } catch (IngesterException e) {
814  logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
815  + aFile.getName(), e);
816  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
817  } catch (Exception e) {
818  logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
819  + aFile.getName(), e);
820  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
821  }
823  if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
824  //Carved Files should be the only type of unallocated files capable of a txt extension and
825  //should be ignored by the TextFileExtractor because they may contain more than one text encoding
826  wasTextAdded = searchTextFile(aFile);
827  }
829  // if it wasn't supported or had an error, default to strings
830  if (wasTextAdded == false) {
831  extractStringsAndIndex(aFile);
832  }
834  // Now that the indexing is complete, create the metadata artifact (if applicable).
835  // It is unclear why calling this from extractTextAndIndex() generates
836  // errors.
837  if (!extractedMetadata.isEmpty()) {
838  createMetadataArtifact(aFile, extractedMetadata);
839  }
840  }
848  private boolean searchTextFile(AbstractFile aFile) {
849  try {
850  TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
851  Reader textReader = textFileExtractor.getReader();
852  if (textReader == null) {
853  logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
854  } else {
855  Ingester.getDefault().search(textReader, aFile.getId(), aFile.getName(), aFile, context, true, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
856  textReader.close();
857  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
858  return true;
859  }
860  } catch (Exception ex) {
861  logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
862  }
863  return false;
864  }
866 }
Reader getTikaOrTextExtractor(Optional< TextExtractor > extractorOptional, AbstractFile aFile, Map< String, String > extractedMetadata)
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
void searchFile(Optional< TextExtractor > extractor, AbstractFile aFile, String mimeType, boolean indexContent)
void createMetadataArtifact(AbstractFile aFile, Map< String, String > metadata)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
boolean extractTextAndSearch(Optional< TextExtractor > extractorOptional, AbstractFile aFile, Map< String, String > extractedMetadata)
static TextExtractor getExtractor(Content content, Lookup context)
void postMessage(final IngestMessage message)
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
Optional< TextExtractor > getExtractor(AbstractFile abstractFile)
static void warn(String title, String message)
static synchronized IngestServices getInstance()
Text was extracted by knowing file type and text_ingested.

Copyright © 2012-2022 Basis Technology. Generated on: Tue Feb 6 2024
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.