Autopsy 4.21.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2023 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import static java.util.Locale.US;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.FileSystem;
import org.apache.tika.metadata.IPTC;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.Photoshop;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMP;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.mime.MimeTypes;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
import org.openide.util.lookup.Lookups;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.ingest.FileIngestModule;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestMessage;
import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Blackboard;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown;
import org.sleuthkit.datamodel.TskException;

@NbBundle.Messages({
    "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
    "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
    "SolrConnectionCheck.Port=Invalid port number.",
    "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
    "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
    "CannotRunFileTypeDetection=Unable to run file type detection."
})
public final class KeywordSearchIngestModule implements FileIngestModule {

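    /**
     * Minimum size, in bytes, that an image must be for OCR to be performed
     * on it when the limited OCR setting is enabled (see isLimitedOCRFile).
     */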
    private static final int LIMITED_OCR_SIZE_MIN = 100 * 1024;

    static final List<String> ARCHIVE_MIME_TYPES
            = ImmutableList.of(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

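    /**
     * Maps Tika metadata keys to blackboard attribute types. Each metadata
     * key is paired with a priority index; when several keys map to the same
     * attribute type, the merge function in the collector below keeps the
     * entry with the lowest index (highest priority).
     */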
    private static final Map<String, Pair<BlackboardAttribute.ATTRIBUTE_TYPE, Integer>> METADATA_TYPES_MAP = Stream.of(
            Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED, ImmutableList.of(
                    "Last-Save-Date",
                    TikaCoreProperties.MODIFIED.getName(),
                    FileSystem.MODIFIED.getName(),
                    DublinCore.MODIFIED.getName(),
                    PDF.DOC_INFO_MODIFICATION_DATE.getName(),
                    PDF.PDFVT_MODIFIED.getName(),
                    XMP.MODIFY_DATE.getName(),
                    XMPDM.AUDIO_MOD_DATE.getName(),
                    XMPDM.METADATA_MOD_DATE.getName(),
                    XMPDM.VIDEO_MOD_DATE.getName())),
            Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID, ImmutableList.of(
                    "Last-Author",
                    Office.LAST_AUTHOR.getName(),
                    TikaCoreProperties.MODIFIER.getName())),
            Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED, ImmutableList.of(
                    "Creation-Date",
                    TikaCoreProperties.CREATED.getName(),
                    FileSystem.CREATED.getName(),
                    DublinCore.CREATED.getName(),
                    IPTC.DATE_CREATED.getName(),
                    Office.CREATION_DATE.getName(),
                    PDF.DOC_INFO_CREATED.getName(),
                    Photoshop.DATE_CREATED.getName(),
                    XMP.CREATE_DATE.getName())),
            Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION, ImmutableList.of(
                    "Company",
                    DublinCore.PUBLISHER.getName(),
                    IPTC.ORGANISATION_NAME.getName(),
                    OfficeOpenXMLExtended.COMPANY.getName())),
            Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER, ImmutableList.of(
                    "Author",
                    TikaCoreProperties.CREATOR.getName(),
                    DublinCore.CREATOR.getName(),
                    Office.INITIAL_AUTHOR.getName(),
                    Office.AUTHOR.getName(),
                    Photoshop.AUTHORS_POSITION.getName(),
                    PDF.DOC_INFO_CREATOR.getName())),
            Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME, ImmutableList.of(
                    "Application-Name",
                    "Producer",
                    OfficeOpenXMLExtended.APPLICATION.getName(),
                    org.apache.tika.metadata.RTFMetadata.EMB_APP_VERSION.getName())),
            Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME, ImmutableList.of(
                    "Last-Printed",
                    OfficeOpenXMLCore.LAST_PRINTED.getName())),
            Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION, ImmutableList.of(
                    "Title",
                    DublinCore.TITLE.getName(),
                    IPTC.TITLE.getName(),
                    PDF.DOC_INFO_TITLE.getName())),
            Pair.of(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION, ImmutableList.of(
                    PDF.PDF_VERSION.getName(),
                    OfficeOpenXMLCore.VERSION.getName())))
            .flatMap(pr -> {
                BlackboardAttribute.ATTRIBUTE_TYPE attrType = pr.getKey();
                List<String> keys = pr.getValue();
                return IntStream.range(0, keys.size())
                        .mapToObj(idx -> Triple.of(keys.get(idx), attrType, idx));
            })
            .collect(Collectors.toMap(Triple::getLeft, trip -> Pair.of(trip.getMiddle(), trip.getRight()), (v1, v2) -> v1.getRight() < v2.getRight() ? v1 : v2));


    private static final String IMAGE_MIME_TYPE_PREFIX = "image/";

    // document types for which OCR is performed
    private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
            "application/pdf",
            "application/msword",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "application/vnd.ms-powerpoint",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    );

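    /**
     * Keys for the string extraction options map (UTF-8 / UTF-16), used to
     * configure the fallback strings extractor in startUp.
     */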
    enum StringsExtractOptions {
        EXTRACT_UTF16,
        EXTRACT_UTF8,
    };

    private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
    private final IngestServices services = IngestServices.getInstance();
    private Ingester ingester = null;
    private FileTypeDetector fileTypeDetector;
    //only search images from current ingest, not images previously ingested/indexed
    //accessed read-only by searcher thread

    private Lookup stringsExtractionContext;
    private final KeywordSearchJobSettings settings;
    private boolean initialized = false;
    private long jobId;
    private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
    private int instanceNum = 0;
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
    private IngestJobContext context;

    private enum IngestStatus {

        TEXT_INGESTED, ///< Text was extracted by knowing file type and text_ingested
        STRINGS_INGESTED, ///< Strings were extracted from file and text_ingested
        METADATA_INGESTED, ///< No content, so we just text_ingested metadata
        SKIPPED_ERROR_INDEXING, ///< File was skipped because index engine had problems
        SKIPPED_ERROR_TEXTEXTRACT, ///< File was skipped due to text extraction issues
        SKIPPED_ERROR_IO ///< File was skipped because of IO issues reading it
    };
    private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself

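    /**
     * Records the ingest status of a file for a given ingest job, for the
     * summary statistics posted when the job completes.
     *
     * @param ingestJobId the ingest job id
     * @param fileId      the object id of the file
     * @param status      the ingest status of the file
     */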
    private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
            if (ingestStatusForJob == null) {
                ingestStatusForJob = new HashMap<>();
                ingestStatus.put(ingestJobId, ingestStatusForJob);
            }
            ingestStatusForJob.put(fileId, status);
            ingestStatus.put(ingestJobId, ingestStatusForJob);
        }
    }

    KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
        this.settings = settings;
        instanceNum = instanceCount.getAndIncrement();
    }

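    /**
     * Startup checks: verifies that the Solr server and text index are usable
     * (locally or remotely, depending on the case type), that file type
     * detection can run, and warns if indexing is disabled or no keyword
     * lists are enabled. Only the first module instance for a job performs
     * the server checks.
     */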
    @Messages({
        "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
        "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
        "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
        "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
    })
    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        initialized = false;
        jobId = context.getJobId();

        Server server = null;
        if (settings.isIndexToSolrEnabled()) {
            server = KeywordSearch.getServer();
            if (server.coreIsOpen() == false) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
            }

            try {
                Index indexInfo = server.getIndexInfo();
                if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
                }
            } catch (NoOpenCoreException ex) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
            }
        }

        try {
            fileTypeDetector = new FileTypeDetector();
        } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
            throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
        }
328 
329  ingester = Ingester.getDefault();
330  this.context = context;
331 
332  // increment the module reference count
333  // if first instance of this module for this job then check the server and existence of keywords
334  Case openCase;
335  try {
336  openCase = Case.getCurrentCaseThrows();
337  } catch (NoCurrentCaseException ex) {
338  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_noOpenCase_errMsg(), ex);
339  }
340  if (refCounter.incrementAndGet(jobId) == 1) {
341  if (openCase.getCaseType() == Case.CaseType.MULTI_USER_CASE) {
342  // for multi-user cases need to verify connection to remore SOLR server
343  KeywordSearchService kwsService = new SolrSearchService();
345  int port;
346  try {
347  port = Integer.parseInt(properties.getPort());
348  } catch (NumberFormatException ex) {
349  // if there is an error parsing the port number
350  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
351  }
352  try {
353  kwsService.tryConnect(properties.getHost(), port);
354  } catch (KeywordSearchServiceException ex) {
355  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
356  }
357  } else {
358  // for single-user cases need to verify connection to local SOLR service
359  // server will be null if indexing is disabled
360  if (server != null) {
361  try {
362  if (!server.isLocalSolrRunning()) {
363  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
364  }
365  } catch (KeywordSearchModuleException ex) {
366  //this means Solr is not properly initialized
367  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
368  }
369  try {
370  // make an actual query to verify that server is responding
371  // we had cases where getStatus was OK, but the connection resulted in a 404
372  server.queryNumIndexedDocuments();
374  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
375  }
376  }
                // check if this job has any searchable keywords
                List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
                boolean hasKeywordsForSearch = false;
                for (KeywordList keywordList : keywordLists) {
                    if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                        hasKeywordsForSearch = true;
                        break;
                    }
                }

                if (!settings.isIndexToSolrEnabled()) {
                    services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.SolrIndexingDisabled"),
                            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.indexingDisabled")));
                } else {
                    if (!hasKeywordsForSearch) {
                        services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                                NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
                    }
                }
            }
        }

        StringsConfig stringsConfig = new StringsConfig();
        Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
        stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
        stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
        stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());

        stringsExtractionContext = Lookups.fixed(stringsConfig);

        initialized = true;
    }

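    /**
     * Processes a single file: skips virtual directories, applies the OCR
     * settings gates, indexes metadata only for known files (when configured
     * to skip known files), and otherwise extracts, indexes, and searches
     * the file content.
     */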
    @Override
    public ProcessResult process(AbstractFile abstractFile) {
        if (initialized == false) //error initializing indexing/Solr
        {
            logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
            putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            return ProcessResult.OK;
        }

        if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
            //skip indexing of virtual dirs (no content, no real name) - will index children files
            return ProcessResult.OK;
        }

        Optional<TextExtractor> extractorOpt = getExtractor(abstractFile);

        String mimeType = fileTypeDetector.getMIMEType(abstractFile).trim().toLowerCase();

        if (settings.isOCREnabled()) {
            // if OCR-only is enabled and the extractor is absent or will not
            // perform OCR on this file, skip the file
            if (settings.isOCROnly() && (!extractorOpt.isPresent() || !extractorOpt.get().willUseOCR())) {
                return ProcessResult.OK;
            }

            // if limited OCR is enabled, the extractor will use OCR, and the
            // file does not qualify for OCR under the limited OCR setting,
            // skip the file
            if (settings.isLimitedOCREnabled() && extractorOpt.isPresent()
                    && extractorOpt.get().willUseOCR() && !isLimitedOCRFile(abstractFile, mimeType)) {
                return ProcessResult.OK;
            }
        }

        if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
            //index meta-data only
            if (context.fileIngestIsCancelled()) {
                return ProcessResult.OK;
            }
            searchFile(extractorOpt, abstractFile, mimeType, false);
            return ProcessResult.OK;
        }

        //index the file and content (if the content is supported)
        if (context.fileIngestIsCancelled()) {
            return ProcessResult.OK;
        }
        searchFile(extractorOpt, abstractFile, mimeType, true);

        return ProcessResult.OK;
    }

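    /**
     * Shuts down this module instance. The last instance to finish for the
     * job creates the inline search artifacts, commits the index, logs index
     * statistics, and posts the job summary message.
     */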
    @Override
    public void shutDown() {
        logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS

        if ((initialized == false) || (context == null)) {
            return;
        }

        if (context.fileIngestIsCancelled()) {
            logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping due to ingest cancellation", instanceNum); //NON-NLS
            cleanup();
            return;
        }

        // We only need to post the summary msg from the last module per job
        if (refCounter.decrementAndGet(jobId) == 0) {

            try {
                InlineSearcher.makeArtifacts(context);
                InlineSearcher.cleanup(context);
                Ingester.getDefault().commit();
            } catch (TskException ex) {
                logger.log(Level.SEVERE, String.format("Failed to create search ingest artifacts for job %d", context.getJobId()), ex);
            }

            try {
                final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
                logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
                final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
                logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
            } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
                logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
            }
            postIndexSummary();
            synchronized (ingestStatus) {
                ingestStatus.remove(jobId);
            }
        }

        cleanup();
    }

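    /**
     * Releases the per-job extraction context.
     */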
    private void cleanup() {
        stringsExtractionContext = null;
        initialized = false;
    }

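    /**
     * Determines whether a file should still be OCRed when the limited OCR
     * setting is enabled: OCR-able document formats always qualify; images
     * qualify only if they are large enough or derived.
     *
     * @param aFile    the file to check
     * @param mimeType the detected MIME type of the file
     *
     * @return true if the file qualifies for OCR under the limited OCR setting
     */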
    private boolean isLimitedOCRFile(AbstractFile aFile, String mimeType) {
        if (OCR_DOCUMENTS.contains(mimeType)) {
            return true;
        }

        if (mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
            return aFile.getSize() > LIMITED_OCR_SIZE_MIN
                    || aFile.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.DERIVED;
        }

        return false;
    }

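    /**
     * Tallies the per-file ingest statuses for this job and posts an HTML
     * summary table to the ingest inbox, plus error or warning notifications
     * if any files failed.
     */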
    private void postIndexSummary() {
        int text_ingested = 0;
        int metadata_ingested = 0;
        int strings_ingested = 0;
        int error_text = 0;
        int error_index = 0;
        int error_io = 0;

        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
            if (ingestStatusForJob == null) {
                return;
            }
            for (IngestStatus s : ingestStatusForJob.values()) {
                switch (s) {
                    case TEXT_INGESTED:
                        text_ingested++;
                        break;
                    case METADATA_INGESTED:
                        metadata_ingested++;
                        break;
                    case STRINGS_INGESTED:
                        strings_ingested++;
                        break;
                    case SKIPPED_ERROR_TEXTEXTRACT:
                        error_text++;
                        break;
                    case SKIPPED_ERROR_INDEXING:
                        error_index++;
                        break;
                    case SKIPPED_ERROR_IO:
                        error_io++;
                        break;
                    default:
                        ;
                }
            }
        }

        StringBuilder msg = new StringBuilder();
        msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
        msg.append("</table>"); //NON-NLS
        String indexStats = msg.toString();
        logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
        services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
        if (error_index > 0) {
            MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
        } else if (error_io + error_text > 0) {
            MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
        }
    }

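    /**
     * Returns the text extractor for the file, configured with the OCR
     * setting and a cancellation-aware process terminator, or
     * Optional.empty() if no extractor supports the file.
     */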
    private Optional<TextExtractor> getExtractor(AbstractFile abstractFile) {
        ImageConfig imageConfig = new ImageConfig();
        imageConfig.setOCREnabled(settings.isOCREnabled());
        ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
        Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
        try {
            return Optional.ofNullable(TextExtractorFactory.getExtractor(abstractFile, extractionContext));
        } catch (TextExtractorFactory.NoTextExtractorFound ex) {
            // no text extractor was found for this file type
            return Optional.empty();
        }
    }

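    /**
     * Extracts text from the file with the supplied extractor (with any
     * extracted metadata appended to the text), then chunks, indexes, and
     * searches it. The metadata map is saved so the metadata artifact can be
     * created after indexing completes.
     *
     * @return true if text was extracted and submitted, false otherwise
     */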
    private boolean extractTextAndSearch(Optional<TextExtractor> extractorOptional, AbstractFile aFile,
            Map<String, String> extractedMetadata) throws IngesterException {

        try {
            if (!extractorOptional.isPresent()) {
                return false;
            }
            //divide into chunks and index
            Ingester.getDefault().search(getTikaOrTextExtractor(extractorOptional, aFile, extractedMetadata), aFile.getId(), aFile.getName(), aFile, context, true, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());

        } catch (TextExtractor.InitReaderException ex) {
            return false;
        } catch (Exception ex) {
            logger.log(Level.WARNING, String.format("Failed to search file %s [id=%d]",
                    aFile.getName(), aFile.getId()), ex);
            return false;
        }

        return true;
    }

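    /**
     * Opens a reader over the extracted file text with the formatted metadata
     * appended at the end; falls back to the file text alone if the metadata
     * cannot be formatted.
     */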
    private Reader getTikaOrTextExtractor(Optional<TextExtractor> extractorOptional, AbstractFile aFile,
            Map<String, String> extractedMetadata) throws TextExtractor.InitReaderException {

        TextExtractor extractor = extractorOptional.get();
        Reader fileText = extractor.getReader();
        Reader finalReader;
        try {
            Map<String, String> metadata = extractor.getMetadata();
            if (!metadata.isEmpty()) {
                // Creating the metadata artifact here causes occasional problems
                // when indexing the text, so we save the metadata map to
                // use after this method is complete.
                extractedMetadata.putAll(metadata);
            }
            CharSource formattedMetadata = getMetaDataCharSource(metadata);
            //Append the metadata to the end of the file text
            finalReader = CharSource.concat(new CharSource() {
                //Wrap fileText reader for concatenation
                @Override
                public Reader openStream() throws IOException {
                    return fileText;
                }
            }, formattedMetadata).openStream();
        } catch (IOException ex) {
            logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
                    aFile.getName(), aFile.getId()), ex);
            //Just send the file text.
            finalReader = fileText;
        }
        //divide into chunks and index
        return finalReader;
    }

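    /**
     * Creates a TSK_METADATA artifact for the file from the extracted
     * metadata map, keeping only the highest-priority value for each mapped
     * attribute type, and posts it to the blackboard.
     */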
    private void createMetadataArtifact(AbstractFile aFile, Map<String, String> metadata) {

        String moduleName = KeywordSearchIngestModule.class.getName();

        Collection<BlackboardAttribute> attributes = new ArrayList<>();
        Collection<BlackboardArtifact> bbartifacts = new ArrayList<>();

        // For each attribute type, keep the value of the highest-priority
        // (lowest index) metadata key present in this file.
        Map<BlackboardAttribute.ATTRIBUTE_TYPE, Pair<Integer, String>> intermediateMapping = new HashMap<>();
        for (Map.Entry<String, String> entry : metadata.entrySet()) {
            if (entry.getValue() != null) {
                Pair<BlackboardAttribute.ATTRIBUTE_TYPE, Integer> attrPair = METADATA_TYPES_MAP.get(entry.getKey());
                if (attrPair != null && attrPair.getKey() != null && attrPair.getValue() != null) {
                    intermediateMapping.compute(attrPair.getKey(), (k, v) -> {
                        if (v == null || v.getKey() > attrPair.getValue()) {
                            return Pair.of(attrPair.getValue(), entry.getValue());
                        } else {
                            return v;
                        }
                    });
                }
            }
        }

        for (Entry<BlackboardAttribute.ATTRIBUTE_TYPE, Pair<Integer, String>> interEntry : intermediateMapping.entrySet()) {
            BlackboardAttribute attribute = checkAttribute(interEntry.getKey(), interEntry.getValue().getValue());
            if (attribute != null) {
                attributes.add(attribute);
            }
        }

        if (!attributes.isEmpty()) {
            try {
                BlackboardArtifact bbart = aFile.newDataArtifact(BlackboardArtifact.Type.TSK_METADATA, attributes);
                bbartifacts.add(bbart);
            } catch (TskCoreException ex) {
                // Log error and return to continue processing
                logger.log(Level.WARNING, String.format("Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
                return;
            }
            if (!bbartifacts.isEmpty()) {
                try {
                    Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboard().postArtifacts(bbartifacts, moduleName, jobId);
                } catch (NoCurrentCaseException | Blackboard.BlackboardException ex) {
                    // Log error and return to continue processing
                    logger.log(Level.WARNING, String.format("Unable to post blackboard artifacts for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
                    return;
                }
            }
        }
    }

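    /**
     * Builds a blackboard attribute of the given type from a metadata value,
     * parsing date/time values into epoch seconds where the attribute type
     * requires it.
     *
     * @return the attribute, or null if the value is empty or cannot be parsed
     */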
    private BlackboardAttribute checkAttribute(BlackboardAttribute.ATTRIBUTE_TYPE attrType, String value) {
        String moduleName = KeywordSearchIngestModule.class.getName();
        if (attrType != null && !value.isEmpty() && value.charAt(0) != ' ') {
            if (attrType.getValueType() == BlackboardAttribute.TSK_BLACKBOARD_ATTRIBUTE_VALUE_TYPE.DATETIME) {
                SimpleDateFormat metadataDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", US);
                Long metadataDateTime = Long.valueOf(0);
                try {
                    String metadataDate = value.replaceAll("T", " ").replaceAll("Z", "");
                    Date usedDate = metadataDateFormat.parse(metadataDate);
                    metadataDateTime = usedDate.getTime() / 1000;
                    return new BlackboardAttribute(attrType, moduleName, metadataDateTime);
                } catch (ParseException ex) {
                    // log the date/time that could not be parsed and continue on
                    logger.log(Level.WARNING, String.format("Failed to parse date/time %s for metadata attribute %s.", value, attrType == null ? "<null>" : attrType.name()), ex); //NON-NLS
                    return null;
                }
            } else {
                return new BlackboardAttribute(attrType, moduleName, value);
            }
        }

        return null;
    }

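    /**
     * Pretty-prints the extracted metadata as a labeled, sorted key/value
     * block that is appended to the indexed text.
     */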
    @NbBundle.Messages({
        "KeywordSearchIngestModule.metadataTitle=METADATA"
    })
    static CharSource getMetaDataCharSource(Map<String, String> metadata) {
        return CharSource.wrap(new StringBuilder(
                String.format("\n\n------------------------------%s------------------------------\n\n",
                        Bundle.KeywordSearchIngestModule_metadataTitle()))
                .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
                        .map(entry -> entry.getKey() + ": " + entry.getValue())
                        .collect(Collectors.joining("\n"))
                ));
    }

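    /**
     * Extracts raw strings from the file using the configured scripts and
     * encodings, then indexes and searches them. Used as a fallback when no
     * text extractor supports the file.
     *
     * @return true on success or cancellation, false on failure
     */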
    private boolean extractStringsAndIndex(AbstractFile aFile) {
        try {
            if (context.fileIngestIsCancelled()) {
                return true;
            }
            Reader extractedTextReader = KeywordSearchUtil.getReader(aFile, stringsExtractionContext);
            Ingester.getDefault().search(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context, false, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
            putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
        } catch (Exception ex) {
            logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
            putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            return false;
        }
        return true;
    }

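    /**
     * Indexes and searches the given file. Unallocated/unused blocks and
     * carved text files get string extraction; archives and empty or
     * metadata-only cases get metadata indexing; everything else goes through
     * a text extractor, with string extraction as the fallback.
     */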
    private void searchFile(Optional<TextExtractor> extractor, AbstractFile aFile, String mimeType, boolean indexContent) {
        //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());

        TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();

        // Extract unicode strings from unallocated and unused blocks and
        // carved text files. They may contain multiple encodings, which can
        // cause text to be missed by the more specialized text extractors.
        if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
                || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
                || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase("txt"))) {
            if (context.fileIngestIsCancelled()) {
                return;
            }
            extractStringsAndIndex(aFile);
            return;
        }

        final long size = aFile.getSize();
        //if content is not to be indexed, or the file is a dir or has no
        //content, index metadata only

        if ((indexContent == false || aFile.isDir() || size == 0)) {
            try {
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                ingester.indexMetaDataOnly(aFile);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
            } catch (IngesterException ex) {
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
            }
            return;
        }

        if (context.fileIngestIsCancelled()) {
            return;
        }

        // we skip archive formats that are opened by the archive module.
        // @@@ We could have a check here to see if the archive module was enabled though...
        if (ARCHIVE_MIME_TYPES.contains(mimeType)) {
            try {
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                ingester.indexMetaDataOnly(aFile);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
            } catch (IngesterException ex) {
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
            }
            return;
        }

        boolean wasTextAdded = false;
        Map<String, String> extractedMetadata = new HashMap<>();

        //extract text with one of the extractors, divide into chunks and index with Solr
        try {
            //logger.log(Level.INFO, "indexing: " + aFile.getName());
            if (context.fileIngestIsCancelled()) {
                return;
            }
            if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
                extractStringsAndIndex(aFile);
                return;
            }
            if (!extractTextAndSearch(extractor, aFile, extractedMetadata)) {
                // No text extractor was found for the file; fall through to string extraction.
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
            } else {
                putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                wasTextAdded = true;
            }

        } catch (IngesterException e) {
            logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
                    + aFile.getName(), e);
            putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
        } catch (Exception e) {
            logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
                    + aFile.getName(), e);
            putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
        }

        if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
            //Carved files should be the only type of unallocated files capable of a txt extension, and
            //they should be ignored by the TextFileExtractor because they may contain more than one text encoding
            wasTextAdded = searchTextFile(aFile);
        }

        // if it wasn't supported or had an error, default to strings
        if (wasTextAdded == false) {
            extractStringsAndIndex(aFile);
        }

        // Now that the indexing is complete, create the metadata artifact (if applicable).
        // It is unclear why calling this from extractTextAndSearch() generates errors.
        if (!extractedMetadata.isEmpty()) {
            createMetadataArtifact(aFile, extractedMetadata);
        }
    }

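    /**
     * Extracts and searches the text of a plain text file with the
     * encoding-aware TextFileExtractor.
     *
     * @return true if the text was successfully indexed
     */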
    private boolean searchTextFile(AbstractFile aFile) {
        try {
            TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
            Reader textReader = textFileExtractor.getReader();
            if (textReader == null) {
                logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
            } else {
                Ingester.getDefault().search(textReader, aFile.getId(), aFile.getName(), aFile, context, true, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
                textReader.close();
                putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                return true;
            }
        } catch (Exception ex) {
            logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
        }
        return false;
    }
}