Autopsy 4.20.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2023 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.CharSource;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import static java.util.Locale.US;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.tika.mime.MimeTypes;
import org.openide.util.Lookup;
import org.openide.util.NbBundle;
import org.openide.util.NbBundle.Messages;
import org.openide.util.lookup.Lookups;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.ingest.FileIngestModule;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestMessage;
import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.autopsy.textextractors.TextExtractor;
import org.sleuthkit.autopsy.textextractors.TextExtractorFactory;
import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;
import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Blackboard;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown;
import org.sleuthkit.datamodel.TskException;

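/**
 * A file-level ingest module that indexes allocated files with supported
 * formats, extracts and indexes strings from unallocated and unsupported
 * files, and runs keyword / regular expression searches on the processed
 * files, posting results to the blackboard.
 */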
@NbBundle.Messages({
    "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
    "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
    "SolrConnectionCheck.Port=Invalid port number.",
    "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
    "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
    "CannotRunFileTypeDetection=Unable to run file type detection."
})
public final class KeywordSearchIngestModule implements FileIngestModule {

    private static final int LIMITED_OCR_SIZE_MIN = 100 * 1024;

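    /**
     * Archive and raw binary MIME types. Text extractors should generally
     * ignore these and let the unpacking modules take care of them.
     */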
    static final List<String> ARCHIVE_MIME_TYPES
            = ImmutableList.of(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

    private static final List<String> METADATA_DATE_TYPES
            = ImmutableList.of(
                    "Last-Save-Date", //NON-NLS
                    "Last-Printed", //NON-NLS
                    "Creation-Date"); //NON-NLS

    private static final Map<String, BlackboardAttribute.ATTRIBUTE_TYPE> METADATA_TYPES_MAP = ImmutableMap.<String, BlackboardAttribute.ATTRIBUTE_TYPE>builder()
            .put("Last-Save-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED)
            .put("Last-Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID)
            .put("Creation-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED)
            .put("Company", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION)
            .put("Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER)
            .put("Application-Name", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
            .put("Last-Printed", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME)
            .put("Producer", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
            .put("Title", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION)
            .put("pdf:PDFVersion", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION)
            .build();

    private static final String IMAGE_MIME_TYPE_PREFIX = "image/";

    // documents where OCR is performed
    private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
            "application/pdf",
            "application/msword",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "application/vnd.ms-powerpoint",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    );

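    /**
     * Options for the strings extractor.
     */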
    enum StringsExtractOptions {
        EXTRACT_UTF16,
        EXTRACT_UTF8,
    };

    private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
    private final IngestServices services = IngestServices.getInstance();
    private Ingester ingester = null;
    private FileTypeDetector fileTypeDetector;
    //only search images from current ingest, not images previously ingested/indexed
    //accessed read-only by searcher thread

    private Lookup stringsExtractionContext;
    private final KeywordSearchJobSettings settings;
    private boolean initialized = false;
    private long jobId;
    private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
    private int instanceNum = 0;
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
    private IngestJobContext context;

    private enum IngestStatus {

        TEXT_INGESTED,
        METADATA_INGESTED,
        STRINGS_INGESTED,
        SKIPPED_ERROR_TEXTEXTRACT,
        SKIPPED_ERROR_INDEXING,
        SKIPPED_ERROR_IO
    };
    private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself

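    /**
     * Records the ingest status of a file for a given ingest job. Used for
     * the summary statistics posted at the end of the job.
     *
     * @param ingestJobId id of the ingest job
     * @param fileId      id of the file
     * @param status      ingest status of the file
     */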
    private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
            if (ingestStatusForJob == null) {
                ingestStatusForJob = new HashMap<>();
                ingestStatus.put(ingestJobId, ingestStatusForJob);
            }
            ingestStatusForJob.put(fileId, status);
            ingestStatus.put(ingestJobId, ingestStatusForJob);
        }
    }

    KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
        this.settings = settings;
        instanceNum = instanceCount.getAndIncrement();
    }

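    /**
     * Initializes the module for a new ingest job: verifies the Solr server
     * connection and text index, and configures string extraction.
     *
     * @throws IngestModuleException if the keyword search services cannot be
     *                               reached or the text index is unusable
     */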
    @Messages({
        "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
        "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
        "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
        "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
    })
    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        initialized = false;
        jobId = context.getJobId();

        Server server = null;
        if (settings.isIndexToSolrEnabled()) {
            server = KeywordSearch.getServer();
            if (server.coreIsOpen() == false) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
            }

            try {
                Index indexInfo = server.getIndexInfo();
                if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
                }
            } catch (NoOpenCoreException ex) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
            }
        }

        try {
            fileTypeDetector = new FileTypeDetector();
        } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
            throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
        }

        ingester = Ingester.getDefault();
        this.context = context;

        // increment the module reference count
        // if first instance of this module for this job then check the server and existence of keywords
        Case openCase;
        try {
            openCase = Case.getCurrentCaseThrows();
        } catch (NoCurrentCaseException ex) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_noOpenCase_errMsg(), ex);
        }
        if (refCounter.incrementAndGet(jobId) == 1) {
            if (openCase.getCaseType() == Case.CaseType.MULTI_USER_CASE) {
                // for multi-user cases need to verify connection to remote SOLR server
                KeywordSearchService kwsService = new SolrSearchService();
                Server.IndexingServerProperties properties = Server.getMultiUserServerProperties(openCase.getCaseDirectory());
                int port;
                try {
                    port = Integer.parseInt(properties.getPort());
                } catch (NumberFormatException ex) {
                    // if there is an error parsing the port number
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
                }
                try {
                    kwsService.tryConnect(properties.getHost(), port);
                } catch (KeywordSearchServiceException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
                }
            } else {
                // for single-user cases need to verify connection to local SOLR service
                // server will be null if indexing is disabled
                if (server != null) {
                    try {
                        if (!server.isLocalSolrRunning()) {
                            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
                        }
                    } catch (KeywordSearchModuleException ex) {
                        //this means Solr is not properly initialized
                        throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
                    }
                    try {
                        // make an actual query to verify that server is responding
                        // we had cases where getStatus was OK, but the connection resulted in a 404
                        server.queryNumIndexedDocuments();
                    } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
                        throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
                    }
                }
                // check if this job has any searchable keywords
                List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
                boolean hasKeywordsForSearch = false;
                for (KeywordList keywordList : keywordLists) {
                    if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                        hasKeywordsForSearch = true;
                        break;
                    }
                }

                if (!settings.isIndexToSolrEnabled()) {
                    services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.SolrIndexingDisabled"),
                            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.indexingDisabled")));
                } else {
                    if (!hasKeywordsForSearch) {
                        services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                                NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
                    }
                }
            }
        }

        StringsConfig stringsConfig = new StringsConfig();
        Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
        stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
        stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
        stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());

        stringsExtractionContext = Lookups.fixed(stringsConfig);

        initialized = true;
    }

    @Override
    public ProcessResult process(AbstractFile abstractFile) {
        if (initialized == false) //error initializing indexing/Solr
        {
            logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
            putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            return ProcessResult.OK;
        }

        if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
            //skip indexing of virtual dirs (no content, no real name) - will index children files
            return ProcessResult.OK;
        }

        // if ocr only is enabled and not an ocr file, return
        Optional<TextExtractor> extractorOpt = getExtractor(abstractFile);

        String mimeType = fileTypeDetector.getMIMEType(abstractFile).trim().toLowerCase();

        if (settings.isOCREnabled()) {
            // if ocr only and the extractor is not present or will not perform ocr on this file, continue
            if (settings.isOCROnly() && (!extractorOpt.isPresent() || !extractorOpt.get().willUseOCR())) {
                return ProcessResult.OK;
            }

            // if limited ocr is enabled, the extractor will use ocr, and
            // the file would not be subject to limited ocr reading, continue
            if (settings.isLimitedOCREnabled() && extractorOpt.isPresent()
                    && extractorOpt.get().willUseOCR() && !isLimitedOCRFile(abstractFile, mimeType)) {
                return ProcessResult.OK;
            }
        }

        if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
            //index meta-data only
            if (context.fileIngestIsCancelled()) {
                return ProcessResult.OK;
            }
            searchFile(extractorOpt, abstractFile, mimeType, false);
            return ProcessResult.OK;
        }

        //index the file and content (if the content is supported)
        if (context.fileIngestIsCancelled()) {
            return ProcessResult.OK;
        }
        searchFile(extractorOpt, abstractFile, mimeType, true);

        return ProcessResult.OK;
    }

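    /**
     * After the last module instance for the job finishes, creates the final
     * keyword hit artifacts, commits the index, and posts a summary message.
     */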
    @Override
    public void shutDown() {
        logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS

        if ((initialized == false) || (context == null)) {
            return;
        }

        if (context.fileIngestIsCancelled()) {
            logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping due to ingest cancellation", instanceNum); //NON-NLS
            cleanup();
            return;
        }

        // We only need to post the summary msg from the last module per job
        if (refCounter.decrementAndGet(jobId) == 0) {

            try {
                InlineSearcher.makeArtifacts(context);
                InlineSearcher.cleanup(context);
                Ingester.getDefault().commit();
            } catch (TskException ex) {
                logger.log(Level.SEVERE, String.format("Failed to create search ingest artifacts for job %d", context.getJobId()), ex);
            }

            try {
                final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
                logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
                final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
                logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
            } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
                logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
            }
            postIndexSummary();
            synchronized (ingestStatus) {
                ingestStatus.remove(jobId);
            }
        }

        cleanup();
    }

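    /**
     * Releases the string extraction context and marks the module as
     * uninitialized.
     */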
    private void cleanup() {
        stringsExtractionContext = null;
        initialized = false;
    }

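    /**
     * Determines whether OCR should still be performed on the file when the
     * limited OCR setting is enabled: document types where OCR is always run,
     * and images that are larger than LIMITED_OCR_SIZE_MIN or are derived
     * files.
     *
     * @param aFile    the file to check
     * @param mimeType the detected MIME type of the file
     *
     * @return true if the file qualifies for OCR under the limited setting
     */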
    private boolean isLimitedOCRFile(AbstractFile aFile, String mimeType) {
        if (OCR_DOCUMENTS.contains(mimeType)) {
            return true;
        }

        if (mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
            return aFile.getSize() > LIMITED_OCR_SIZE_MIN
                    || aFile.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.DERIVED;
        }

        return false;
    }

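    /**
     * Posts an ingest inbox message with a summary of indexed files and
     * indexing errors for the job.
     */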
    private void postIndexSummary() {
        int text_ingested = 0;
        int metadata_ingested = 0;
        int strings_ingested = 0;
        int error_text = 0;
        int error_index = 0;
        int error_io = 0;

        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
            if (ingestStatusForJob == null) {
                return;
            }
            for (IngestStatus s : ingestStatusForJob.values()) {
                switch (s) {
                    case TEXT_INGESTED:
                        text_ingested++;
                        break;
                    case METADATA_INGESTED:
                        metadata_ingested++;
                        break;
                    case STRINGS_INGESTED:
                        strings_ingested++;
                        break;
                    case SKIPPED_ERROR_TEXTEXTRACT:
                        error_text++;
                        break;
                    case SKIPPED_ERROR_INDEXING:
                        error_index++;
                        break;
                    case SKIPPED_ERROR_IO:
                        error_io++;
                        break;
                    default:
                        ;
                }
            }
        }

        StringBuilder msg = new StringBuilder();
        msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
        msg.append("</table>"); //NON-NLS
        String indexStats = msg.toString();
        logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
        services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
        if (error_index > 0) {
            MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
        } else if (error_io + error_text > 0) {
            MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
        }
    }

    private Optional<TextExtractor> getExtractor(AbstractFile abstractFile) {
        ImageConfig imageConfig = new ImageConfig();
        imageConfig.setOCREnabled(settings.isOCREnabled());
        ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
        Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
        try {
            return Optional.ofNullable(TextExtractorFactory.getExtractor(abstractFile, extractionContext));
        } catch (TextExtractorFactory.NoTextExtractorFound ex) {
            return Optional.empty();
        }
    }

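    /**
     * Extracts text from the given file with the supplied extractor, divides
     * it into chunks, and passes it on for indexing and keyword search.
     *
     * @param extractorOptional the text extractor to use, if one was found
     * @param aFile             the file from which to extract text
     * @param extractedMetadata map that receives the extractor's metadata
     *
     * @return true if text was extracted and searched, false otherwise
     *
     * @throws IngesterException if there was a problem indexing the file
     */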
    private boolean extractTextAndSearch(Optional<TextExtractor> extractorOptional, AbstractFile aFile,
            Map<String, String> extractedMetadata) throws IngesterException {

        try {
            if (!extractorOptional.isPresent()) {
                return false;
            }
            //divide into chunks and index
            Ingester.getDefault().search(getTikaOrTextExtractor(extractorOptional, aFile, extractedMetadata), aFile.getId(), aFile.getName(), aFile, context, true, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());

        } catch (TextExtractor.InitReaderException ex) {
            return false;
        } catch (Exception ex) {
            logger.log(Level.WARNING, String.format("Failed to search file %s [id=%d]",
                    aFile.getName(), aFile.getId()), ex);
            return false;
        }

        return true;
    }

    private Reader getTikaOrTextExtractor(Optional<TextExtractor> extractorOptional, AbstractFile aFile,
            Map<String, String> extractedMetadata) throws TextExtractor.InitReaderException {

        TextExtractor extractor = extractorOptional.get();
        Reader fileText = extractor.getReader();
        Reader finalReader;
        try {
            Map<String, String> metadata = extractor.getMetadata();
            if (!metadata.isEmpty()) {
                // Creating the metadata artifact here causes occasional problems
                // when indexing the text, so we save the metadata map to
                // use after this method is complete.
                extractedMetadata.putAll(metadata);
            }
            CharSource formattedMetadata = getMetaDataCharSource(metadata);
            //Append the metadata to end of the file text
            finalReader = CharSource.concat(new CharSource() {
                //Wrap fileText reader for concatenation
                @Override
                public Reader openStream() throws IOException {
                    return fileText;
                }
            }, formattedMetadata).openStream();
        } catch (IOException ex) {
            logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
                    aFile.getName(), aFile.getId()), ex);
            //Just send file text.
            finalReader = fileText;
        }
        //divide into chunks and index
        return finalReader;
    }

    private void createMetadataArtifact(AbstractFile aFile, Map<String, String> metadata) {

        String moduleName = KeywordSearchIngestModule.class.getName();

        Collection<BlackboardAttribute> attributes = new ArrayList<>();
        Collection<BlackboardArtifact> bbartifacts = new ArrayList<>();
        for (Map.Entry<String, String> entry : metadata.entrySet()) {
            if (METADATA_TYPES_MAP.containsKey(entry.getKey())) {
                BlackboardAttribute bba = checkAttribute(entry.getKey(), entry.getValue());
                if (bba != null) {
                    attributes.add(bba);
                }
            }
        }
        if (!attributes.isEmpty()) {
            try {
                BlackboardArtifact bbart = aFile.newDataArtifact(new BlackboardArtifact.Type(BlackboardArtifact.ARTIFACT_TYPE.TSK_METADATA), attributes);
                bbartifacts.add(bbart);
            } catch (TskCoreException ex) {
                // Log error and return to continue processing
                logger.log(Level.WARNING, String.format("Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
                return;
            }
            if (!bbartifacts.isEmpty()) {
                try {
                    Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboard().postArtifacts(bbartifacts, moduleName, jobId);
                } catch (NoCurrentCaseException | Blackboard.BlackboardException ex) {
                    // Log error and return to continue processing
                    logger.log(Level.WARNING, String.format("Unable to post blackboard artifacts for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
                    return;
                }
            }
        }
    }

    private BlackboardAttribute checkAttribute(String key, String value) {
        String moduleName = KeywordSearchIngestModule.class.getName();
        if (!value.isEmpty() && value.charAt(0) != ' ') {
            if (METADATA_DATE_TYPES.contains(key)) {
                SimpleDateFormat metadataDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", US);
                Long metadataDateTime = Long.valueOf(0);
                try {
                    String metadataDate = value.replaceAll("T", " ").replaceAll("Z", "");
                    Date usedDate = metadataDateFormat.parse(metadataDate);
                    metadataDateTime = usedDate.getTime() / 1000;
                    return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, metadataDateTime);
                } catch (ParseException ex) {
                    // log the date that could not be parsed and continue on
                    logger.log(Level.WARNING, String.format("Failed to parse date/time %s for metadata attribute %s.", value, key), ex); //NON-NLS
                    return null;
                }
            } else {
                return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, value);
            }
        }

        return null;
    }

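    /**
     * Pretty-prints the extracted metadata as a sorted key: value block,
     * wrapped in a CharSource so it can be appended to the file text.
     *
     * @param metadata the metadata map from the extractor
     *
     * @return a CharSource containing the formatted metadata
     */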
    @NbBundle.Messages({
        "KeywordSearchIngestModule.metadataTitle=METADATA"
    })
    static CharSource getMetaDataCharSource(Map<String, String> metadata) {
        return CharSource.wrap(new StringBuilder(
                String.format("\n\n------------------------------%s------------------------------\n\n",
                        Bundle.KeywordSearchIngestModule_metadataTitle()))
                .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
                        .map(entry -> entry.getKey() + ": " + entry.getValue())
                        .collect(Collectors.joining("\n"))
                ));
    }

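    /**
     * Extracts raw strings from the file and indexes them. Used as a fallback
     * when no text extractor supports the file's type.
     *
     * @param aFile the file from which to extract strings
     *
     * @return true if the strings were extracted and indexed (or ingest was
     *         cancelled), false on error
     */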
    private boolean extractStringsAndIndex(AbstractFile aFile) {
        try {
            if (context.fileIngestIsCancelled()) {
                return true;
            }
            Reader extractedTextReader = KeywordSearchUtil.getReader(aFile, stringsExtractionContext);
            Ingester.getDefault().search(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context, false, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
            putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
        } catch (Exception ex) {
            logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
            putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            return false;
        }
        return true;
    }

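    /**
     * Adds the file to the index, extracting its text with one of the
     * available extractors where possible, and runs the keyword search.
     *
     * @param extractor    the text extractor to use, if one was found
     * @param aFile        the file to index
     * @param mimeType     the detected MIME type of the file
     * @param indexContent false if only metadata should be indexed
     */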
    private void searchFile(Optional<TextExtractor> extractor, AbstractFile aFile, String mimeType, boolean indexContent) {
        //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());

        TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();

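        /*
         * Extract raw strings from unallocated and unused blocks and from
         * carved text files. These may contain multiple text encodings,
         * which could cause text to be missed by the more specialized
         * extractors used below.
         */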
        if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
                || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
                || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase("txt"))) {
            if (context.fileIngestIsCancelled()) {
                return;
            }
            extractStringsAndIndex(aFile);
            return;
        }

        final long size = aFile.getSize();
        //if not to index content, or a dir, or 0 content, index meta data only

        if ((indexContent == false || aFile.isDir() || size == 0)) {
            try {
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                ingester.indexMetaDataOnly(aFile);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
            } catch (IngesterException ex) {
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
            }
            return;
        }

        if (context.fileIngestIsCancelled()) {
            return;
        }

        // we skip archive formats that are opened by the archive module.
        // @@@ We could have a check here to see if the archive module was enabled though...
        if (ARCHIVE_MIME_TYPES.contains(mimeType)) {
            try {
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                ingester.indexMetaDataOnly(aFile);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
            } catch (IngesterException ex) {
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
            }
            return;
        }

        boolean wasTextAdded = false;
        Map<String, String> extractedMetadata = new HashMap<>();

        //extract text with one of the extractors, divide into chunks and index with Solr
        try {
            //logger.log(Level.INFO, "indexing: " + aFile.getName());
            if (context.fileIngestIsCancelled()) {
                return;
            }
            if (MimeTypes.OCTET_STREAM.equals(mimeType)) {
                extractStringsAndIndex(aFile);
                return;
            }
            if (!extractTextAndSearch(extractor, aFile, extractedMetadata)) {
                // Text extractor not found for file. Extract strings only.
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
            } else {
                putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                wasTextAdded = true;
            }

        } catch (IngesterException e) {
            logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
                    + aFile.getName(), e);
            putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
        } catch (Exception e) {
            logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
                    + aFile.getName(), e);
            putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
        }

        if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
            //Carved Files should be the only type of unallocated files capable of a txt extension and
            //should be ignored by the TextFileExtractor because they may contain more than one text encoding
            wasTextAdded = searchTextFile(aFile);
        }

        // if it wasn't supported or had an error, default to strings
        if (wasTextAdded == false) {
            extractStringsAndIndex(aFile);
        }

        // Now that the indexing is complete, create the metadata artifact (if applicable).
        // It is unclear why calling this from extractTextAndIndex() generates
        // errors.
        if (!extractedMetadata.isEmpty()) {
            createMetadataArtifact(aFile, extractedMetadata);
        }
    }

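    /**
     * Indexes the file as a text file using the TextFileExtractor, which
     * detects the file's encoding.
     *
     * @param aFile the text file to be indexed
     *
     * @return true if the file was indexed successfully, false otherwise
     */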
    private boolean searchTextFile(AbstractFile aFile) {
        try {
            TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
            Reader textReader = textFileExtractor.getReader();
            if (textReader == null) {
                logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
            } else {
                Ingester.getDefault().search(textReader, aFile.getId(), aFile.getName(), aFile, context, true, settings.isIndexToSolrEnabled(), settings.getNamesOfEnabledKeyWordLists());
                textReader.close();
                putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                return true;
            }
        } catch (Exception ex) {
            logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
        }
        return false;
    }

}
Copyright © 2012-2022 Basis Technology. Generated on: Tue Aug 1 2023
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.