Autopsy  4.16.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  *
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
21 import;
22 import;
23 import;
24 import;
25 import;
26 import java.text.ParseException;
27 import java.text.SimpleDateFormat;
28 import java.util.ArrayList;
29 import java.util.Collection;
30 import java.util.Date;
31 import java.util.HashMap;
32 import java.util.List;
33 import static java.util.Locale.US;
34 import java.util.Map;
35 import java.util.concurrent.atomic.AtomicInteger;
36 import java.util.logging.Level;
37 import;
38 import org.apache.tika.mime.MimeTypes;
39 import org.openide.util.Lookup;
40 import org.openide.util.NbBundle;
41 import org.openide.util.NbBundle.Messages;
42 import org.openide.util.lookup.Lookups;
63 import org.sleuthkit.datamodel.AbstractFile;
64 import org.sleuthkit.datamodel.Blackboard;
65 import org.sleuthkit.datamodel.BlackboardArtifact;
66 import org.sleuthkit.datamodel.BlackboardAttribute;
67 import org.sleuthkit.datamodel.TskCoreException;
68 import org.sleuthkit.datamodel.TskData;
69 import org.sleuthkit.datamodel.TskData.FileKnown;
79 @NbBundle.Messages({
80  "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
81  "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
82  "SolrConnectionCheck.Port=Invalid port number.",
83  "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
84  "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
85  "CannotRunFileTypeDetection=Unable to run file type detection."
86 })
87 public final class KeywordSearchIngestModule implements FileIngestModule {
93  private static final List<String> ARCHIVE_MIME_TYPES
94  = ImmutableList.of(
95  //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
96  "application/x-7z-compressed", //NON-NLS
97  "application/x-ace-compressed", //NON-NLS
98  "application/x-alz-compressed", //NON-NLS
99  "application/x-arj", //NON-NLS
100  "application/", //NON-NLS
101  "application/x-cfs-compressed", //NON-NLS
102  "application/x-dgc-compressed", //NON-NLS
103  "application/x-apple-diskimage", //NON-NLS
104  "application/x-gca-compressed", //NON-NLS
105  "application/x-dar", //NON-NLS
106  "application/x-lzx", //NON-NLS
107  "application/x-lzh", //NON-NLS
108  "application/x-rar-compressed", //NON-NLS
109  "application/x-stuffit", //NON-NLS
110  "application/x-stuffitx", //NON-NLS
111  "application/x-gtar", //NON-NLS
112  "application/x-archive", //NON-NLS
113  "application/x-executable", //NON-NLS
114  "application/x-gzip", //NON-NLS
115  "application/zip", //NON-NLS
116  "application/x-zoo", //NON-NLS
117  "application/x-cpio", //NON-NLS
118  "application/x-shar", //NON-NLS
119  "application/x-tar", //NON-NLS
120  "application/x-bzip", //NON-NLS
121  "application/x-bzip2", //NON-NLS
122  "application/x-lzip", //NON-NLS
123  "application/x-lzma", //NON-NLS
124  "application/x-lzop", //NON-NLS
125  "application/x-z", //NON-NLS
126  "application/x-compress"); //NON-NLS
128  private static final List<String> METADATA_DATE_TYPES
129  = ImmutableList.of(
130  "Last-Save-Date", //NON-NLS
131  "Last-Printed", //NON-NLS
132  "Creation-Date"); //NON-NLS
134  private static final Map<String, BlackboardAttribute.ATTRIBUTE_TYPE> METADATA_TYPES_MAP = ImmutableMap.<String, BlackboardAttribute.ATTRIBUTE_TYPE>builder()
135  .put("Last-Save-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_MODIFIED)
136  .put("Last-Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_USER_ID)
137  .put("Creation-Date", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_CREATED)
138  .put("Company", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ORGANIZATION)
139  .put("Author", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_OWNER)
140  .put("Application-Name", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
141  .put("Last-Printed", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_LAST_PRINTED_DATETIME)
142  .put("Producer", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME)
143  .put("Title", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DESCRIPTION)
144  .put("pdf:PDFVersion", BlackboardAttribute.ATTRIBUTE_TYPE.TSK_VERSION)
145  .build();
151  enum StringsExtractOptions {
154  };
/**
 * Named update-frequency settings. Every constant carries one integer value
 * that is handed back by {@link #getTime()}.
 *
 * NOTE(review): the unit of the value is not visible in this file
 * (presumably minutes) - confirm against the code that consumes it.
 */
enum UpdateFrequency {

    FAST(20),
    AVG(10),
    SLOW(5),
    SLOWEST(1),
    NONE(Integer.MAX_VALUE),
    DEFAULT(5);

    /** Value assigned when the constant is declared; never changes. */
    private final int interval;

    UpdateFrequency(int interval) {
        this.interval = interval;
    }

    /**
     * @return the integer value associated with this constant
     */
    int getTime() {
        return interval;
    }
}
174  private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
175  private final IngestServices services = IngestServices.getInstance();
176  private Ingester ingester = null;
177  private Indexer indexer;
179 //only search images from current ingest, not images previously ingested/indexed
180  //accessed read-only by searcher thread
182  private boolean startedSearching = false;
183  private Lookup stringsExtractionContext;
184  private final KeywordSearchJobSettings settings;
185  private boolean initialized = false;
186  private long jobId;
187  private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
188  private int instanceNum = 0;
189  private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
192  private enum IngestStatus {
200  };
201  private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
211  private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
212  synchronized (ingestStatus) {
213  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
214  if (ingestStatusForJob == null) {
215  ingestStatusForJob = new HashMap<>();
216  ingestStatus.put(ingestJobId, ingestStatusForJob);
217  }
218  ingestStatusForJob.put(fileId, status);
219  ingestStatus.put(ingestJobId, ingestStatusForJob);
220  }
221  }
/**
 * Creates a keyword search ingest module instance that runs with the given
 * job settings.
 *
 * @param settings the keyword search settings for the ingest job
 */
KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
    // Take the next value of the shared counter as this instance's number;
    // it is only used to tell instances apart in log messages.
    this.instanceNum = instanceCount.getAndIncrement();
    this.settings = settings;
}
233  @Messages({
234  "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
235  "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
236  "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
237  "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
238  })
239  @Override
240  public void startUp(IngestJobContext context) throws IngestModuleException {
241  initialized = false;
242  jobId = context.getJobId();
244  Server server = KeywordSearch.getServer();
245  if (server.coreIsOpen() == false) {
246  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
247  }
249  try {
250  Index indexInfo = server.getIndexInfo();
251  if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
252  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
253  }
254  if (!indexInfo.isCompatible(IndexFinder.getCurrentSchemaVersion())) {
255  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
256  }
257  } catch (NoOpenCoreException ex) {
258  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
259  }
261  try {
262  fileTypeDetector = new FileTypeDetector();
264  throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
265  }
267  ingester = Ingester.getDefault();
268  this.context = context;
270  // increment the module reference count
271  // if first instance of this module for this job then check the server and existence of keywords
272  Case openCase;
273  try {
274  openCase = Case.getCurrentCaseThrows();
275  } catch (NoCurrentCaseException ex) {
276  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_noOpenCase_errMsg(), ex);
277  }
278  if (refCounter.incrementAndGet(jobId) == 1) {
279  if (openCase.getCaseType() == Case.CaseType.MULTI_USER_CASE) {
280  // for multi-user cases need to verify connection to remote SOLR server
281  KeywordSearchService kwsService = new SolrSearchService();
283  int port;
284  try {
285  port = Integer.parseInt(properties.getPort());
286  } catch (NumberFormatException ex) {
287  // if there is an error parsing the port number
288  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
289  }
290  try {
291  kwsService.tryConnect(properties.getHost(), port);
292  } catch (KeywordSearchServiceException ex) {
293  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
294  }
295  } else {
296  // for single-user cases need to verify connection to local SOLR service
297  try {
298  if (!server.isRunning()) {
299  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
300  }
301  } catch (KeywordSearchModuleException ex) {
302  //this means Solr is not properly initialized
303  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
304  }
305  try {
306  // make an actual query to verify that server is responding
307  // we had cases where getStatus was OK, but the connection resulted in a 404
308  server.queryNumIndexedDocuments();
310  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
311  }
313  // check if this job has any searchable keywords
314  List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
315  boolean hasKeywordsForSearch = false;
316  for (KeywordList keywordList : keywordLists) {
317  if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
318  hasKeywordsForSearch = true;
319  break;
320  }
321  }
322  if (!hasKeywordsForSearch) {
323  services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
324  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
325  }
326  }
327  }
329  StringsConfig stringsConfig = new StringsConfig();
330  Map<String, String> stringsOptions = KeywordSearchSettings.getStringExtractOptions();
331  stringsConfig.setExtractUTF8(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF8.toString())));
332  stringsConfig.setExtractUTF16(Boolean.parseBoolean(stringsOptions.get(StringsExtractOptions.EXTRACT_UTF16.toString())));
333  stringsConfig.setLanguageScripts(KeywordSearchSettings.getStringExtractScripts());
335  stringsExtractionContext = Lookups.fixed(stringsConfig);
337  indexer = new Indexer();
338  initialized = true;
339  }
341  @Override
342  public ProcessResult process(AbstractFile abstractFile) {
343  if (initialized == false) //error initializing indexing/Solr
344  {
345  logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
346  putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
347  return ProcessResult.OK;
348  }
350  if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
351  //skip indexing of virtual dirs (no content, no real name) - will index children files
352  return ProcessResult.OK;
353  }
355  if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
356  //index meta-data only
357  if (context.fileIngestIsCancelled()) {
358  return ProcessResult.OK;
359  }
360  indexer.indexFile(abstractFile, false);
361  return ProcessResult.OK;
362  }
364  //index the file and content (if the content is supported)
365  if (context.fileIngestIsCancelled()) {
366  return ProcessResult.OK;
367  }
368  indexer.indexFile(abstractFile, true);
370  // Start searching if it hasn't started already
371  if (!startedSearching) {
372  if (context.fileIngestIsCancelled()) {
373  return ProcessResult.OK;
374  }
375  List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
376  IngestSearchRunner.getInstance().startJob(context, keywordListNames);
377  startedSearching = true;
378  }
380  return ProcessResult.OK;
381  }
387  @Override
388  public void shutDown() {
389  logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS
391  if ((initialized == false) || (context == null)) {
392  return;
393  }
395  if (context.fileIngestIsCancelled()) {
396  logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum); //NON-NLS
397  IngestSearchRunner.getInstance().stopJob(jobId);
398  cleanup();
399  return;
400  }
402  // Remove from the search list and trigger final commit and final search
403  IngestSearchRunner.getInstance().endJob(jobId);
405  // We only need to post the summary msg from the last module per job
406  if (refCounter.decrementAndGet(jobId) == 0) {
407  try {
408  final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
409  logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
410  final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
411  logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
413  logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
414  }
415  postIndexSummary();
416  synchronized (ingestStatus) {
417  ingestStatus.remove(jobId);
418  }
419  }
421  cleanup();
422  }
427  private void cleanup() {
428  stringsExtractionContext = null;
429  initialized = false;
430  }
435  private void postIndexSummary() {
436  int text_ingested = 0;
437  int metadata_ingested = 0;
438  int strings_ingested = 0;
439  int error_text = 0;
440  int error_index = 0;
441  int error_io = 0;
443  synchronized (ingestStatus) {
444  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
445  if (ingestStatusForJob == null) {
446  return;
447  }
448  for (IngestStatus s : ingestStatusForJob.values()) {
449  switch (s) {
450  case TEXT_INGESTED:
451  text_ingested++;
452  break;
454  metadata_ingested++;
455  break;
457  strings_ingested++;
458  break;
460  error_text++;
461  break;
463  error_index++;
464  break;
466  error_io++;
467  break;
468  default:
469  ;
470  }
471  }
472  }
474  StringBuilder msg = new StringBuilder();
475  msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
476  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
477  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
478  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
479  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
480  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
481  msg.append("</table>"); //NON-NLS
482  String indexStats = msg.toString();
483  logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
484  services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
485  if (error_index > 0) {
486  MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
487  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
488  } else if (error_io + error_text > 0) {
489  MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
490  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
491  }
492  }
498  private class Indexer {
500  private final Logger logger = Logger.getLogger(Indexer.class.getName());
515  private boolean extractTextAndIndex(AbstractFile aFile, Map<String, String> extractedMetadata) throws IngesterException {
516  ImageConfig imageConfig = new ImageConfig();
517  imageConfig.setOCREnabled(KeywordSearchSettings.getOcrOption());
518  ProcessTerminator terminator = () -> context.fileIngestIsCancelled();
519  Lookup extractionContext = Lookups.fixed(imageConfig, terminator);
521  try {
522  TextExtractor extractor = TextExtractorFactory.getExtractor(aFile, extractionContext);
523  Reader fileText = extractor.getReader();
525  Reader finalReader;
526  try {
527  Map<String, String> metadata = extractor.getMetadata();
528  if (!metadata.isEmpty()) {
529  // Creating the metadata artifact here causes occasional problems
530  // when indexing the text, so we save the metadata map to
531  // use after this method is complete.
532  extractedMetadata.putAll(metadata);
533  }
534  CharSource formattedMetadata = getMetaDataCharSource(metadata);
535  //Append the metadata to end of the file text
536  finalReader = CharSource.concat(new CharSource() {
537  //Wrap fileText reader for concatenation
538  @Override
539  public Reader openStream() throws IOException {
540  return fileText;
541  }
542  }, formattedMetadata).openStream();
543  } catch (IOException ex) {
544  logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
545  aFile.getName(), aFile.getId()), ex);
546  //Just send file text.
547  finalReader = fileText;
548  }
549  //divide into chunks and index
550  return Ingester.getDefault().indexText(finalReader, aFile.getId(), aFile.getName(), aFile, context);
552  //No text extractor found... run the default instead
553  return false;
554  }
555  }
557  private void createMetadataArtifact(AbstractFile aFile, Map<String, String> metadata) {
559  String moduleName = KeywordSearchIngestModule.class.getName();
561  Collection<BlackboardAttribute> attributes = new ArrayList<>();
562  Collection<BlackboardArtifact> bbartifacts = new ArrayList<>();
563  for (Map.Entry<String, String> entry : metadata.entrySet()) {
564  if (METADATA_TYPES_MAP.containsKey(entry.getKey())) {
565  BlackboardAttribute bba = checkAttribute(entry.getKey(), entry.getValue());
566  if (bba != null) {
567  attributes.add(bba);
568  }
569  }
570  }
571  if (!attributes.isEmpty()) {
572  try {
573  BlackboardArtifact bbart = aFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_METADATA);
574  bbart.addAttributes(attributes);
575  bbartifacts.add(bbart);
576  } catch (TskCoreException ex) {
577  // Log error and return to continue processing
578  logger.log(Level.WARNING, String.format("Error creating or adding metadata artifact for file %s.", aFile.getParentPath() + aFile.getName()), ex); //NON-NLS
579  return;
580  }
581  if (!bbartifacts.isEmpty()) {
582  try{
583  Case.getCurrentCaseThrows().getSleuthkitCase().getBlackboard().postArtifacts(bbartifacts, moduleName);
584  } catch (NoCurrentCaseException | Blackboard.BlackboardException ex) {
585  // Log error and return to continue processing
586  logger.log(Level.WARNING, String.format("Unable to post blackboard artifacts for file $s.", aFile.getParentPath() + aFile.getName()) , ex); //NON-NLS
587  return;
588  }
589  }
590  }
591  }
594  private BlackboardAttribute checkAttribute(String key, String value) {
595  String moduleName = KeywordSearchIngestModule.class.getName();
596  if (!value.isEmpty() && value.charAt(0) != ' ') {
597  if (METADATA_DATE_TYPES.contains(key)) {
598  SimpleDateFormat metadataDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", US);
599  Long metadataDateTime = Long.valueOf(0);
600  try {
601  String metadataDate = value.replaceAll("T"," ").replaceAll("Z", "");
602  Date usedDate = metadataDateFormat.parse(metadataDate);
603  metadataDateTime = usedDate.getTime()/1000;
604  return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, metadataDateTime);
605  } catch (ParseException ex) {
606  // catching error and displaying date that could not be parsed then will continue on.
607  logger.log(Level.WARNING, String.format("Failed to parse date/time %s for metadata attribute %s.", value, key), ex); //NON-NLS
608  return null;
609  }
610  } else {
611  return new BlackboardAttribute(METADATA_TYPES_MAP.get(key), moduleName, value);
612  }
613  }
615  return null;
617  }
627  @NbBundle.Messages({
628  "KeywordSearchIngestModule.metadataTitle=METADATA"
629  })
630  private CharSource getMetaDataCharSource(Map<String, String> metadata) {
631  return CharSource.wrap(new StringBuilder(
632  String.format("\n\n------------------------------%s------------------------------\n\n",
633  Bundle.KeywordSearchIngestModule_metadataTitle()))
634  .append(metadata.entrySet().stream().sorted(Map.Entry.comparingByKey())
635  .map(entry -> entry.getKey() + ": " + entry.getValue())
636  .collect(Collectors.joining("\n"))
637  ));
638  }
648  private boolean extractStringsAndIndex(AbstractFile aFile) {
649  try {
650  if (context.fileIngestIsCancelled()) {
651  return true;
652  }
653  TextExtractor stringsExtractor = TextExtractorFactory.getStringsExtractor(aFile, stringsExtractionContext);
654  Reader extractedTextReader = stringsExtractor.getReader();
655  if (Ingester.getDefault().indexText(extractedTextReader, aFile.getId(), aFile.getName(), aFile, KeywordSearchIngestModule.this.context)) {
656  putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
657  return true;
658  } else {
659  logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
660  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
661  return false;
662  }
663  } catch (IngesterException | TextExtractor.InitReaderException ex) {
664  logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
665  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
666  return false;
667  }
668  }
677  private void indexFile(AbstractFile aFile, boolean indexContent) {
678  //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
680  TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
689  if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
690  || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))
691  || (aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED) && aFile.getNameExtension().equalsIgnoreCase("txt"))) {
692  if (context.fileIngestIsCancelled()) {
693  return;
694  }
695  extractStringsAndIndex(aFile);
696  return;
697  }
699  final long size = aFile.getSize();
700  //if not to index content, or a dir, or 0 content, index meta data only
702  if ((indexContent == false || aFile.isDir() || size == 0)) {
703  try {
704  if (context.fileIngestIsCancelled()) {
705  return;
706  }
707  ingester.indexMetaDataOnly(aFile);
708  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
709  } catch (IngesterException ex) {
710  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
711  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
712  }
713  return;
714  }
716  if (context.fileIngestIsCancelled()) {
717  return;
718  }
719  String fileType = fileTypeDetector.getMIMEType(aFile);
721  // we skip archive formats that are opened by the archive module.
722  // @@@ We could have a check here to see if the archive module was enabled though...
723  if (ARCHIVE_MIME_TYPES.contains(fileType)) {
724  try {
725  if (context.fileIngestIsCancelled()) {
726  return;
727  }
728  ingester.indexMetaDataOnly(aFile);
729  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
730  } catch (IngesterException ex) {
731  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
732  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
733  }
734  return;
735  }
737  boolean wasTextAdded = false;
738  Map<String, String> extractedMetadata = new HashMap<>();
740  //extract text with one of the extractors, divide into chunks and index with Solr
741  try {
742  //logger.log(Level.INFO, "indexing: " + aFile.getName());
743  if (context.fileIngestIsCancelled()) {
744  return;
745  }
746  if (fileType.equals(MimeTypes.OCTET_STREAM)) {
747  extractStringsAndIndex(aFile);
748  return;
749  }
750  if (!extractTextAndIndex(aFile, extractedMetadata)) {
751  // Text extractor not found for file. Extract string only.
752  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
753  } else {
754  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
755  wasTextAdded = true;
756  }
758  } catch (IngesterException e) {
759  logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
760  + aFile.getName(), e);
761  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
762  } catch (Exception e) {
763  logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
764  + aFile.getName(), e);
765  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
766  }
768  if ((wasTextAdded == false) && (aFile.getNameExtension().equalsIgnoreCase("txt") && !(aFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.CARVED)))) {
769  //Carved Files should be the only type of unallocated files capable of a txt extension and
770  //should be ignored by the TextFileExtractor because they may contain more than one text encoding
771  wasTextAdded = indexTextFile(aFile);
772  }
774  // if it wasn't supported or had an error, default to strings
775  if (wasTextAdded == false) {
776  extractStringsAndIndex(aFile);
777  }
779  // Now that the indexing is complete, create the metadata artifact (if applicable).
780  // It is unclear why calling this from extractTextAndIndex() generates
781  // errors.
782  if (!extractedMetadata.isEmpty()) {
783  createMetadataArtifact(aFile, extractedMetadata);
784  }
785  }
793  private boolean indexTextFile(AbstractFile aFile) {
794  try {
795  TextFileExtractor textFileExtractor = new TextFileExtractor(aFile);
796  Reader textReader = textFileExtractor.getReader();
797  if (textReader == null) {
798  logger.log(Level.INFO, "Unable to extract with TextFileExtractor, Reader was null for file: {0}", aFile.getName());
799  } else if (Ingester.getDefault().indexText(textReader, aFile.getId(), aFile.getName(), aFile, context)) {
800  textReader.close();
801  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
802  return true;
803  }
804  } catch (IngesterException | IOException | TextExtractor.InitReaderException ex) {
805  logger.log(Level.WARNING, "Unable to index " + aFile.getName(), ex);
806  }
807  return false;
808  }
809  }
810 }
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
boolean extractTextAndIndex(AbstractFile aFile, Map< String, String > extractedMetadata)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
static TextExtractor getStringsExtractor(Content content, Lookup context)
static TextExtractor getExtractor(Content content, Lookup context)
void createMetadataArtifact(AbstractFile aFile, Map< String, String > metadata)
void postMessage(final IngestMessage message)
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static void warn(String title, String message)
static synchronized IngestServices getInstance()
Text was extracted by knowing file type and text_ingested.

Copyright © 2012-2020 Basis Technology. Generated on: Tue Sep 22 2020
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.