Autopsy 4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2015 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.core.UserPreferences;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.ingest.FileIngestModule;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestMessage;
import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown;
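/**
 * A file-level ingest module that indexes allocated, Solr-supported files,
 * extracts and indexes strings from unallocated space and unsupported file
 * types, and periodically runs keyword and regular expression searches on the
 * indexed text during ingest, posting results to the blackboard and the
 * ingest inbox.
 */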
@NbBundle.Messages({
    "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
    "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
    "SolrConnectionCheck.Port=Invalid port number.",
    "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
    "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
    "CannotRunFileTypeDetection=Unable to run file type detection."
})
public final class KeywordSearchIngestModule implements FileIngestModule {

    enum UpdateFrequency {

        FAST(20),
        AVG(10),
        SLOW(5),
        SLOWEST(1),
        NONE(Integer.MAX_VALUE),
        DEFAULT(5);

        private final int time;

        UpdateFrequency(int time) {
            this.time = time;
        }

        int getTime() {
            return time;
        }
    }
    private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
    private final IngestServices services = IngestServices.getInstance();
    private Ingester ingester = null;
    private Indexer indexer;
    private FileTypeDetector fileTypeDetector;
    //only search images from current ingest, not images previously ingested/indexed
    //accessed read-only by searcher thread
    private boolean startedSearching = false;
    private List<TextExtractor> textExtractors;
    private StringsTextExtractor stringExtractor;
    private final KeywordSearchJobSettings settings;
    private boolean initialized = false;
    private long jobId;
    private long dataSourceId;
    private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
    private int instanceNum = 0;
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
    private IngestJobContext context;

    private enum IngestStatus {

        TEXT_INGESTED, //text was extracted by knowing the file type and indexed
        STRINGS_INGESTED, //raw strings were extracted and indexed
        METADATA_INGESTED, //no content; only metadata was indexed
        SKIPPED_ERROR_INDEXING, //skipped due to an indexing error
        SKIPPED_ERROR_TEXTEXTRACT, //skipped due to a text extraction error
        SKIPPED_ERROR_IO //skipped due to an I/O error reading the file
    }

    private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself

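    /**
     * Records the ingest status of a file for a given ingest job. Access to
     * the shared status map is synchronized on the map itself.
     *
     * @param ingestJobId the id of the ingest job
     * @param fileId      the id of the file
     * @param status      the ingest status of the file
     */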
    private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
            if (ingestStatusForJob == null) {
                ingestStatusForJob = new HashMap<>();
                ingestStatus.put(ingestJobId, ingestStatusForJob);
            }
            ingestStatusForJob.put(fileId, status);
        }
    }

    KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
        this.settings = settings;
        instanceNum = instanceCount.getAndIncrement();
    }

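    /**
     * Initializes the module for a new ingest job. Verifies that the Solr
     * server is running and the index core is open, warns if no enabled
     * keyword list has keywords, and sets up the text extractors.
     *
     * @throws IngestModuleException if initialization fails
     */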
    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        initialized = false;
        jobId = context.getJobId();
        dataSourceId = context.getDataSource().getId();

        Server server = KeywordSearch.getServer();
        if (server.coreIsOpen() == false) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
        }

        try {
            fileTypeDetector = new FileTypeDetector();
        } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
            throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
        }
        ingester = Server.getIngester();
        this.context = context;

        // increment the module reference count
        // if first instance of this module for this job then check the server and existence of keywords
        if (refCounter.incrementAndGet(jobId) == 1) {
            if (UserPreferences.getIsMultiUserModeEnabled()) {
                // for multi-user cases need to verify connection to remote SOLR server
                KeywordSearchService kwsService = new SolrSearchService();
                int port;
                try {
                    port = Integer.parseInt(UserPreferences.getIndexingServerPort());
                } catch (NumberFormatException ex) {
                    // if there is an error parsing the port number
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
                }
                try {
                    kwsService.tryConnect(UserPreferences.getIndexingServerHost(), port);
                } catch (KeywordSearchServiceException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
                }
            } else {
                // for single-user cases need to verify connection to local SOLR service
                try {
                    if (!server.isRunning()) {
                        throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
                    }
                } catch (KeywordSearchModuleException ex) {
                    //this means Solr is not properly initialized
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
                }
                try {
                    // make an actual query to verify that server is responding
                    // we had cases where getStatus was OK, but the connection resulted in a 404
                    server.queryNumIndexedDocuments();
                } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
                }

                // check if this job has any searchable keywords
                List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
                boolean hasKeywordsForSearch = false;
                for (KeywordList keywordList : keywordLists) {
                    if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                        hasKeywordsForSearch = true;
                        break;
                    }
                }
                if (!hasKeywordsForSearch) {
                    services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
                }
            }
        }

        //initialize extractors
        stringExtractor = new StringsTextExtractor();
        stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
        stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());

        textExtractors = new ArrayList<>();
        //order matters, more specific extractors first
        textExtractors.add(new HtmlTextExtractor());
        textExtractors.add(new TikaTextExtractor());

        indexer = new Indexer();
        initialized = true;
    }

    @Override
    public ProcessResult process(AbstractFile abstractFile) {
        if (initialized == false) { //error initializing indexing/Solr
            logger.log(Level.WARNING, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
            putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            return ProcessResult.OK;
        }

        if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
            //skip indexing of virtual dirs (no content, no real name) - will index children files
            return ProcessResult.OK;
        }

        if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
            //index meta-data only
            if (context.fileIngestIsCancelled()) {
                return ProcessResult.OK;
            }
            indexer.indexFile(abstractFile, false);
            return ProcessResult.OK;
        }

        //index the file and content (if the content is supported)
        if (context.fileIngestIsCancelled()) {
            return ProcessResult.OK;
        }
        indexer.indexFile(abstractFile, true);

        // Start searching if it hasn't started already
        if (!startedSearching) {
            if (context.fileIngestIsCancelled()) {
                return ProcessResult.OK;
            }
            List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
            SearchRunner.getInstance().startJob(jobId, dataSourceId, keywordListNames);
            startedSearching = true;
        }

        return ProcessResult.OK;
    }

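    /**
     * Cleans up at the end of the ingest job: ends the search job, posts the
     * indexing summary from the last module instance for the job, and logs
     * index statistics.
     */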
    @Override
    public void shutDown() {
        logger.log(Level.INFO, "Instance {0}", instanceNum); //NON-NLS

        if ((initialized == false) || (context == null)) {
            return;
        }

        if (context.fileIngestIsCancelled()) {
            stop();
            return;
        }

        // Remove from the search list and trigger final commit and final search
        SearchRunner.getInstance().endJob(jobId);

        // We only need to post the summary msg from the last module per job
        if (refCounter.decrementAndGet(jobId) == 0) {
            postIndexSummary();
            synchronized (ingestStatus) {
                ingestStatus.remove(jobId);
            }
        }

        //log number of files / chunks in index
        //signal a potential change in number of text_ingested files
        try {
            final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
            final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
            logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
            logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
        } catch (NoOpenCoreException | KeywordSearchModuleException ex) {
            logger.log(Level.WARNING, "Error executing Solr query to check number of indexed files/chunks: ", ex); //NON-NLS
        }

        cleanup();
    }

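    /**
     * Handles cancellation of the ingest job: stops the search job and
     * releases resources.
     */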
    private void stop() {
        logger.log(Level.INFO, "stop()"); //NON-NLS

        SearchRunner.getInstance().stopJob(jobId);

        cleanup();
    }

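    /**
     * Releases the text extractors and resets the module state.
     */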
    private void cleanup() {
        textExtractors.clear();
        textExtractors = null;
        stringExtractor = null;

        initialized = false;
    }

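    /**
     * Posts a summary message to the ingest inbox with the per-job counts of
     * text-indexed, strings-indexed, metadata-only, and error files.
     */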
    private void postIndexSummary() {
        int text_ingested = 0;
        int metadata_ingested = 0;
        int strings_ingested = 0;
        int error_text = 0;
        int error_index = 0;
        int error_io = 0;

        synchronized (ingestStatus) {
            Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
            if (ingestStatusForJob == null) {
                return;
            }
            for (IngestStatus s : ingestStatusForJob.values()) {
                switch (s) {
                    case TEXT_INGESTED:
                        text_ingested++;
                        break;
                    case METADATA_INGESTED:
                        metadata_ingested++;
                        break;
                    case STRINGS_INGESTED:
                        strings_ingested++;
                        break;
                    case SKIPPED_ERROR_TEXTEXTRACT:
                        error_text++;
                        break;
                    case SKIPPED_ERROR_INDEXING:
                        error_index++;
                        break;
                    case SKIPPED_ERROR_IO:
                        error_io++;
                        break;
                    default:
                        break;
                }
            }
        }

        StringBuilder msg = new StringBuilder();
        msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
        msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
        msg.append("</table>"); //NON-NLS
        String indexStats = msg.toString();
        logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
        services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
        if (error_index > 0) {
            MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
        } else if (error_io + error_text > 0) {
            MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
                    NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
        }
    }

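    /**
     * Performs the indexing work for a file: chooses a text extractor,
     * extracts the text, divides it into chunks, and submits the chunks to
     * Solr, falling back to raw string extraction where needed.
     */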
    private class Indexer {

        private final Logger logger = Logger.getLogger(Indexer.class.getName());

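        /**
         * Extracts text from the given file with the first (most specific)
         * extractor that supports its detected format, then chunks and
         * indexes the text.
         *
         * @param aFile          the file to index
         * @param detectedFormat the detected MIME type of the file
         *
         * @return true if text was extracted and indexed, false if no
         *         extractor supports the file
         *
         * @throws IngesterException if indexing fails
         */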
        private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
            TextExtractor fileExtract = null;

            //go over available text extractors in order, and pick the first one (most specific one)
            for (TextExtractor fe : textExtractors) {
                if (fe.isSupported(aFile, detectedFormat)) {
                    fileExtract = fe;
                    break;
                }
            }

            if (fileExtract == null) {
                logger.log(Level.INFO, "No text extractor found for file id:{0}, name: {1}, detected format: {2}", new Object[]{aFile.getId(), aFile.getName(), detectedFormat}); //NON-NLS
                return false;
            }

            //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
            //divide into chunks and index
            return fileExtract.index(aFile, context);
        }

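        /**
         * Extracts raw strings from the given file and indexes the chunks.
         *
         * @param aFile the file to index
         *
         * @return true if the strings were indexed or ingest was cancelled,
         *         false if extraction or indexing failed
         */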
        private boolean extractStringsAndIndex(AbstractFile aFile) {
            try {
                if (context.fileIngestIsCancelled()) {
                    return true;
                }
                if (stringExtractor.index(aFile, KeywordSearchIngestModule.this.context)) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
                    return true;
                } else {
                    logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                    return false;
                }
            } catch (IngesterException ex) {
                logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                return false;
            }
        }

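        /**
         * Checks whether any content-type-specific extractor supports the
         * file with the detected format.
         *
         * @param aFile          the file to check
         * @param detectedFormat the detected MIME type of the file
         *
         * @return true if a content-type-specific extractor supports the file
         */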
        private boolean isTextExtractSupported(AbstractFile aFile, String detectedFormat) {
            for (TextExtractor extractor : textExtractors) {
                if (extractor.isContentTypeSpecific() == true
                        && extractor.isSupported(aFile, detectedFormat)) {
                    return true;
                }
            }
            return false;
        }

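        /**
         * Adds the file to the Solr index: metadata only for directories,
         * empty files, and archives; extracted text where a supported
         * extractor exists; otherwise raw strings.
         *
         * @param aFile        the file to index
         * @param indexContent false to index only the file metadata
         */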
        private void indexFile(AbstractFile aFile, boolean indexContent) {
            //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());

            TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();

            // unallocated and unused blocks can only have strings extracted from them.
            if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                extractStringsAndIndex(aFile);
                return;
            }

            final long size = aFile.getSize();
            //if not to index content, or a dir, or 0 content, index meta data only

            if ((indexContent == false || aFile.isDir() || size == 0)) {
                try {
                    if (context.fileIngestIsCancelled()) {
                        return;
                    }
                    ingester.ingest(aFile, false); //meta-data only
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
                } catch (IngesterException ex) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                    logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
                }
                return;
            }

            String fileType;
            try {
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                fileType = fileTypeDetector.getFileType(aFile);
            } catch (TskCoreException ex) {
                logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS
                return;
            }

            // we skip archive formats that are opened by the archive module.
            // @@@ We could have a check here to see if the archive module was enabled though...
            if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
                try {
                    if (context.fileIngestIsCancelled()) {
                        return;
                    }
                    ingester.ingest(aFile, false); //meta-data only
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
                } catch (IngesterException ex) {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
                    logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
                }
                return;
            }

            boolean wasTextAdded = false;

            //extract text with one of the extractors, divide into chunks and index with Solr
            try {
                //logger.log(Level.INFO, "indexing: " + aFile.getName());
                if (context.fileIngestIsCancelled()) {
                    return;
                }
                if (fileType.equals("application/octet-stream")) {
                    extractStringsAndIndex(aFile);
                    return;
                }
                if (!extractTextAndIndex(aFile, fileType)) {
                    logger.log(Level.WARNING, "Text extractor not found for file. Extracting strings only. File: ''{0}'' (id:{1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
                } else {
                    putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
                    wasTextAdded = true;
                }

            } catch (IngesterException e) {
                logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
                        + aFile.getName(), e);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
            } catch (Exception e) {
                logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
                        + aFile.getName(), e);
                putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
            }

            // if it wasn't supported or had an error, default to strings
            if (wasTextAdded == false) {
                extractStringsAndIndex(aFile);
            }
        }
    }
}