Autopsy  4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2016 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.util.ArrayList;
22 import java.util.HashMap;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.concurrent.atomic.AtomicInteger;
26 import java.util.logging.Level;
27 import org.openide.util.NbBundle;
28 import org.openide.util.NbBundle.Messages;
47 
56 @NbBundle.Messages({
57  "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
58  "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
59  "SolrConnectionCheck.Port=Invalid port number.",
60  "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
61  "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
62  "CannotRunFileTypeDetection=Unable to run file type detection."
63 })
64 public final class KeywordSearchIngestModule implements FileIngestModule {
65 
66  enum UpdateFrequency {
67 
68  FAST(20),
69  AVG(10),
70  SLOW(5),
71  SLOWEST(1),
72  NONE(Integer.MAX_VALUE),
73  DEFAULT(5);
74  private final int time;
75 
76  UpdateFrequency(int time) {
77  this.time = time;
78  }
79 
80  int getTime() {
81  return time;
82  }
83  };
 // NOTE(review): the leading numbers on these lines and the gaps in them
 // (e.g. original lines 88 and 102 are absent) are artifacts of the
 // doc-generator extraction; verify this section against the original file.
 // Line 88 presumably declared the fileTypeDetector field and line 102 the
 // IngestJobContext context field — both are assigned in startUp() below but
 // their declarations are not visible here. TODO confirm.
84  private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
 // Ingest framework services singleton, used to post user-visible messages.
85  private final IngestServices services = IngestServices.getInstance();
 // Set in startUp(); sends file text/metadata to the Solr index.
86  private Ingester ingester = null;
 // Per-instance helper that performs the actual per-file indexing work.
87  private Indexer indexer;
89 //only search images from current ingest, not images previously ingested/indexed
90  //accessed read-only by searcher thread
91 
 // True once the first content-indexed file has triggered the search job.
92  private boolean startedSearching = false;
 // Content extractors, ordered most-specific first (see startUp()).
93  private List<FileTextExtractor> textExtractors;
94  private StringsTextExtractor stringExtractor;
 // Per-job settings (enabled keyword lists) supplied by the factory.
95  private final KeywordSearchJobSettings settings;
 // Guards process()/shutDown() against use before startUp() completed.
96  private boolean initialized = false;
97  private long jobId;
98  private long dataSourceId;
99  private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
100  private int instanceNum = 0;
 // Counts module instances per job so first/last-instance work runs once.
101  private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
103 
 // Per-file outcome of indexing, recorded per job and tallied in
 // postIndexSummary().
104  private enum IngestStatus {
105 
 // NOTE(review): extraction dropped original lines 106-110; the switch in
 // postIndexSummary() shows the missing constants are TEXT_INGESTED,
 // METADATA_INGESTED, STRINGS_INGESTED, SKIPPED_ERROR_TEXTEXTRACT and
 // SKIPPED_ERROR_INDEXING — verify against the original file.
111  SKIPPED_ERROR_IO
112  };
 // Maps ingest job ID -> (file ID -> status). All access synchronizes on
 // this map itself, including the nested per-job maps.
113  private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
114 
123  private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
124  synchronized (ingestStatus) {
125  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
126  if (ingestStatusForJob == null) {
127  ingestStatusForJob = new HashMap<>();
128  ingestStatus.put(ingestJobId, ingestStatusForJob);
129  }
130  ingestStatusForJob.put(fileId, status);
131  ingestStatus.put(ingestJobId, ingestStatusForJob);
132  }
133  }
134 
 /**
  * Constructs a file-level ingest module instance with the given per-job
  * settings. The instance number is taken from a shared counter and is used
  * only for logging (see shutDown()).
  *
  * @param settings per-job keyword search settings (enabled keyword lists)
  */
135  KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
136  this.settings = settings;
137  instanceNum = instanceCount.getAndIncrement();
138  }
139 
 // Initializes the module for an ingest job: validates the Solr core/index
 // versions, sets up the file type detector and ingester, verifies the Solr
 // connection (first module instance of the job only), warns if no keyword
 // lists are enabled, and builds the text extractors. Throws
 // IngestModuleException on any failure, which aborts module startup.
 // NOTE(review): the extraction dropped several original lines from this
 // method (175, 185, 196, 214) — each gap is flagged below; verify against
 // the original source file.
145  @Messages({
146  "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
147  "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
148  "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index."
149  })
150  @Override
151  public void startUp(IngestJobContext context) throws IngestModuleException {
152  initialized = false;
153  jobId = context.getJobId();
154  dataSourceId = context.getDataSource().getId();
155 
 // A core must already be open before this module can run.
156  Server server = KeywordSearch.getServer();
157  if (server.coreIsOpen() == false) {
158  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
159  }
160 
 // Refuse to add text to an index created by a different Solr/schema version.
161  try {
162  Index indexInfo = server.getIndexInfo();
163  if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
164  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
165  }
166  if (!IndexFinder.getCurrentSchemaVersion().equals(indexInfo.getSchemaVersion())) {
167  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
168  }
169  } catch (NoOpenCoreException ex) {
170  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
171  }
172 
173  try {
174  fileTypeDetector = new FileTypeDetector();
 // NOTE(review): original line 175 was dropped by extraction — it is
 // presumably the catch clause for the file type detector's init
 // exception, matching the throw below. TODO confirm.
176  throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
177  }
178 
179  ingester = Ingester.getDefault();
180  this.context = context;
181 
182  // increment the module reference count
183  // if first instance of this module for this job then check the server and existence of keywords
184  if (refCounter.incrementAndGet(jobId) == 1) {
 // NOTE(review): original line 185 was dropped by extraction — given the
 // multi-user/single-user comments below, it presumably opens a condition
 // distinguishing multi-user cases from the single-user else branch at
 // line 200. TODO confirm.
186  // for multi-user cases need to verify connection to remote SOLR server
187  KeywordSearchService kwsService = new SolrSearchService();
188  int port;
189  try {
190  port = Integer.parseInt(UserPreferences.getIndexingServerPort());
191  } catch (NumberFormatException ex) {
192  // if there is an error parsing the port number
193  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
194  }
195  try {
 // NOTE(review): original line 196 was dropped by extraction — it
 // presumably attempts the connection via kwsService using the parsed
 // port, since the catch below handles KeywordSearchServiceException.
 // TODO confirm.
197  } catch (KeywordSearchServiceException ex) {
198  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
199  }
200  } else {
201  // for single-user cases need to verify connection to local SOLR service
202  try {
203  if (!server.isRunning()) {
204  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
205  }
206  } catch (KeywordSearchModuleException ex) {
207  //this means Solr is not properly initialized
208  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
209  }
210  try {
211  // make an actual query to verify that server is responding
212  // we had cases where getStatus was OK, but the connection resulted in a 404
213  server.queryNumIndexedDocuments();
 // NOTE(review): original line 214 was dropped by extraction — it is
 // presumably the catch clause matching the throw below. TODO confirm.
215  throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
216  }
217 
218  // check if this job has any searchable keywords
219  List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
220  boolean hasKeywordsForSearch = false;
221  for (KeywordList keywordList : keywordLists) {
222  if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
223  hasKeywordsForSearch = true;
224  break;
225  }
226  }
 // Warn (but do not fail) when no enabled list has keywords: files will
 // still be indexed so they can be searched later.
227  if (!hasKeywordsForSearch) {
228  services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
229  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
230  }
231  }
232  }
233 
234  //initialize extractors
235  stringExtractor = new StringsTextExtractor();
236  stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
237  stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
238 
239  textExtractors = new ArrayList<>();
240  //order matters, more specific extractors first
241  textExtractors.add(new HtmlTextExtractor());
242  textExtractors.add(new TikaTextExtractor());
243 
244  indexer = new Indexer();
245  initialized = true;
246  }
247 
248  @Override
249  public ProcessResult process(AbstractFile abstractFile) {
250  if (initialized == false) //error initializing indexing/Solr
251  {
252  logger.log(Level.WARNING, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
253  putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
254  return ProcessResult.OK;
255  }
256 
257  if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
258  //skip indexing of virtual dirs (no content, no real name) - will index children files
259  return ProcessResult.OK;
260  }
261 
262  if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
263  //index meta-data only
264  if (context.fileIngestIsCancelled()) {
265  return ProcessResult.OK;
266  }
267  indexer.indexFile(abstractFile, false);
268  return ProcessResult.OK;
269  }
270 
271  //index the file and content (if the content is supported)
272  if (context.fileIngestIsCancelled()) {
273  return ProcessResult.OK;
274  }
275  indexer.indexFile(abstractFile, true);
276 
277  // Start searching if it hasn't started already
278  if (!startedSearching) {
279  if (context.fileIngestIsCancelled()) {
280  return ProcessResult.OK;
281  }
282  List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
283  SearchRunner.getInstance().startJob(jobId, dataSourceId, keywordListNames);
284  startedSearching = true;
285  }
286 
287  return ProcessResult.OK;
288  }
289 
 // Per-job teardown: the last module instance of the job posts the index
 // summary and clears the job's status records; every instance then logs
 // index counts and releases its extractors via cleanup().
 // NOTE(review): extraction dropped original lines 308 and 325 — flagged
 // below; verify against the original source file.
294  @Override
295  public void shutDown() {
296  logger.log(Level.INFO, "Instance {0}", instanceNum); //NON-NLS
297 
 // Nothing to tear down if startUp() never completed.
298  if ((initialized == false) || (context == null)) {
299  return;
300  }
301 
 // Cancelled jobs take the abbreviated stop() path (no summary, no counts).
302  if (context.fileIngestIsCancelled()) {
303  stop();
304  return;
305  }
306 
307  // Remove from the search list and trigger final commit and final search
 // NOTE(review): original line 308 was dropped by extraction — it is
 // presumably the statement performing the removal/final search described
 // by the comment above. TODO confirm.
309 
310  // We only need to post the summary msg from the last module per job
311  if (refCounter.decrementAndGet(jobId) == 0) {
312  postIndexSummary();
313  synchronized (ingestStatus) {
314  ingestStatus.remove(jobId);
315  }
316  }
317 
318  //log number of files / chunks in index
319  //signal a potential change in number of text_ingested files
320  try {
321  final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
322  final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
323  logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
324  logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
 // NOTE(review): original line 325 was dropped by extraction — it is
 // presumably the catch clause (for the query exceptions) matching the
 // warning logged below. TODO confirm.
326  logger.log(Level.WARNING, "Error executing Solr query to check number of indexed files/chunks: ", ex); //NON-NLS
327  }
328 
329  cleanup();
330  }
331 
 // Abbreviated teardown used when the job was cancelled: skips the summary
 // and index-count logging done by the normal shutDown() path.
335  private void stop() {
336  logger.log(Level.INFO, "stop()"); //NON-NLS
337 
 // NOTE(review): original line 338 was dropped by extraction — it
 // presumably stops or deregisters this job's search work before the
 // cleanup below. TODO confirm.
339 
340  cleanup();
341  }
342 
346  private void cleanup() {
347  textExtractors.clear();
348  textExtractors = null;
349  stringExtractor = null;
350 
351  initialized = false;
352  }
353 
357  private void postIndexSummary() {
358  int text_ingested = 0;
359  int metadata_ingested = 0;
360  int strings_ingested = 0;
361  int error_text = 0;
362  int error_index = 0;
363  int error_io = 0;
364 
365  synchronized (ingestStatus) {
366  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
367  if (ingestStatusForJob == null) {
368  return;
369  }
370  for (IngestStatus s : ingestStatusForJob.values()) {
371  switch (s) {
372  case TEXT_INGESTED:
373  text_ingested++;
374  break;
375  case METADATA_INGESTED:
376  metadata_ingested++;
377  break;
378  case STRINGS_INGESTED:
379  strings_ingested++;
380  break;
381  case SKIPPED_ERROR_TEXTEXTRACT:
382  error_text++;
383  break;
384  case SKIPPED_ERROR_INDEXING:
385  error_index++;
386  break;
387  case SKIPPED_ERROR_IO:
388  error_io++;
389  break;
390  default:
391  ;
392  }
393  }
394  }
395 
396  StringBuilder msg = new StringBuilder();
397  msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
398  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
399  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
400  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
401  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
402  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
403  msg.append("</table>"); //NON-NLS
404  String indexStats = msg.toString();
405  logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
406  services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
407  if (error_index > 0) {
408  MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
409  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
410  } else if (error_io + error_text > 0) {
411  MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
412  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
413  }
414  }
415 
 // Helper that performs the per-file indexing work for this module: picks a
 // text extractor, extracts text or strings, and records each file's
 // IngestStatus. Non-static inner class: it reads the enclosing module's
 // context, jobId, ingester and extractor fields.
420  private class Indexer {
421 
422  private final Logger logger = Logger.getLogger(Indexer.class.getName());
423 
437  private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
438  FileTextExtractor extractor = null;
439 
440  //go over available text extractors in order, and pick the first one (most specific one)
441  for (FileTextExtractor fe : textExtractors) {
442  if (fe.isSupported(aFile, detectedFormat)) {
443  extractor = fe;
444  break;
445  }
446  }
447 
448  if (extractor == null) {
449  logger.log(Level.INFO, "No text extractor found for file id:{0}, name: {1}, detected format: {2}", new Object[]{aFile.getId(), aFile.getName(), detectedFormat}); //NON-NLS
450  return false;
451  }
452 
453  //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
454  //divide into chunks and index
455  return Ingester.getDefault().indexText(extractor, aFile, context);
456  }
457 
466  private boolean extractStringsAndIndex(AbstractFile aFile) {
467  try {
468  if (context.fileIngestIsCancelled()) {
469  return true;
470  }
471  if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
472  putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
473  return true;
474  } else {
475  logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
476  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
477  return false;
478  }
479  } catch (IngesterException ex) {
480  logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
481  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
482  return false;
483  }
484  }
485 
 // Indexes a single file: strings-only for special block types, metadata-only
 // for directories/empty files/archives or when content indexing is off,
 // otherwise full text extraction with a strings fallback. Each outcome is
 // recorded via putIngestStatus for the job summary.
 // NOTE(review): extraction dropped original line 499 — flagged below;
 // verify against the original source file.
493  private void indexFile(AbstractFile aFile, boolean indexContent) {
494  //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
495 
496  TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
497 
498  // unallocated and unused blocks can only have strings extracted from them.
 // NOTE(review): original line 499 was dropped by extraction — per the
 // comment above it presumably opens an if testing aType for the
 // unallocated/unused block types; its closing brace is at line 505 below.
 // TODO confirm.
500  if (context.fileIngestIsCancelled()) {
501  return;
502  }
503  extractStringsAndIndex(aFile);
504  return;
505  }
506 
507  final long size = aFile.getSize();
508  //if not to index content, or a dir, or 0 content, index meta data only
509 
510  if ((indexContent == false || aFile.isDir() || size == 0)) {
511  try {
512  if (context.fileIngestIsCancelled()) {
513  return;
514  }
515  ingester.indexMetaDataOnly(aFile);
516  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
517  } catch (IngesterException ex) {
518  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
519  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
520  }
521  return;
522  }
523 
 // Detect the MIME type so the right extractor can be chosen below.
524  String fileType;
525  try {
526  if (context.fileIngestIsCancelled()) {
527  return;
528  }
529  fileType = fileTypeDetector.getFileType(aFile);
530  } catch (TskCoreException ex) {
531  logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS
532  return;
533  }
534 
535  // we skip archive formats that are opened by the archive module.
536  // @@@ We could have a check here to see if the archive module was enabled though...
537  if (FileTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
538  try {
539  if (context.fileIngestIsCancelled()) {
540  return;
541  }
542  ingester.indexMetaDataOnly(aFile);
543  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
544  } catch (IngesterException ex) {
545  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
546  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
547  }
548  return;
549  }
550 
551  boolean wasTextAdded = false;
552 
553  //extract text with one of the extractors, divide into chunks and index with Solr
554  try {
555  //logger.log(Level.INFO, "indexing: " + aFile.getName());
556  if (context.fileIngestIsCancelled()) {
557  return;
558  }
 // Generic binary data gets strings extraction directly; no text
 // extractor would produce anything better for octet-stream content.
559  if (fileType.equals("application/octet-stream")) {
560  extractStringsAndIndex(aFile);
561  return;
562  }
563  if (!extractTextAndIndex(aFile, fileType)) {
564  logger.log(Level.WARNING, "Text extractor not found for file. Extracting strings only. File: ''{0}'' (id:{1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
565  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
566  } else {
567  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
568  wasTextAdded = true;
569  }
570 
571  } catch (IngesterException e) {
572  logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
573  + aFile.getName(), e);
574  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
575  } catch (Exception e) {
576  logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
577  + aFile.getName(), e);
578  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
579  }
580 
581  // if it wasn't supported or had an error, default to strings
582  if (wasTextAdded == false) {
583  extractStringsAndIndex(aFile);
584  }
585  }
586  }
587 }
TskData.TSK_DB_FILES_TYPE_ENUM getType()
synchronized void startJob(long jobId, long dataSourceId, List< String > keywordListNames)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
static synchronized SearchRunner getInstance()
void postMessage(final IngestMessage message)
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
synchronized static Logger getLogger(String name)
Definition: Logger.java:161
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing file type and text_ingested.

Copyright © 2012-2016 Basis Technology. Generated on: Mon Apr 24 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.