Autopsy  4.7.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
KeywordSearchIngestModule.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.util.ArrayList;
22 import java.util.HashMap;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.concurrent.atomic.AtomicInteger;
26 import java.util.logging.Level;
27 import org.openide.util.NbBundle;
28 import org.openide.util.NbBundle.Messages;
44 import org.sleuthkit.datamodel.AbstractFile;
45 import org.sleuthkit.datamodel.TskData;
46 import org.sleuthkit.datamodel.TskData.FileKnown;
47 
56 @NbBundle.Messages({
57  "# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
58  "KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
59  "SolrConnectionCheck.Port=Invalid port number.",
60  "# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
61  "KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
62  "CannotRunFileTypeDetection=Unable to run file type detection."
63 })
64 public final class KeywordSearchIngestModule implements FileIngestModule {
65 
    /**
     * Enumerates the periodic-search update frequency options for this
     * module. Each constant carries a numeric value exposed via
     * {@link #getTime()}.
     *
     * NOTE(review): the unit/interpretation of {@code time} is not evident
     * from this file (FAST carries the largest value, NONE carries
     * Integer.MAX_VALUE) — confirm against the search scheduler before
     * relying on it.
     */
    enum UpdateFrequency {

        FAST(20),
        AVG(10),
        SLOW(5),
        SLOWEST(1),
        NONE(Integer.MAX_VALUE),
        DEFAULT(5);
        // Numeric value associated with this frequency choice.
        private final int time;

        UpdateFrequency(int time) {
            this.time = time;
        }

        // Returns the numeric value carried by this constant.
        int getTime() {
            return time;
        }
    };
    private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
    // Sink for posting ingest inbox messages (summary, warnings).
    private final IngestServices services = IngestServices.getInstance();
    // Sends extracted text/metadata to the Solr server; obtained in startUp().
    private Ingester ingester = null;
    // Per-instance worker that extracts text and indexes individual files.
    private Indexer indexer;
    //only search images from current ingest, not images previously ingested/indexed
    //accessed read-only by searcher thread

    // Set once the first processed file triggers the background search job.
    private boolean startedSearching = false;
    // Content text extractors, tried in order (most specific first); built in startUp().
    private List<ContentTextExtractor> textExtractors;
    // Fallback extractor for raw-string extraction.
    private StringsTextExtractor stringExtractor;
    // Per-job module settings (enabled keyword lists, etc.).
    private final KeywordSearchJobSettings settings;
    // True only after startUp() completes successfully.
    private boolean initialized = false;
    // Id of the ingest job this instance works for; set in startUp().
    private long jobId;
    // Object id of the data source being ingested; set in startUp().
    private long dataSourceId;
    private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
    private int instanceNum = 0; // this instance's sequence number, for logging only
    private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
103 
    /**
     * Per-file ingest outcomes recorded while indexing.
     *
     * NOTE(review): several constants are elided from this view of the file;
     * postIndexSummary() also switches on TEXT_INGESTED, METADATA_INGESTED,
     * STRINGS_INGESTED, SKIPPED_ERROR_TEXTEXTRACT and SKIPPED_ERROR_INDEXING.
     */
    private enum IngestStatus {

        SKIPPED_ERROR_IO
    };
    // Per-job, per-file ingest status: job id -> (file id -> status).
    private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
123  private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
124  synchronized (ingestStatus) {
125  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
126  if (ingestStatusForJob == null) {
127  ingestStatusForJob = new HashMap<>();
128  ingestStatus.put(ingestJobId, ingestStatusForJob);
129  }
130  ingestStatusForJob.put(fileId, status);
131  ingestStatus.put(ingestJobId, ingestStatusForJob);
132  }
133  }
134 
135  KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
136  this.settings = settings;
137  instanceNum = instanceCount.getAndIncrement();
138  }
139 
    /**
     * Starts up this module instance for an ingest job. Verifies that the
     * Solr core is open and that the index's Solr/schema versions are
     * current, creates the file type detector and ingester, and — for the
     * first instance in the job — verifies server connectivity and warns if
     * no enabled keyword list contains keywords. Finally builds the string
     * and content text extractors and marks the instance initialized.
     *
     * NOTE(review): a few lines of this method (catch-clause headers and the
     * multi-user server-properties lookup) are elided from this view of the
     * file; inline NOTE(review) comments mark each spot.
     *
     * @param context the context of the ingest job this instance belongs to
     * @throws IngestModuleException if startup cannot complete
     */
    @Messages({
        "KeywordSearchIngestModule.startupMessage.failedToGetIndexSchema=Failed to get schema version for text index.",
        "# {0} - Solr version number", "KeywordSearchIngestModule.startupException.indexSolrVersionNotSupported=Adding text no longer supported for Solr version {0} of the text index.",
        "# {0} - schema version number", "KeywordSearchIngestModule.startupException.indexSchemaNotSupported=Adding text no longer supported for schema version {0} of the text index.",
        "KeywordSearchIngestModule.noOpenCase.errMsg=No open case available."
    })
    @Override
    public void startUp(IngestJobContext context) throws IngestModuleException {
        initialized = false;
        jobId = context.getJobId();
        dataSourceId = context.getDataSource().getId();

        // The Solr core for the case must already be open.
        Server server = KeywordSearch.getServer();
        if (server.coreIsOpen() == false) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
        }

        // Refuse to add text to an index built for another Solr or schema version.
        try {
            Index indexInfo = server.getIndexInfo();
            if (!IndexFinder.getCurrentSolrVersion().equals(indexInfo.getSolrVersion())) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSolrVersionNotSupported(indexInfo.getSolrVersion()));
            }
            if (!IndexFinder.getCurrentSchemaVersion().equals(indexInfo.getSchemaVersion())) {
                throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupException_indexSchemaNotSupported(indexInfo.getSchemaVersion()));
            }
        } catch (NoOpenCoreException ex) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startupMessage_failedToGetIndexSchema(), ex);
        }

        try {
            fileTypeDetector = new FileTypeDetector();
            // NOTE(review): the catch-clause header (original line 176) is elided from this view.
            throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
        }

        ingester = Ingester.getDefault();
        this.context = context;

        // increment the module reference count
        // if first instance of this module for this job then check the server and existence of keywords
        Case openCase;
        try {
            openCase = Case.getCurrentCaseThrows();
        } catch (NoCurrentCaseException ex) {
            throw new IngestModuleException(Bundle.KeywordSearchIngestModule_noOpenCase_errMsg(), ex);
        }
        if (refCounter.incrementAndGet(jobId) == 1) {
            if (openCase.getCaseType() == Case.CaseType.MULTI_USER_CASE) {
                // for multi-user cases need to verify connection to remote SOLR server
                KeywordSearchService kwsService = new SolrSearchService();
                // NOTE(review): the declaration of 'properties' (original line 195;
                // presumably Server.getMultiUserServerProperties(...)) is elided from this view.
                int port;
                try {
                    port = Integer.parseInt(properties.getPort());
                } catch (NumberFormatException ex) {
                    // if there is an error parsing the port number
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
                }
                try {
                    kwsService.tryConnect(properties.getHost(), port);
                } catch (KeywordSearchServiceException ex) {
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
                }
            } else {
                // for single-user cases need to verify connection to local SOLR service
                try {
                    if (!server.isRunning()) {
                        throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
                    }
                } catch (KeywordSearchModuleException ex) {
                    //this means Solr is not properly initialized
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
                }
                try {
                    // make an actual query to verify that server is responding
                    // we had cases where getStatus was OK, but the connection resulted in a 404
                    server.queryNumIndexedDocuments();
                    // NOTE(review): the catch-clause header (original line 222) is elided from this view.
                    throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
                }

                // check if this job has any searchable keywords
                List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
                boolean hasKeywordsForSearch = false;
                for (KeywordList keywordList : keywordLists) {
                    if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
                        hasKeywordsForSearch = true;
                        break;
                    }
                }
                if (!hasKeywordsForSearch) {
                    // Indexing still proceeds; warn the user that nothing will be searched.
                    services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
                            NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
                }
            }
        }

        //initialize extractors
        stringExtractor = new StringsTextExtractor();
        stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
        stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());

        textExtractors = new ArrayList<>();
        //order matters, more specific extractors first
        textExtractors.add(new HtmlTextExtractor());
        textExtractors.add(new TikaTextExtractor());

        indexer = new Indexer();
        initialized = true;
    }
255 
256  @Override
257  public ProcessResult process(AbstractFile abstractFile) {
258  if (initialized == false) //error initializing indexing/Solr
259  {
260  logger.log(Level.SEVERE, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
261  putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
262  return ProcessResult.OK;
263  }
264 
265  if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
266  //skip indexing of virtual dirs (no content, no real name) - will index children files
267  return ProcessResult.OK;
268  }
269 
270  if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
271  //index meta-data only
272  if (context.fileIngestIsCancelled()) {
273  return ProcessResult.OK;
274  }
275  indexer.indexFile(abstractFile, false);
276  return ProcessResult.OK;
277  }
278 
279  //index the file and content (if the content is supported)
280  if (context.fileIngestIsCancelled()) {
281  return ProcessResult.OK;
282  }
283  indexer.indexFile(abstractFile, true);
284 
285  // Start searching if it hasn't started already
286  if (!startedSearching) {
287  if (context.fileIngestIsCancelled()) {
288  return ProcessResult.OK;
289  }
290  List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
291  IngestSearchRunner.getInstance().startJob(context, keywordListNames);
292  startedSearching = true;
293  }
294 
295  return ProcessResult.OK;
296  }
297 
    /**
     * Shuts this module instance down. On ingest cancellation, stops the
     * search job and cleans up. Otherwise ends the search job (triggering the
     * final commit and final search) and, when this is the last instance for
     * the job, logs index counts, posts the indexing summary, and clears the
     * job's ingest-status records.
     *
     * NOTE(review): the catch-clause header between the chunk-count query and
     * the SEVERE log (original line 327) is elided from this view.
     */
    @Override
    public void shutDown() {
        logger.log(Level.INFO, "Keyword search ingest module instance {0} shutting down", instanceNum); //NON-NLS

        // Nothing to tear down if startUp() never completed (it sets both).
        if ((initialized == false) || (context == null)) {
            return;
        }

        if (context.fileIngestIsCancelled()) {
            logger.log(Level.INFO, "Keyword search ingest module instance {0} stopping search job due to ingest cancellation", instanceNum); //NON-NLS
            IngestSearchRunner.getInstance().stopJob(jobId);
            cleanup();
            return;
        }

        // Remove from the search list and trigger final commit and final search
        IngestSearchRunner.getInstance().endJob(jobId);

        // We only need to post the summary msg from the last module per job
        if (refCounter.decrementAndGet(jobId) == 0) {
            try {
                final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
                logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
                final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
                logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
                // NOTE(review): enclosing catch-clause header elided from this view.
                logger.log(Level.SEVERE, "Error executing Solr queries to check number of indexed files and file chunks", ex); //NON-NLS
            }
            postIndexSummary();
            // Drop this job's per-file status records now that they are reported.
            synchronized (ingestStatus) {
                ingestStatus.remove(jobId);
            }
        }

        cleanup();
    }
338 
    /**
     * Releases the text extractors and resets the initialized flag. Called
     * from shutDown() on both the cancellation and normal paths.
     */
    private void cleanup() {
        textExtractors.clear();
        textExtractors = null;
        stringExtractor = null;

        // Any further process() calls will take the not-initialized path.
        initialized = false;
    }
349 
353  private void postIndexSummary() {
354  int text_ingested = 0;
355  int metadata_ingested = 0;
356  int strings_ingested = 0;
357  int error_text = 0;
358  int error_index = 0;
359  int error_io = 0;
360 
361  synchronized (ingestStatus) {
362  Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
363  if (ingestStatusForJob == null) {
364  return;
365  }
366  for (IngestStatus s : ingestStatusForJob.values()) {
367  switch (s) {
368  case TEXT_INGESTED:
369  text_ingested++;
370  break;
371  case METADATA_INGESTED:
372  metadata_ingested++;
373  break;
374  case STRINGS_INGESTED:
375  strings_ingested++;
376  break;
377  case SKIPPED_ERROR_TEXTEXTRACT:
378  error_text++;
379  break;
380  case SKIPPED_ERROR_INDEXING:
381  error_index++;
382  break;
383  case SKIPPED_ERROR_IO:
384  error_io++;
385  break;
386  default:
387  ;
388  }
389  }
390  }
391 
392  StringBuilder msg = new StringBuilder();
393  msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
394  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
395  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
396  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
397  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
398  msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
399  msg.append("</table>"); //NON-NLS
400  String indexStats = msg.toString();
401  logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
402  services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
403  if (error_index > 0) {
404  MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
405  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
406  } else if (error_io + error_text > 0) {
407  MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
408  NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
409  }
410  }
411 
416  private class Indexer {
417 
418  private final Logger logger = Logger.getLogger(Indexer.class.getName());
419 
433  private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
434  ContentTextExtractor extractor = null;
435 
436  //go over available text extractors in order, and pick the first one (most specific one)
437  for (ContentTextExtractor fe : textExtractors) {
438  if (fe.isSupported(aFile, detectedFormat)) {
439  extractor = fe;
440  break;
441  }
442  }
443 
444  if (extractor == null) {
445  // No text extractor found.
446  return false;
447  }
448 
449  //logger.log(Level.INFO, "Extractor: " + fileExtract + ", file: " + aFile.getName());
450  //divide into chunks and index
451  return Ingester.getDefault().indexText(extractor, aFile, context);
452  }
453 
462  private boolean extractStringsAndIndex(AbstractFile aFile) {
463  try {
464  if (context.fileIngestIsCancelled()) {
465  return true;
466  }
467  if (Ingester.getDefault().indexText(stringExtractor, aFile, KeywordSearchIngestModule.this.context)) {
468  putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
469  return true;
470  } else {
471  logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
472  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
473  return false;
474  }
475  } catch (IngesterException ex) {
476  logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
477  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
478  return false;
479  }
480  }
481 
489  private void indexFile(AbstractFile aFile, boolean indexContent) {
490  //logger.log(Level.INFO, "Processing AbstractFile: " + abstractFile.getName());
491 
492  TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
493 
494  // unallocated and unused blocks can only have strings extracted from them.
495  if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
496  if (context.fileIngestIsCancelled()) {
497  return;
498  }
499  extractStringsAndIndex(aFile);
500  return;
501  }
502 
503  final long size = aFile.getSize();
504  //if not to index content, or a dir, or 0 content, index meta data only
505 
506  if ((indexContent == false || aFile.isDir() || size == 0)) {
507  try {
508  if (context.fileIngestIsCancelled()) {
509  return;
510  }
511  ingester.indexMetaDataOnly(aFile);
512  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
513  } catch (IngesterException ex) {
514  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
515  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
516  }
517  return;
518  }
519 
520  if (context.fileIngestIsCancelled()) {
521  return;
522  }
523  String fileType = fileTypeDetector.getMIMEType(aFile);
524 
525  // we skip archive formats that are opened by the archive module.
526  // @@@ We could have a check here to see if the archive module was enabled though...
527  if (ContentTextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
528  try {
529  if (context.fileIngestIsCancelled()) {
530  return;
531  }
532  ingester.indexMetaDataOnly(aFile);
533  putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
534  } catch (IngesterException ex) {
535  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
536  logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
537  }
538  return;
539  }
540 
541  boolean wasTextAdded = false;
542 
543  //extract text with one of the extractors, divide into chunks and index with Solr
544  try {
545  //logger.log(Level.INFO, "indexing: " + aFile.getName());
546  if (context.fileIngestIsCancelled()) {
547  return;
548  }
549  if (fileType.equals("application/octet-stream")) {
550  extractStringsAndIndex(aFile);
551  return;
552  }
553  if (!extractTextAndIndex(aFile, fileType)) {
554  // Text extractor not found for file. Extract string only.
555  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
556  } else {
557  putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
558  wasTextAdded = true;
559  }
560 
561  } catch (IngesterException e) {
562  logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
563  + aFile.getName(), e);
564  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
565  } catch (Exception e) {
566  logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
567  + aFile.getName(), e);
568  putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
569  }
570 
571  // if it wasn't supported or had an error, default to strings
572  if (wasTextAdded == false) {
573  extractStringsAndIndex(aFile);
574  }
575  }
576  }
577 }
static IndexingServerProperties getMultiUserServerProperties(String caseDirectory)
Definition: Server.java:852
boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat)
static IngestMessage createMessage(MessageType messageType, String source, String subject, String detailsHtml)
void postMessage(final IngestMessage message)
static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status)
static void error(String title, String message)
synchronized static Logger getLogger(String name)
Definition: Logger.java:124
static IngestMessage createWarningMessage(String source, String subject, String detailsHtml)
static void warn(String title, String message)
static synchronized IngestServices getInstance()
STRINGS_INGESTED
Text was extracted by knowing file type and text_ingested.

Copyright © 2012-2016 Basis Technology. Generated on: Mon Jun 18 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.