Autopsy  4.19.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
Ingester.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2021 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.BufferedReader;
22 import java.io.Reader;
23 import java.util.Collections;
24 import java.util.HashMap;
25 import java.util.Map;
26 import java.util.Optional;
27 import java.util.logging.Level;
28 import org.apache.commons.lang3.math.NumberUtils;
29 import org.apache.solr.client.solrj.SolrServerException;
30 import org.apache.solr.common.SolrInputDocument;
31 import org.openide.util.NbBundle;
39 import org.sleuthkit.datamodel.AbstractFile;
40 import org.sleuthkit.datamodel.BlackboardArtifact;
41 import org.sleuthkit.datamodel.Content;
42 import org.sleuthkit.datamodel.DerivedFile;
43 import org.sleuthkit.datamodel.Directory;
44 import org.sleuthkit.datamodel.File;
45 import org.sleuthkit.datamodel.LayoutFile;
46 import org.sleuthkit.datamodel.LocalDirectory;
47 import org.sleuthkit.datamodel.LocalFile;
48 import org.sleuthkit.datamodel.Report;
49 import org.sleuthkit.datamodel.SlackFile;
50 import org.sleuthkit.datamodel.SleuthkitItemVisitor;
51 import org.sleuthkit.datamodel.SleuthkitVisitableItem;
52 import org.sleuthkit.datamodel.TskCoreException;
53 
57 //JMTODO: Should this class really be a singleton?
58 class Ingester {
59 
// ---- class-level state -------------------------------------------------
/** Logger used by the ingester and by its nested field visitor. */
private static final Logger logger = Logger.getLogger(Ingester.class.getName());
/** Shared visitor that turns a visitable item into its Solr metadata fields. */
private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
/** Maximum number of characters of the first chunk handed to language detection. */
private static final int LANGUAGE_DETECTION_STRING_SIZE = 4096;
/** The singleton; created lazily by getDefault(). */
private static Ingester instance;

// ---- per-instance state ------------------------------------------------
/** Set after a document is added to Solr, cleared after a successful commit. */
private volatile boolean uncommitedIngests = false;
/** Keyword search server that documents are added and committed to. */
private final Server solrServer = KeywordSearch.getServer();
/** Helper for language detection and language specific fields / mini chunks. */
private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper = new LanguageSpecificContentIndexingHelper();

/**
 * Private: instances are only handed out through getDefault().
 */
private Ingester() {
    // nothing to initialise beyond the field initialisers
}
71 
72  public static synchronized Ingester getDefault() {
73  if (instance == null) {
74  instance = new Ingester();
75  }
76  return instance;
77  }
78 
79  //JMTODO: this is probably useless
80  @Override
81  @SuppressWarnings("FinalizeDeclaration")
82  protected void finalize() throws Throwable {
83  super.finalize();
84 
85  // Warn if files might have been left uncommited.
86  if (uncommitedIngests) {
87  logger.warning("Ingester was used to add files that it never committed."); //NON-NLS
88  }
89  }
90 
101  void indexMetaDataOnly(AbstractFile file) throws IngesterException {
102  indexChunk("", "", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
103  }
104 
115  void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
116  indexChunk("", "", sourceName, new HashMap<>(getContentFields(artifact)));
117  }
118 
127  private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
128  return item.accept(SOLR_FIELDS_VISITOR);
129  }
130 
148  // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
149  < T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
150  boolean doLanguageDetection = true;
151  return indexText(sourceReader, sourceID, sourceName, source, context, doLanguageDetection);
152  }
153 
172  // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
173  < T extends SleuthkitVisitableItem> boolean indexStrings(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
174  // Per JIRA-7100, it was determined that language detection on extracted strings can take a really long time.
175  boolean doLanguageDetection = false;
176  return indexText(sourceReader, sourceID, sourceName, source, context, doLanguageDetection);
177  }
178 
197  // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
198  private < T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean doLanguageDetection) throws Ingester.IngesterException {
199  int numChunks = 0; //unknown until chunking is done
200 
201  Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
202  Optional<Language> language = Optional.empty();
203  //Get a reader for the content of the given source
204  try (BufferedReader reader = new BufferedReader(sourceReader)) {
205  Chunker chunker = new Chunker(reader);
206  while (chunker.hasNext()) {
207  if (context != null && context.fileIngestIsCancelled()) {
208  logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
209  return false;
210  }
211 
212  Chunk chunk = chunker.next();
213  Map<String, Object> fields = new HashMap<>(contentFields);
214  String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
215  fields.put(Server.Schema.ID.toString(), chunkId);
216  fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
217 
218  if (doLanguageDetection) {
219  int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
220  language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));
221 
222  // only do language detection on the first chunk of the document
223  doLanguageDetection = false;
224  }
225  language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
226  try {
227  //add the chunk text to Solr index
228  indexChunk(chunk.toString(), chunk.geLowerCasedChunk(), sourceName, fields);
229  // add mini chunk when there's a language specific field
230  if (chunker.hasNext() && language.isPresent()) {
231  languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
232  }
233  numChunks++;
234  } catch (Ingester.IngesterException ingEx) {
235  logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
236  + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
237 
238  throw ingEx; //need to rethrow to signal error and move on
239  }
240  }
241  if (chunker.hasException()) {
242  logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
243  return false;
244  }
245  } catch (Exception ex) {
246  logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
247  return false;
248  } finally {
249  if (context != null && context.fileIngestIsCancelled()) {
250  return false;
251  } else {
252  Map<String, Object> fields = new HashMap<>(contentFields);
253  //after all chunks, index just the meta data, including the numChunks, of the parent file
254  fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
255  //reset id field to base document id
256  fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
257  //"parent" docs don't have chunk_size
258  fields.remove(Server.Schema.CHUNK_SIZE.toString());
259  indexChunk(null, null, sourceName, fields);
260  }
261  }
262  return true;
263  }
264 
279  private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields) throws IngesterException {
280  if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
281  //JMTODO: actually if the we couldn't get the image id it is set to -1,
282  // but does this really mean we don't want to index it?
283 
284  //skip the file, image id unknown
285  String msg = NbBundle.getMessage(Ingester.class,
286  "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
287  logger.log(Level.SEVERE, msg);
288  throw new IngesterException(msg);
289  }
290 
291  //Make a SolrInputDocument out of the field map
292  SolrInputDocument updateDoc = new SolrInputDocument();
293  for (String key : fields.keySet()) {
294  if (fields.get(key).getClass() == String.class) {
295  updateDoc.addField(key, Chunker.sanitize((String)fields.get(key)).toString());
296  } else {
297  updateDoc.addField(key, fields.get(key));
298  }
299  }
300 
301  try {
302  //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
303 
304  //add the content to the SolrInputDocument
305  //JMTODO: can we just add it to the field map before passing that in?
306  updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
307 
308  // We also add the content (if present) in lowercase form to facilitate case
309  // insensitive substring/regular expression search.
310  double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
311  if (indexSchemaVersion >= 2.1) {
312  updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : lowerCasedChunk));
313  }
314 
315  TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");
316 
317  solrServer.addDocument(updateDoc);
318  HealthMonitor.submitTimingMetric(metric);
319  uncommitedIngests = true;
320 
321  } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
322  //JMTODO: does this need to be internationalized?
323  throw new IngesterException(
324  NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
325  }
326  }
327 
332  void commit() {
333  try {
334  solrServer.commit();
335  uncommitedIngests = false;
336  } catch (NoOpenCoreException | SolrServerException ex) {
337  logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
338 
339  }
340  }
341 
345  static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
346 
347  @Override
348  protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
349  return new HashMap<>();
350  }
351 
352  @Override
353  public Map<String, String> visit(File f) {
354  return getCommonAndMACTimeFields(f);
355  }
356 
357  @Override
358  public Map<String, String> visit(DerivedFile df) {
359  return getCommonAndMACTimeFields(df);
360  }
361 
362  @Override
363  public Map<String, String> visit(Directory d) {
364  return getCommonAndMACTimeFields(d);
365  }
366 
367  @Override
368  public Map<String, String> visit(LocalDirectory ld) {
369  return getCommonAndMACTimeFields(ld);
370  }
371 
372  @Override
373  public Map<String, String> visit(LayoutFile lf) {
374  // layout files do not have times
375  return getCommonFields(lf);
376  }
377 
378  @Override
379  public Map<String, String> visit(LocalFile lf) {
380  return getCommonAndMACTimeFields(lf);
381  }
382 
383  @Override
384  public Map<String, String> visit(SlackFile f) {
385  return getCommonAndMACTimeFields(f);
386  }
387 
397  private Map<String, String> getCommonAndMACTimeFields(AbstractFile file) {
398  Map<String, String> params = getCommonFields(file);
399  params.put(Server.Schema.CTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getCtime()));
400  params.put(Server.Schema.ATIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getAtime()));
401  params.put(Server.Schema.MTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getMtime()));
402  params.put(Server.Schema.CRTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getCrtime()));
403  return params;
404  }
405 
414  private Map<String, String> getCommonFields(AbstractFile file) {
415  Map<String, String> params = new HashMap<>();
416  params.put(Server.Schema.ID.toString(), Long.toString(file.getId()));
417  try {
418  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(file.getDataSource().getId()));
419  } catch (TskCoreException ex) {
420  logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + file.getId(), ex); //NON-NLS
421  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
422  }
423  params.put(Server.Schema.FILE_NAME.toString(), file.getName().toLowerCase());
424  return params;
425  }
426 
434  @Override
435  public Map<String, String> visit(BlackboardArtifact artifact) {
436  Map<String, String> params = new HashMap<>();
437  params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
438  try {
439  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
440  } catch (TskCoreException ex) {
441  logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
442  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
443  }
444  return params;
445  }
446 
454  @Override
455  public Map<String, String> visit(Report report) {
456  Map<String, String> params = new HashMap<>();
457  params.put(Server.Schema.ID.toString(), Long.toString(report.getId()));
458  try {
459  Content dataSource = report.getDataSource();
460  if (null == dataSource) {
461  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
462  } else {
463  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
464  }
465  } catch (TskCoreException ex) {
466  logger.log(Level.SEVERE, "Could not get data source id to properly index the report, using default value. Id: " + report.getId(), ex); //NON-NLS
467  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
468  }
469  return params;
470  }
471  }
472 
477  static class IngesterException extends Exception {
478 
479  private static final long serialVersionUID = 1L;
480 
481  IngesterException(String message, Throwable ex) {
482  super(message, ex);
483  }
484 
485  IngesterException(String message) {
486  super(message);
487  }
488  }
489 }
Map< String, String > visit(LocalDirectory ld)
Definition: Ingester.java:368
Map< String, String > getCommonAndMACTimeFields(AbstractFile file)
Definition: Ingester.java:397
Map< String, String > getCommonFields(AbstractFile file)
Definition: Ingester.java:414
Map< String, String > visit(BlackboardArtifact artifact)
Definition: Ingester.java:435
static String getFormattedTimeISO8601(long epochTime)
Map< String, String > defaultVisit(SleuthkitVisitableItem svi)
Definition: Ingester.java:348

Copyright © 2012-2021 Basis Technology. Generated on: Thu Sep 30 2021
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.