Autopsy  4.20.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
Ingester.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2021 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.BufferedReader;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.util.ArrayList;
27 import java.util.Collections;
28 import java.util.HashMap;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.Optional;
32 import java.util.logging.Level;
33 import org.apache.commons.lang3.math.NumberUtils;
34 import org.apache.solr.client.solrj.SolrServerException;
35 import org.apache.solr.common.SolrInputDocument;
36 import org.openide.util.NbBundle;
37 import org.openide.util.io.ReaderInputStream;
44 import org.sleuthkit.datamodel.AbstractFile;
45 import org.sleuthkit.datamodel.BlackboardArtifact;
46 import org.sleuthkit.datamodel.Content;
47 import org.sleuthkit.datamodel.DerivedFile;
48 import org.sleuthkit.datamodel.Directory;
49 import org.sleuthkit.datamodel.File;
50 import org.sleuthkit.datamodel.LayoutFile;
51 import org.sleuthkit.datamodel.LocalDirectory;
52 import org.sleuthkit.datamodel.LocalFile;
53 import org.sleuthkit.datamodel.Report;
54 import org.sleuthkit.datamodel.SlackFile;
55 import org.sleuthkit.datamodel.SleuthkitItemVisitor;
56 import org.sleuthkit.datamodel.SleuthkitVisitableItem;
57 import org.sleuthkit.datamodel.TskCoreException;
58 
62 //JMTODO: Should this class really be a singleton?
63 class Ingester {
64 
    private static final Logger logger = Logger.getLogger(Ingester.class.getName());
    // Set to true whenever a document is added to Solr and cleared by commit();
    // finalize() uses it to warn about work that was never committed.
    // (Historical spelling "uncommited" kept — renaming would touch other members.)
    private volatile boolean uncommitedIngests = false;
    // Shared Solr server handle for this process.
    private final Server solrServer = KeywordSearch.getServer();
    // Stateless visitor that maps Sleuth Kit items to their Solr metadata fields.
    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
    // Lazily-created singleton instance; see getDefault().
    private static Ingester instance;
    private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
            = new LanguageSpecificContentIndexingHelper();
    // Maximum number of characters from the first chunk that are fed to
    // language detection.
    private static final int LANGUAGE_DETECTION_STRING_SIZE = 4096;
73 
    // Private to enforce the singleton pattern; obtain instances via getDefault().
    private Ingester() {
    }
76 
77  public static synchronized Ingester getDefault() {
78  if (instance == null) {
79  instance = new Ingester();
80  }
81  return instance;
82  }
83 
    //JMTODO: this is probably useless
    // NOTE(review): Object.finalize() is deprecated since Java 9 and may never
    // run; this warning is best-effort only — consider removing or replacing
    // with a Cleaner.
    @Override
    @SuppressWarnings("FinalizeDeclaration")
    protected void finalize() throws Throwable {
        super.finalize();

        // Warn if files might have been left uncommitted.
        if (uncommitedIngests) {
            logger.warning("Ingester was used to add files that it never committed."); //NON-NLS
        }
    }
95 
    /**
     * Indexes only the Solr metadata fields of the given file — the document
     * content is empty. The lower-cased file name is used as the source name
     * for error reporting.
     *
     * @param file The file whose metadata fields will be indexed.
     *
     * @throws IngesterException if the document could not be added to Solr.
     */
    void indexMetaDataOnly(AbstractFile file) throws IngesterException {
        indexChunk("", "", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
    }
109 
    /**
     * Indexes only the Solr metadata fields of the given artifact — the
     * document content is empty.
     *
     * @param artifact   The artifact whose metadata fields will be indexed.
     * @param sourceName Name used to identify the artifact in error messages.
     *
     * @throws IngesterException if the document could not be added to Solr.
     */
    void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
        indexChunk("", "", sourceName, new HashMap<>(getContentFields(artifact)));
    }
123 
    /**
     * Computes the map of Solr field names to values for the given item by
     * dispatching to SOLR_FIELDS_VISITOR.
     *
     * @param item The Sleuth Kit item to get fields for.
     *
     * @return Map from Solr field name to string value.
     */
    private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
        return item.accept(SOLR_FIELDS_VISITOR);
    }
135 
153  // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
154 // < T extends SleuthkitVisitableItem> boolean search(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException {
155 // boolean doLanguageDetection = true;
156 // return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, keywordListNames);
157 // }
158 
177  // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
178 // < T extends SleuthkitVisitableItem> boolean searchStrings(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr) throws Ingester.IngesterException {
179 // // Per JIRA-7100, it was determined that language detection on extracted strings can take a really long time.
180 // boolean doLanguageDetection = false;
181 // return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, null);
182 // }
183 //
184 // < T extends SleuthkitVisitableItem> boolean searchStrings(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException {
185 // // Per JIRA-7100, it was determined that language detection on extracted strings can take a really long time.
186 // boolean doLanguageDetection = false;
187 // return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, keywordListNames);
188 // }
189 
    // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
    /**
     * Reads the source's text chunk-by-chunk, runs an inline keyword search on
     * each chunk, and indexes chunks into Solr. When indexIntoSolr is false,
     * only chunks that contain a hit — plus the immediately adjacent chunks —
     * are indexed (a sliding window kept in activeChunkList); otherwise every
     * chunk is indexed. After all chunks, a parent metadata document (with
     * NUM_CHUNKS and no CHUNK_SIZE) is indexed if any chunk was indexed.
     *
     * @param sourceReader        Reader over the source's extracted text.
     * @param sourceID            Sleuth Kit object id of the source.
     * @param sourceName          Name of the source, for ids and log messages.
     * @param source              The item being indexed (file or artifact).
     * @param context             Ingest job context, polled for cancellation.
     * @param doLanguageDetection If true, detect language on the first chunk only.
     * @param indexIntoSolr       If true, index every chunk regardless of hits.
     * @param keywordListNames    Keyword lists to search; if null, no chunks
     *                            are searched or indexed by the loop body.
     *
     * @throws Ingester.IngesterException if a document could not be added to Solr.
     * @throws Exception                  rethrown from the chunker on a read error.
     */
    < T extends SleuthkitVisitableItem> void search(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean doLanguageDetection, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException, IOException, TskCoreException, Exception {
        int numChunks = 0; //unknown until chunking is done
        Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
        Optional<Language> language = Optional.empty();
        InlineSearcher searcher = new InlineSearcher(keywordListNames, context);
        // Window of recent chunks that may still need to be indexed, pending
        // whether a neighboring chunk gets a hit.
        List<Chunk> activeChunkList = new ArrayList<>();
        boolean fileIndexed = false;

        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(sourceReader)) {
            Chunker chunker = new Chunker(reader);
            String name = sourceName;
            // Search the source's name itself, except for artifacts.
            if (!(source instanceof BlackboardArtifact)) {
                searcher.searchString(name, sourceID, 0);
            }

            while (chunker.hasNext()) {
                if (context.fileIngestIsCancelled()) {
                    logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                    return;
                }

                Chunk chunk = chunker.next();
                // Chunk ids are 1-based; numChunks is the count of chunks
                // processed so far.
                chunk.setChunkId(numChunks + 1);

                if (doLanguageDetection) {
                    int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
                    language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));

                    // only do language detection on the first chunk of the document
                    doLanguageDetection = false;
                }

                if (keywordListNames != null) {
                    boolean hitFoundInChunk = searcher.searchChunk(chunk, sourceID, numChunks);
                    if (!indexIntoSolr) {
                        if (!hitFoundInChunk) {
                            if (!activeChunkList.isEmpty()) {
                                if (activeChunkList.get(activeChunkList.size() - 1).hasHit()) {
                                    // Previous chunk had a hit: index it plus its
                                    // neighbors (this chunk included), then reset.
                                    activeChunkList.add(chunk);
                                    // Write List
                                    for (Chunk c : activeChunkList) {
                                        indexChunk(c, sourceID, sourceName, language, contentFields, chunker.hasNext());
                                    }
                                    activeChunkList.clear();
                                } else {
                                    // No hit in the window: keep only the current
                                    // chunk as potential context for a future hit.
                                    activeChunkList.clear();
                                    activeChunkList.add(chunk);
                                }
                            } else {
                                activeChunkList.add(chunk);
                            }
                        } else {
                            fileIndexed = true;
                            chunk.setHasHit(true);
                            activeChunkList.add(chunk);
                        }
                    } else {
                        indexChunk(chunk, sourceID, sourceName, language, contentFields, chunker.hasNext());
                        fileIndexed = true;
                    }
                }

                numChunks++;

            }

            // Flush any trailing window that still contains (or follows) a hit.
            if (activeChunkList.size() > 1 || (activeChunkList.size() == 1 && activeChunkList.get(0).hasHit())) {
                for (Chunk c : activeChunkList) {
                    indexChunk(c, sourceID, sourceName, language, contentFields, true);
                }
            }


            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                throw chunker.getException();
            }

        } finally {
            if (context.fileIngestIsCancelled()) {
                // NOTE(review): returning from a finally block discards any
                // exception currently propagating from the try block — confirm
                // this is the intended behavior on cancellation.
                return ;
            }

            if (fileIndexed) {
                Map<String, Object> fields = new HashMap<>(contentFields);
                //after all chunks, index just the meta data, including the numChunks, of the parent file
                fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
                //reset id field to base document id
                fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
                //"parent" docs don't have chunk_size
                fields.remove(Server.Schema.CHUNK_SIZE.toString());
                indexChunk(null, null, sourceName, fields);
            }
        }
    }
305 
    /**
     * Reads the source's text chunk-by-chunk and indexes every chunk into
     * Solr, followed by a parent metadata document carrying NUM_CHUNKS (and no
     * CHUNK_SIZE). Language detection, when requested, runs only on the first
     * chunk; if a language is detected, a language-specific "mini chunk" is
     * also indexed for every non-final chunk.
     *
     * @param sourceReader        Reader over the source's extracted text.
     * @param sourceID            Sleuth Kit object id of the source.
     * @param sourceName          Name of the source, for ids and log messages.
     * @param source              The item being indexed (file or artifact).
     * @param context             Ingest job context, polled for cancellation.
     * @param doLanguageDetection If true, detect language on the first chunk only.
     *
     * @return true if all chunks were indexed; false on cancellation, a
     *         chunking error, or a read error.
     *
     * @throws Ingester.IngesterException if a chunk could not be added to Solr
     *                                    (but see the NOTE in the finally block).
     */
    < T extends SleuthkitVisitableItem> boolean indexFile(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean doLanguageDetection) throws Ingester.IngesterException {
        int numChunks = 0; //unknown until chunking is done
        Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
        Optional<Language> language = Optional.empty();
        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(sourceReader)) {
            Chunker chunker = new Chunker(reader);
            while (chunker.hasNext()) {
                if ( context.fileIngestIsCancelled()) {
                    logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                    return false;
                }

                Chunk chunk = chunker.next();

                if (doLanguageDetection) {
                    int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
                    language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));

                    // only do language detection on the first chunk of the document
                    doLanguageDetection = false;
                }

                // Per-chunk fields: chunk document id and chunk size, layered
                // over the source's common metadata fields.
                Map<String, Object> fields = new HashMap<>(contentFields);
                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                fields.put(Server.Schema.ID.toString(), chunkId);
                fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));

                language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
                try {
                    //add the chunk text to Solr index
                    indexChunk(chunk.toString(), chunk.getLowerCasedChunk(), sourceName, fields);
                    // add mini chunk when there's a language specific field
                    if (chunker.hasNext() && language.isPresent()) {
                        languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
                    }
                    numChunks++;

                } catch (Ingester.IngesterException ingEx) {
                    logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS

                    throw ingEx; //need to rethrow to signal error and move on
                }
            }
            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                return false;
            }

        } catch (Exception ex) {
            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } finally {
            if (context.fileIngestIsCancelled()) {
                // NOTE(review): returning from a finally block discards any
                // exception propagating from the try block (including the
                // rethrown IngesterException above) — confirm this is intended.
                return false;
            } else {
                // Index the parent metadata document even when the body above
                // returned false, so partial indexing is still discoverable.
                Map<String, Object> fields = new HashMap<>(contentFields);
                //after all chunks, index just the meta data, including the numChunks, of the parent file
                fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
                //reset id field to base document id
                fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
                //"parent" docs don't have chunk_size
                fields.remove(Server.Schema.CHUNK_SIZE.toString());
                indexChunk(null, null, sourceName, fields);
            }
        }


        return true;
    }
377 
378  private void indexChunk(Chunk chunk, long sourceID, String sourceName, Optional<Language> language, Map<String, String> contentFields, boolean hasNext) throws IngesterException {
379  Map<String, Object> fields = new HashMap<>(contentFields);
380  String chunkId = Server.getChunkIdString(sourceID, chunk.getChunkId());
381  fields.put(Server.Schema.ID.toString(), chunkId);
382  fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
383 
384 
385  language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
386  try {
387  //add the chunk text to Solr index
388  indexChunk(chunk.toString(), chunk.getLowerCasedChunk(), sourceName, fields);
389  // add mini chunk when there's a language specific field
390  if (hasNext && language.isPresent()) {
391  languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
392  }
393 
394  } catch (Ingester.IngesterException ingEx) {
395  logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
396  + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
397 
398  throw ingEx; //need to rethrow to signal error and move on
399  }
400  }
401 
416  private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields) throws IngesterException {
417  if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
418  //JMTODO: actually if the we couldn't get the image id it is set to -1,
419  // but does this really mean we don't want to index it?
420 
421  //skip the file, image id unknown
422  String msg = NbBundle.getMessage(Ingester.class,
423  "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
424  logger.log(Level.SEVERE, msg);
425  throw new IngesterException(msg);
426  }
427 
428  //Make a SolrInputDocument out of the field map
429  SolrInputDocument updateDoc = new SolrInputDocument();
430  for (String key : fields.keySet()) {
431  if (fields.get(key).getClass() == String.class) {
432  updateDoc.addField(key, Chunker.sanitize((String)fields.get(key)).toString());
433  } else {
434  updateDoc.addField(key, fields.get(key));
435  }
436  }
437 
438  try {
439  //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
440 
441  //add the content to the SolrInputDocument
442  //JMTODO: can we just add it to the field map before passing that in?
443  updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
444 
445  // We also add the content (if present) in lowercase form to facilitate case
446  // insensitive substring/regular expression search.
447  double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
448  if (indexSchemaVersion >= 2.1) {
449  updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : lowerCasedChunk));
450  }
451 
452  TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");
453 
454  solrServer.addDocument(updateDoc);
455  HealthMonitor.submitTimingMetric(metric);
456  uncommitedIngests = true;
457 
458  } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
459  //JMTODO: does this need to be internationalized?
460  throw new IngesterException(
461  NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
462  }
463  }
464 
469  void commit() {
470  try {
471  solrServer.commit();
472  uncommitedIngests = false;
473  } catch (NoOpenCoreException | SolrServerException ex) {
474  logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
475 
476  }
477  }
478 
482  static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
483 
484  @Override
485  protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
486  return new HashMap<>();
487  }
488 
489  @Override
490  public Map<String, String> visit(File f) {
491  return getCommonAndMACTimeFields(f);
492  }
493 
494  @Override
495  public Map<String, String> visit(DerivedFile df) {
496  return getCommonAndMACTimeFields(df);
497  }
498 
499  @Override
500  public Map<String, String> visit(Directory d) {
501  return getCommonAndMACTimeFields(d);
502  }
503 
504  @Override
505  public Map<String, String> visit(LocalDirectory ld) {
506  return getCommonAndMACTimeFields(ld);
507  }
508 
509  @Override
510  public Map<String, String> visit(LayoutFile lf) {
511  // layout files do not have times
512  return getCommonFields(lf);
513  }
514 
515  @Override
516  public Map<String, String> visit(LocalFile lf) {
517  return getCommonAndMACTimeFields(lf);
518  }
519 
520  @Override
521  public Map<String, String> visit(SlackFile f) {
522  return getCommonAndMACTimeFields(f);
523  }
524 
534  private Map<String, String> getCommonAndMACTimeFields(AbstractFile file) {
535  Map<String, String> params = getCommonFields(file);
536  params.put(Server.Schema.CTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getCtime()));
537  params.put(Server.Schema.ATIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getAtime()));
538  params.put(Server.Schema.MTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getMtime()));
539  params.put(Server.Schema.CRTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getCrtime()));
540  return params;
541  }
542 
551  private Map<String, String> getCommonFields(AbstractFile file) {
552  Map<String, String> params = new HashMap<>();
553  params.put(Server.Schema.ID.toString(), Long.toString(file.getId()));
554  try {
555  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(file.getDataSource().getId()));
556  } catch (TskCoreException ex) {
557  logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + file.getId(), ex); //NON-NLS
558  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
559  }
560  params.put(Server.Schema.FILE_NAME.toString(), file.getName().toLowerCase());
561  return params;
562  }
563 
571  @Override
572  public Map<String, String> visit(BlackboardArtifact artifact) {
573  Map<String, String> params = new HashMap<>();
574  params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
575  try {
576  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
577  } catch (TskCoreException ex) {
578  logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
579  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
580  }
581  return params;
582  }
583 
591  @Override
592  public Map<String, String> visit(Report report) {
593  Map<String, String> params = new HashMap<>();
594  params.put(Server.Schema.ID.toString(), Long.toString(report.getId()));
595  try {
596  Content dataSource = report.getDataSource();
597  if (null == dataSource) {
598  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
599  } else {
600  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
601  }
602  } catch (TskCoreException ex) {
603  logger.log(Level.SEVERE, "Could not get data source id to properly index the report, using default value. Id: " + report.getId(), ex); //NON-NLS
604  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
605  }
606  return params;
607  }
608  }
609 
    /**
     * Indicates that there was an error adding a document (a chunk of text or
     * metadata) to the Solr index.
     */
    static class IngesterException extends Exception {

        private static final long serialVersionUID = 1L;

        IngesterException(String message, Throwable ex) {
            super(message, ex);
        }

        IngesterException(String message) {
            super(message);
        }
    }
626 }
Map< String, String > visit(LocalDirectory ld)
Definition: Ingester.java:505
Map< String, String > getCommonAndMACTimeFields(AbstractFile file)
Definition: Ingester.java:534
Map< String, String > getCommonFields(AbstractFile file)
Definition: Ingester.java:551
Map< String, String > visit(BlackboardArtifact artifact)
Definition: Ingester.java:572
static String getFormattedTimeISO8601(long epochTime)
Map< String, String > defaultVisit(SleuthkitVisitableItem svi)
Definition: Ingester.java:485

Copyright © 2012-2022 Basis Technology. Generated on: Tue Aug 1 2023
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.