Autopsy 4.17.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
Ingester.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2018 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.BufferedReader;
22 import java.io.Reader;
23 import java.util.Collections;
24 import java.util.HashMap;
25 import java.util.Map;
26 import java.util.Optional;
27 import java.util.logging.Level;
28 import org.apache.commons.lang3.math.NumberUtils;
29 import org.apache.solr.client.solrj.SolrServerException;
30 import org.apache.solr.common.SolrInputDocument;
31 import org.openide.util.NbBundle;
38 import org.sleuthkit.datamodel.AbstractFile;
39 import org.sleuthkit.datamodel.BlackboardArtifact;
40 import org.sleuthkit.datamodel.Content;
41 import org.sleuthkit.datamodel.DerivedFile;
42 import org.sleuthkit.datamodel.Directory;
43 import org.sleuthkit.datamodel.File;
44 import org.sleuthkit.datamodel.LayoutFile;
45 import org.sleuthkit.datamodel.LocalDirectory;
46 import org.sleuthkit.datamodel.LocalFile;
47 import org.sleuthkit.datamodel.Report;
48 import org.sleuthkit.datamodel.SlackFile;
49 import org.sleuthkit.datamodel.SleuthkitItemVisitor;
50 import org.sleuthkit.datamodel.SleuthkitVisitableItem;
51 import org.sleuthkit.datamodel.TskCoreException;
52 
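/**
 * Handles indexing files and artifacts on the Solr core used for keyword
 * search. Large content is broken into chunks before indexing, and callers
 * are expected to invoke commit() once a batch of ingests is complete so the
 * new documents become searchable.
 */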
//JMTODO: Should this class really be a singleton?
class Ingester {

    private static final Logger logger = Logger.getLogger(Ingester.class.getName());
    private volatile boolean uncommittedIngests = false;
    private final Server solrServer = KeywordSearch.getServer();
    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
    private static Ingester instance;
    private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
            = new LanguageSpecificContentIndexingHelper();

    private Ingester() {
    }

    public static synchronized Ingester getDefault() {
        if (instance == null) {
            instance = new Ingester();
        }
        return instance;
    }

    //JMTODO: this is probably useless
    @Override
    @SuppressWarnings("FinalizeDeclaration")
    protected void finalize() throws Throwable {
        super.finalize();

        // Warn if files might have been left uncommitted.
        if (uncommittedIngests) {
            logger.warning("Ingester was used to add files that it never committed."); //NON-NLS
        }
    }

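    /**
     * Indexes only the metadata of a file (name, times, etc.), leaving the
     * content field empty.
     *
     * @param file The file whose metadata is to be indexed.
     *
     * @throws IngesterException if the document could not be added to the
     *                           Solr server.
     */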
    void indexMetaDataOnly(AbstractFile file) throws IngesterException {
        indexChunk("", "", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
    }

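    /**
     * Indexes only the metadata of an artifact, leaving the content field
     * empty.
     *
     * @param artifact   The artifact whose metadata is to be indexed.
     * @param sourceName A name to associate with the indexed document.
     *
     * @throws IngesterException if the document could not be added to the
     *                           Solr server.
     */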
    void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
        indexChunk("", "", sourceName, new HashMap<>(getContentFields(artifact)));
    }

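    /**
     * Creates the map of Solr fields for the given item by applying the
     * SolrFieldsVisitor to it.
     *
     * @param item The item to get fields for.
     *
     * @return The field map for the item.
     */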
    private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
        return item.accept(SOLR_FIELDS_VISITOR);
    }

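    /**
     * Reads text from the given Reader, breaks it into chunks, and indexes the
     * chunks, running language detection on each chunk.
     *
     * @param sourceReader The reader supplying the text to index.
     * @param sourceID     The object ID of the source.
     * @param sourceName   A name to associate with the indexed documents.
     * @param source       The source item, used to derive metadata fields.
     * @param context      The ingest job context, used to check for cancellation.
     *
     * @return True if indexing completed, false if it was cancelled or failed.
     *
     * @throws IngesterException if a chunk could not be added to the Solr
     *                           server.
     */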
    // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
    <T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
        boolean doLanguageDetection = true;
        return indexText(sourceReader, sourceID, sourceName, source, context, doLanguageDetection);
    }

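    /**
     * Reads extracted strings from the given Reader, breaks them into chunks,
     * and indexes the chunks. Language detection is skipped because it is
     * expensive on extracted strings (see JIRA-7100).
     *
     * @param sourceReader The reader supplying the strings to index.
     * @param sourceID     The object ID of the source.
     * @param sourceName   A name to associate with the indexed documents.
     * @param source       The source item, used to derive metadata fields.
     * @param context      The ingest job context, used to check for cancellation.
     *
     * @return True if indexing completed, false if it was cancelled or failed.
     *
     * @throws IngesterException if a chunk could not be added to the Solr
     *                           server.
     */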
    // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
    <T extends SleuthkitVisitableItem> boolean indexStrings(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context) throws Ingester.IngesterException {
        // Per JIRA-7100, language detection on extracted strings can take a very long time, so skip it.
        boolean doLanguageDetection = false;
        return indexText(sourceReader, sourceID, sourceName, source, context, doLanguageDetection);
    }

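    /**
     * Common implementation behind indexText and indexStrings: chunks the text
     * from the reader, indexes each chunk, and finally indexes a "parent"
     * document that holds only the source metadata and the number of chunks.
     *
     * @param sourceReader        The reader supplying the text to index.
     * @param sourceID            The object ID of the source.
     * @param sourceName          A name to associate with the indexed documents.
     * @param source              The source item, used to derive metadata fields.
     * @param context             The ingest job context, used to check for cancellation.
     * @param doLanguageDetection Whether to run language detection on each chunk.
     *
     * @return True if indexing completed, false if it was cancelled or failed.
     *
     * @throws IngesterException if a chunk could not be added to the Solr
     *                           server.
     */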
    // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
    private <T extends SleuthkitVisitableItem> boolean indexText(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean doLanguageDetection) throws Ingester.IngesterException {
        int numChunks = 0; //unknown until chunking is done

        Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(sourceReader)) {
            Chunker chunker = new Chunker(reader);
            while (chunker.hasNext()) {
                if (context != null && context.fileIngestIsCancelled()) {
                    logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                    return false;
                }

                Chunk chunk = chunker.next();
                Map<String, Object> fields = new HashMap<>(contentFields);
                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                fields.put(Server.Schema.ID.toString(), chunkId);
                fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
                Optional<Language> language = Optional.empty();
                if (doLanguageDetection) {
                    language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk);
                    language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
                }
                try {
                    //add the chunk text to the Solr index
                    indexChunk(chunk.toString(), chunk.geLowerCasedChunk(), sourceName, fields);
                    // add a mini chunk when there is a language-specific field
                    if (chunker.hasNext() && language.isPresent()) {
                        languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
                    }
                    numChunks++;
                } catch (Ingester.IngesterException ingEx) {
                    logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS

                    throw ingEx; //need to rethrow to signal error and move on
                }
            }
            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                return false;
            }
        } catch (Exception ex) {
            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } finally {
            if (context != null && context.fileIngestIsCancelled()) {
                return false;
            } else {
                Map<String, Object> fields = new HashMap<>(contentFields);
                //after all chunks, index just the metadata, including numChunks, of the parent file
                fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
                //reset id field to base document id
                fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
                //"parent" docs don't have chunk_size
                fields.remove(Server.Schema.CHUNK_SIZE.toString());
                indexChunk(null, null, sourceName, fields);
            }
        }
        return true;
    }

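    /**
     * Adds a single document to the Solr index, combining the given field map
     * with the chunk text (and, for index schema 2.1 and later, a lower-cased
     * copy of the text to support case-insensitive substring and regular
     * expression search).
     *
     * @param chunk           The chunk text, or null for metadata-only documents.
     * @param lowerCasedChunk The lower-cased chunk text.
     * @param sourceName      A name to associate with the document.
     * @param fields          The field map for the document.
     *
     * @throws IngesterException if the document could not be added to the
     *                           Solr server.
     */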
    private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields) throws IngesterException {
        if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
            //JMTODO: actually, if we couldn't get the image id it is set to -1,
            // but does this really mean we don't want to index it?

            //skip the file, image id unknown
            String msg = NbBundle.getMessage(Ingester.class,
                    "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to be internationalized?
            logger.log(Level.SEVERE, msg);
            throw new IngesterException(msg);
        }

        //Make a SolrInputDocument out of the field map
        SolrInputDocument updateDoc = new SolrInputDocument();
        for (String key : fields.keySet()) {
            if (fields.get(key).getClass() == String.class) {
                updateDoc.addField(key, Chunker.sanitize((String) fields.get(key)).toString());
            } else {
                updateDoc.addField(key, fields.get(key));
            }
        }

        try {
            //TODO: consider timeout thread, or vary socket timeout based on size of indexed content

            //add the content to the SolrInputDocument
            //JMTODO: can we just add it to the field map before passing that in?
            updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);

            // We also add the content (if present) in lowercase form to facilitate case
            // insensitive substring/regular expression search.
            double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
            if (indexSchemaVersion >= 2.1) {
                updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : lowerCasedChunk));
            }

            TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");

            solrServer.addDocument(updateDoc);
            HealthMonitor.submitTimingMetric(metric);
            uncommittedIngests = true;

        } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
            //JMTODO: does this need to be internationalized?
            throw new IngesterException(
                    NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
        }
    }

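    /**
     * Commits any pending additions to the Solr index so that they become
     * searchable.
     */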
    void commit() {
        try {
            solrServer.commit();
            uncommittedIngests = false;
        } catch (NoOpenCoreException | SolrServerException ex) {
            logger.log(Level.WARNING, "Error committing index", ex); //NON-NLS
        }
    }

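    /**
     * Visitor that builds the map of Solr fields (ID, IMAGE_ID, FILE_NAME, and
     * MAC times where available) for each type of SleuthkitVisitableItem.
     */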
    private static class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {

        @Override
        protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
            return new HashMap<>();
        }

        @Override
        public Map<String, String> visit(File f) {
            return getCommonAndMACTimeFields(f);
        }

        @Override
        public Map<String, String> visit(DerivedFile df) {
            return getCommonAndMACTimeFields(df);
        }

        @Override
        public Map<String, String> visit(Directory d) {
            return getCommonAndMACTimeFields(d);
        }

        @Override
        public Map<String, String> visit(LocalDirectory ld) {
            return getCommonAndMACTimeFields(ld);
        }

        @Override
        public Map<String, String> visit(LayoutFile lf) {
            // layout files do not have times
            return getCommonFields(lf);
        }

        @Override
        public Map<String, String> visit(LocalFile lf) {
            return getCommonAndMACTimeFields(lf);
        }

        @Override
        public Map<String, String> visit(SlackFile f) {
            return getCommonAndMACTimeFields(f);
        }

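        /**
         * Returns the common fields for a file plus its modified, accessed,
         * changed, and created times formatted as ISO 8601 strings.
         *
         * @param file The file to get fields for.
         *
         * @return The field map for the file.
         */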
        private Map<String, String> getCommonAndMACTimeFields(AbstractFile file) {
            Map<String, String> params = getCommonFields(file);
            params.put(Server.Schema.CTIME.toString(), ContentUtils.getStringTimeISO8601(file.getCtime(), file));
            params.put(Server.Schema.ATIME.toString(), ContentUtils.getStringTimeISO8601(file.getAtime(), file));
            params.put(Server.Schema.MTIME.toString(), ContentUtils.getStringTimeISO8601(file.getMtime(), file));
            params.put(Server.Schema.CRTIME.toString(), ContentUtils.getStringTimeISO8601(file.getCrtime(), file));
            return params;
        }

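        /**
         * Returns the fields common to all files: the object ID, the data
         * source (image) ID, and the lower-cased file name. If the data source
         * ID cannot be determined, -1 is used.
         *
         * @param file The file to get fields for.
         *
         * @return The field map for the file.
         */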
        private Map<String, String> getCommonFields(AbstractFile file) {
            Map<String, String> params = new HashMap<>();
            params.put(Server.Schema.ID.toString(), Long.toString(file.getId()));
            try {
                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(file.getDataSource().getId()));
            } catch (TskCoreException ex) {
                logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + file.getId(), ex); //NON-NLS
                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
            }
            params.put(Server.Schema.FILE_NAME.toString(), file.getName().toLowerCase());
            return params;
        }

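        /**
         * Returns the fields for an artifact: the artifact ID and the data
         * source (image) ID, or -1 if the data source cannot be determined.
         *
         * @param artifact The artifact to get fields for.
         *
         * @return The field map for the artifact.
         */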
        @Override
        public Map<String, String> visit(BlackboardArtifact artifact) {
            Map<String, String> params = new HashMap<>();
            params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
            try {
                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
            } catch (TskCoreException ex) {
                logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
            }
            return params;
        }

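        /**
         * Returns the fields for a report: the report object ID and the data
         * source (image) ID, or -1 if the report has no data source or it
         * cannot be determined.
         *
         * @param report The report to get fields for.
         *
         * @return The field map for the report.
         */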
        @Override
        public Map<String, String> visit(Report report) {
            Map<String, String> params = new HashMap<>();
            params.put(Server.Schema.ID.toString(), Long.toString(report.getId()));
            try {
                Content dataSource = report.getDataSource();
                if (null == dataSource) {
                    params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
                } else {
                    params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
                }
            } catch (TskCoreException ex) {
                logger.log(Level.SEVERE, "Could not get data source id to properly index the report, using default value. Id: " + report.getId(), ex); //NON-NLS
                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
            }
            return params;
        }
    }

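    /**
     * Indicates that a problem occurred while adding a document to the Solr
     * server.
     */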
    static class IngesterException extends Exception {

        private static final long serialVersionUID = 1L;

        IngesterException(String message, Throwable ex) {
            super(message, ex);
        }

        IngesterException(String message) {
            super(message);
        }
    }
}
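
For context, here is a minimal, hypothetical sketch of how this class is driven from elsewhere in the org.sleuthkit.autopsy.keywordsearch package (Ingester and its methods are package-private): obtain the singleton, stream extracted text through indexText(), and commit so the new documents become searchable. The helper name, the per-call commit, and the origin of the Reader are illustrative assumptions, not part of this file.

// Hypothetical helper, not part of Ingester.java. It assumes it lives in the
// org.sleuthkit.autopsy.keywordsearch package and that the caller supplies a
// Reader over already-extracted text plus the ingest framework's IngestJobContext.
static boolean indexExtractedText(Reader extractedText, AbstractFile file, IngestJobContext context) {
    Ingester ingester = Ingester.getDefault();
    try {
        // Chunks the text, indexes each chunk, then indexes the metadata-only parent document.
        return ingester.indexText(extractedText, file.getId(), file.getName(), file, context);
    } catch (Ingester.IngesterException ex) {
        // A chunk could not be added to the Solr server.
        return false;
    } finally {
        // Makes the newly added documents searchable; committing per call is shown only for brevity.
        ingester.commit();
    }
}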
Copyright © 2012-2021 Basis Technology. Generated on: Tue Jan 19 2021
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.