19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.BufferedReader;
22 import java.io.Reader;
23 import java.util.Collections;
24 import java.util.HashMap;
26 import java.util.Optional;
27 import java.util.logging.Level;
28 import org.apache.commons.lang3.math.NumberUtils;
29 import org.apache.solr.client.solrj.SolrServerException;
30 import org.apache.solr.common.SolrInputDocument;
31 import org.openide.util.NbBundle;
// Logger used to report indexing problems and the never-committed warning in finalize().
60 private static final Logger logger = Logger.getLogger(Ingester.class.getName());
// True when documents have been added to Solr since the last commit
// (set in indexChunk, cleared in the commit path). Volatile: also read from finalize().
61 private volatile boolean uncommitedIngests =
false;
// Handle to the Solr server wrapper documents are added to.
62 private final Server solrServer = KeywordSearch.getServer();
// Stateless visitor mapping Sleuthkit items to their Solr field maps (see getContentFields).
63 private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR =
new SolrFieldsVisitor();
// Lazily-created singleton; see getDefault().
64 private static Ingester instance;
// Helper that adds language-specific fields and mini-chunks during text indexing.
65 private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
66 =
new LanguageSpecificContentIndexingHelper();
// Maximum number of characters of a chunk fed to language detection (see indexText).
67 private static final int LANGUAGE_DETECTION_STRING_SIZE = 4096;
/**
 * Returns the singleton Ingester, creating it on first use.
 * Synchronized so concurrent callers cannot create two instances.
 * NOTE(review): the visible fragment ends after the assignment; the
 * return statement lies outside this excerpt — confirm against the full file.
 */
72 public static synchronized Ingester getDefault() {
73 if (instance == null) {
74 instance =
new Ingester();
// Last-resort diagnostic: warn if documents were added to Solr but never
// committed. Finalizers are deprecated; this only logs, it does not commit.
81 @SuppressWarnings(
"FinalizeDeclaration")
82 protected
void finalize() throws Throwable {
86 if (uncommitedIngests) {
87 logger.warning(
"Ingester was used to add files that it never committed.");
101 void indexMetaDataOnly(AbstractFile file)
throws IngesterException {
102 indexChunk(
"",
"", file.getName().toLowerCase(),
new HashMap<>(getContentFields(file)));
115 void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName)
throws IngesterException {
116 indexChunk(
"",
"", sourceName,
new HashMap<>(getContentFields(artifact)));
127 private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
128 return item.accept(SOLR_FIELDS_VISITOR);
149 < T extends SleuthkitVisitableItem>
boolean indexText(Reader sourceReader,
long sourceID, String sourceName, T source, IngestJobContext context)
throws Ingester.IngesterException {
150 boolean doLanguageDetection =
true;
151 return indexText(sourceReader, sourceID, sourceName, source, context, doLanguageDetection);
173 < T extends SleuthkitVisitableItem>
boolean indexStrings(Reader sourceReader,
long sourceID, String sourceName, T source, IngestJobContext context)
throws Ingester.IngesterException {
175 boolean doLanguageDetection =
false;
176 return indexText(sourceReader, sourceID, sourceName, source, context, doLanguageDetection);
/**
 * Chunks the text from sourceReader and indexes each chunk together with the
 * source's metadata fields; after the loop a final, content-less document
 * recording the chunk count is indexed under the bare source id.
 *
 * NOTE(review): several lines of this method (the numChunks declaration and
 * increment, loop break/return statements, and some braces) fall outside this
 * excerpt — confirm against the full file before relying on control flow.
 */
198 private < T extends SleuthkitVisitableItem>
boolean indexText(Reader sourceReader,
long sourceID, String sourceName, T source, IngestJobContext context,
boolean doLanguageDetection)
throws Ingester.IngesterException {
// Metadata fields shared by every chunk document; wrapped unmodifiable so
// per-chunk copies below cannot corrupt them.
201 Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
202 Optional<Language> language = Optional.empty();
204 try (BufferedReader reader =
new BufferedReader(sourceReader)) {
205 Chunker chunker =
new Chunker(reader);
206 while (chunker.hasNext()) {
// Bail out of the loop when the ingest job is cancelled.
207 if (context != null && context.fileIngestIsCancelled()) {
208 logger.log(Level.INFO,
"File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
212 Chunk chunk = chunker.next();
// Per-chunk field map: shared metadata plus chunk-specific id and size.
213 Map<String, Object> fields =
new HashMap<>(contentFields);
214 String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
215 fields.put(Server.Schema.ID.toString(), chunkId);
216 fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
// Language detection runs at most once, on a prefix of the first chunk
// (capped at LANGUAGE_DETECTION_STRING_SIZE characters).
218 if (doLanguageDetection) {
219 int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
220 language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));
223 doLanguageDetection =
false;
// If a language was detected, add the language-specific Solr fields.
225 language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
// geLowerCasedChunk is the method name as declared in Chunker (sic).
228 indexChunk(chunk.toString(), chunk.geLowerCasedChunk(), sourceName, fields);
// Mini-chunks are only indexed for non-final chunks of language-tagged text.
230 if (chunker.hasNext() && language.isPresent()) {
231 languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName,
new HashMap<>(contentFields), chunkId, language.get());
234 }
catch (Ingester.IngesterException ingEx) {
235 logger.log(Level.WARNING,
"Ingester had a problem with extracted string from file '"
236 + sourceName +
"' (id: " + sourceID +
").", ingEx);
// A chunker-level read error is only logged, not rethrown here.
241 if (chunker.hasException()) {
242 logger.log(Level.WARNING,
"Error chunking content from " + sourceID +
": " + sourceName, chunker.getException());
245 }
catch (Exception ex) {
246 logger.log(Level.WARNING,
"Unexpected error, can't read content stream from " + sourceID +
": " + sourceName, ex);
249 if (context != null && context.fileIngestIsCancelled()) {
// Final summary document: carries NUM_CHUNKS and the plain source id,
// with no content and no CHUNK_SIZE field.
252 Map<String, Object> fields =
new HashMap<>(contentFields);
254 fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
256 fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
258 fields.remove(Server.Schema.CHUNK_SIZE.toString());
259 indexChunk(null, null, sourceName, fields);
/**
 * Builds a SolrInputDocument from the field map plus the chunk text and adds
 * it to the Solr server, marking uncommitedIngests on success.
 *
 * NOTE(review): some braces and the try-block opening around the Solr add
 * are outside this excerpt — confirm structure against the full file.
 *
 * @throws IngesterException when the mandatory IMAGE_ID field is missing or
 *                           when adding the document to Solr fails
 */
279 private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields)
throws IngesterException {
// Refuse to index documents without an image id; log and throw.
280 if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
285 String msg = NbBundle.getMessage(Ingester.class,
286 "Ingester.ingest.exception.unknownImgId.msg", sourceName);
287 logger.log(Level.SEVERE, msg);
288 throw new IngesterException(msg);
292 SolrInputDocument updateDoc =
new SolrInputDocument();
// String-valued fields are sanitized before being added; other values
// are added as-is.
293 for (String key : fields.keySet()) {
294 if (fields.get(key).getClass() == String.class) {
295 updateDoc.addField(key, Chunker.sanitize((String)fields.get(key)).toString());
297 updateDoc.addField(key, fields.get(key));
306 updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
// CONTENT_STR (the lower-cased text) is only present in schema 2.1+.
310 double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
311 if (indexSchemaVersion >= 2.1) {
312 updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ?
"" : lowerCasedChunk));
// Time the Solr add for the health monitor.
315 TimingMetric metric = HealthMonitor.getTimingMetric(
"Solr: Index chunk");
317 solrServer.addDocument(updateDoc);
318 HealthMonitor.submitTimingMetric(metric);
// Record that there is now uncommitted data in the index.
319 uncommitedIngests =
true;
321 }
catch (KeywordSearchModuleException | NoOpenCoreException ex) {
// Preserve the cause when translating to the module's checked exception.
323 throw new IngesterException(
324 NbBundle.getMessage(Ingester.class,
"Ingester.ingest.exception.err.msg", sourceName), ex);
// NOTE(review): this fragment belongs to a commit method whose header is
// outside this excerpt. A successful Solr commit clears the
// uncommitedIngests flag; commit failures are only logged, not rethrown.
335 uncommitedIngests =
false;
336 }
catch (NoOpenCoreException | SolrServerException ex) {
337 logger.log(Level.WARNING,
"Error commiting index", ex);
/**
 * Visitor that builds the Solr field map (id, image id, file name, ...)
 * for each kind of Sleuthkit item that can be indexed.
 */
345 static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
// Fallback for item types without a specific visit() override:
// contribute no fields.
348 protected Map<String, String>
defaultVisit(SleuthkitVisitableItem svi) {
349 return new HashMap<>();
// NOTE(review): the bodies of the following visit() overloads lie outside
// this excerpt; presumably they delegate to the getCommon*Fields helpers —
// TODO confirm against the full file.
353 public Map<String, String>
visit(File f) {
358 public Map<String, String>
visit(DerivedFile df) {
363 public Map<String, String>
visit(Directory d) {
368 public Map<String, String>
visit(LocalDirectory ld) {
373 public Map<String, String>
visit(LayoutFile lf) {
379 public Map<String, String>
visit(LocalFile lf) {
384 public Map<String, String>
visit(SlackFile f) {
// NOTE(review): the enclosing method header is outside this excerpt — this
// fragment builds the common Solr fields for an AbstractFile (presumably
// getCommonFields; TODO confirm).
415 Map<String, String> params =
new HashMap<>();
// Document id is the Sleuthkit object id of the file.
416 params.put(
Server.
Schema.ID.toString(), Long.toString(file.getId()));
418 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(file.getDataSource().getId()));
419 }
// If the data source lookup fails, log and fall back to -1 so the
// document can still be indexed.
catch (TskCoreException ex) {
420 logger.log(Level.SEVERE,
"Could not get data source id to properly index the file " + file.getId(), ex);
421 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
// Lower-cased name supports case-insensitive file-name matching.
423 params.put(
Server.
Schema.FILE_NAME.toString(), file.getName().toLowerCase());
/**
 * Solr fields for a blackboard artifact: document id is the artifact id;
 * image id is the artifact's data source id, or -1 when it cannot be
 * determined.
 * NOTE(review): the try-block opening and the return statement are outside
 * this excerpt.
 */
435 public Map<String, String>
visit(BlackboardArtifact artifact) {
436 Map<String, String> params =
new HashMap<>();
437 params.put(
Server.
Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
439 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
440 }
// Fall back to -1 so the artifact is still indexed on lookup failure.
catch (TskCoreException ex) {
441 logger.log(Level.SEVERE,
"Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex);
442 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
/**
 * Solr fields for a report: document id is the report's object id; image id
 * is the report's data source id, or -1 when the report has no data source
 * or the lookup fails.
 * NOTE(review): the try-block opening, the else keyword, and the return
 * statement are outside this excerpt.
 */
455 public Map<String, String>
visit(Report report) {
456 Map<String, String> params =
new HashMap<>();
457 params.put(
Server.
Schema.ID.toString(), Long.toString(report.getId()));
// A report may have no associated data source; use -1 in that case.
459 Content dataSource = report.getDataSource();
460 if (null == dataSource) {
461 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
463 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
465 }
// On lookup failure, log and index with the -1 sentinel image id.
catch (TskCoreException ex) {
466 logger.log(Level.SEVERE,
"Could not get data source id to properly index the report, using default value. Id: " + report.getId(), ex);
467 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
477 static class IngesterException
extends Exception {
479 private static final long serialVersionUID = 1L;
481 IngesterException(String message, Throwable ex) {
485 IngesterException(String message) {
Map< String, String > visit(Report report)
Map< String, String > visit(LayoutFile lf)
Map< String, String > visit(File f)
Map< String, String > visit(LocalDirectory ld)
Map< String, String > getCommonAndMACTimeFields(AbstractFile file)
Map< String, String > visit(SlackFile f)
Map< String, String > visit(Directory d)
Map< String, String > getCommonFields(AbstractFile file)
Map< String, String > visit(DerivedFile df)
Map< String, String > visit(BlackboardArtifact artifact)
static String getFormattedTimeISO8601(long epochTime)
Map< String, String > visit(LocalFile lf)
Map< String, String > defaultVisit(SleuthkitVisitableItem svi)