19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.BufferedReader;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.util.ArrayList;
27 import java.util.Collections;
28 import java.util.HashMap;
29 import java.util.List;
31 import java.util.Optional;
32 import java.util.logging.Level;
33 import org.apache.commons.lang3.math.NumberUtils;
34 import org.apache.solr.client.solrj.SolrServerException;
35 import org.apache.solr.common.SolrInputDocument;
36 import org.openide.util.NbBundle;
37 import org.openide.util.io.ReaderInputStream;
// NOTE(review): this view of the file is a lossy extraction — the leading
// integers on lines (e.g. "65") are original-file line numbers fused into
// the text, and many original lines (braces, returns, comments) are missing.
// Fields of the Ingester singleton (enclosing class header not visible here).
65 private static final Logger logger = Logger.getLogger(Ingester.class.getName());
// True while documents have been added to Solr but not yet committed;
// volatile because it is read in finalize() on another thread.
66 private volatile boolean uncommitedIngests =
false;
// Shared Solr server handle obtained from the KeywordSearch module.
67 private final Server solrServer = KeywordSearch.getServer();
// Stateless visitor that maps Sleuthkit items to their Solr field maps.
68 private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR =
new SolrFieldsVisitor();
// Lazily-created singleton instance — see getDefault().
69 private static Ingester instance;
70 private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
71 =
new LanguageSpecificContentIndexingHelper();
// Max number of characters of a chunk fed to language detection.
72 private static final int LANGUAGE_DETECTION_STRING_SIZE = 4096;
77 public static synchronized Ingester getDefault() {
78 if (instance == null) {
79 instance =
new Ingester();
86 @SuppressWarnings(
"FinalizeDeclaration")
87 protected
void finalize() throws Throwable {
91 if (uncommitedIngests) {
92 logger.warning(
"Ingester was used to add files that it never committed.");
106 void indexMetaDataOnly(AbstractFile file)
throws IngesterException {
107 indexChunk(
"",
"", file.getName().toLowerCase(),
new HashMap<>(getContentFields(file)));
120 void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName)
throws IngesterException {
121 indexChunk(
"",
"", sourceName,
new HashMap<>(getContentFields(artifact)));
132 private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
133 return item.accept(SOLR_FIELDS_VISITOR);
// Chunks the source's text, runs inline keyword search on each chunk, and —
// when a chunk (or its neighborhood) has a hit — indexes those chunks into
// Solr, finishing with a chunk-count metadata document for the source.
// NOTE(review): this view is a lossy extraction — braces, else-branches and
// several statements (e.g. the declaration/increment of numChunks) are
// missing, so the control flow below cannot be fully verified from here.
209 < T extends SleuthkitVisitableItem>
void search(Reader sourceReader,
long sourceID, String sourceName, T source, IngestJobContext context,
boolean doLanguageDetection,
boolean indexIntoSolr, List<String> keywordListNames)
throws Ingester.IngesterException, IOException, TskCoreException, Exception {
// Immutable snapshot of the item's Solr metadata fields.
211 Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
212 Optional<Language> language = Optional.empty();
213 InlineSearcher searcher =
new InlineSearcher(keywordListNames, context);
// Chunks buffered until we know whether their neighborhood contains a hit.
214 List<Chunk> activeChunkList =
new ArrayList<>();
215 boolean fileIndexed =
false;
218 try (BufferedReader reader =
new BufferedReader(sourceReader)) {
219 Chunker chunker =
new Chunker(reader);
220 String name = sourceName;
// Artifacts don't get their name searched; files/reports do (chunk id 0).
221 if(!(source instanceof BlackboardArtifact)) {
222 searcher.searchString(name, sourceID, 0);
225 while (chunker.hasNext()) {
// Bail out promptly when the ingest job is cancelled.
226 if ( context.fileIngestIsCancelled()) {
227 logger.log(Level.INFO,
"File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
231 Chunk chunk = chunker.next();
// Chunk ids are 1-based; numChunks is presumably a counter declared on a
// missing line — TODO confirm against the original file.
232 chunk.setChunkId(numChunks+1);
// Detect the language once, from a prefix of the first chunk only.
234 if (doLanguageDetection) {
235 int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
236 language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));
239 doLanguageDetection =
false;
242 if(keywordListNames != null) {
243 boolean hitFoundInChunk = searcher.searchChunk(chunk, sourceID, numChunks);
245 if(!hitFoundInChunk) {
246 if(!activeChunkList.isEmpty() ) {
// If the previous buffered chunk had a hit, keep this no-hit chunk as
// trailing context, flush the buffered run to Solr, then reset the buffer.
247 if(activeChunkList.get(activeChunkList.size() - 1).hasHit()) {
248 activeChunkList.add(chunk);
250 for(Chunk c: activeChunkList) {
251 indexChunk(c, sourceID, sourceName, language, contentFields, chunker.hasNext());
253 activeChunkList.clear();
255 activeChunkList.clear();
256 activeChunkList.add(chunk);
259 activeChunkList.add(chunk);
// Hit found: mark the chunk and buffer it for indexing.
263 chunk.setHasHit(
true);
264 activeChunkList.add(chunk);
// No keyword lists: index every chunk unconditionally.
267 indexChunk(chunk, sourceID, sourceName, language, contentFields, chunker.hasNext());
// Flush any leftover buffered chunks that are (or neighbor) a hit.
276 if(activeChunkList.size() > 1 || (activeChunkList.size() == 1 && activeChunkList.get(0).hasHit())) {
277 for(Chunk c: activeChunkList) {
278 indexChunk(c, sourceID, sourceName, language, contentFields,
true);
// Surface any exception the chunker swallowed while reading.
283 if (chunker.hasException()) {
284 logger.log(Level.WARNING,
"Error chunking content from " + sourceID +
": " + sourceName, chunker.getException());
285 throw chunker.getException();
289 if (context.fileIngestIsCancelled()) {
// Final per-source document: records the total chunk count under the
// source's own id (no CHUNK_SIZE on this summary document).
294 Map<String, Object> fields =
new HashMap<>(contentFields);
296 fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
298 fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
300 fields.remove(Server.Schema.CHUNK_SIZE.toString());
301 indexChunk(null, null, sourceName, fields);
// Chunks the source's text and indexes every chunk into Solr, then writes a
// final metadata document recording the chunk count. Unlike search(), no
// keyword matching is done here — everything is indexed.
// NOTE(review): lossy extraction — braces, the numChunks declaration and the
// return statement(s) are on missing lines; flow below is partial.
306 < T extends SleuthkitVisitableItem>
boolean indexFile(Reader sourceReader,
long sourceID, String sourceName, T source, IngestJobContext context,
boolean doLanguageDetection)
throws Ingester.IngesterException {
308 Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
309 Optional<Language> language = Optional.empty();
311 try (BufferedReader reader =
new BufferedReader(sourceReader)) {
312 Chunker chunker =
new Chunker(reader);
313 while (chunker.hasNext()) {
// Stop indexing promptly on ingest cancellation.
314 if ( context.fileIngestIsCancelled()) {
315 logger.log(Level.INFO,
"File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
319 Chunk chunk = chunker.next();
// Detect language once, from a prefix of the first chunk only.
321 if (doLanguageDetection) {
322 int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
323 language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));
326 doLanguageDetection =
false;
// Per-chunk Solr document: metadata + chunk id + chunk size.
329 Map<String, Object> fields =
new HashMap<>(contentFields);
330 String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
331 fields.put(Server.Schema.ID.toString(), chunkId);
332 fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
// Add language-specific fields when a language was detected.
334 language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
337 indexChunk(chunk.toString(), chunk.getLowerCasedChunk(), sourceName, fields);
// Mini-chunks are only indexed for non-final chunks of language-tagged text.
339 if (chunker.hasNext() && language.isPresent()) {
340 languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName,
new HashMap<>(contentFields), chunkId, language.get());
344 }
catch (Ingester.IngesterException ingEx) {
345 logger.log(Level.WARNING,
"Ingester had a problem with extracted string from file '"
346 + sourceName +
"' (id: " + sourceID +
").", ingEx);
// Chunker errors are logged here; whether they rethrow is on missing lines.
351 if (chunker.hasException()) {
352 logger.log(Level.WARNING,
"Error chunking content from " + sourceID +
": " + sourceName, chunker.getException());
356 }
catch (Exception ex) {
357 logger.log(Level.WARNING,
"Unexpected error, can't read content stream from " + sourceID +
": " + sourceName, ex);
360 if (context.fileIngestIsCancelled()) {
// Final per-source summary document: chunk count under the source id,
// with the per-chunk CHUNK_SIZE field removed.
363 Map<String, Object> fields =
new HashMap<>(contentFields);
365 fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
367 fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
369 fields.remove(Server.Schema.CHUNK_SIZE.toString());
370 indexChunk(null, null, sourceName, fields);
// Builds the Solr document for one Chunk (metadata + id + size + optional
// language fields) and delegates to indexChunk(String, ...); also indexes a
// language-specific mini-chunk for non-final chunks.
// NOTE(review): lossy extraction — the try block opening and closing braces
// are on missing lines.
378 private void indexChunk(Chunk chunk,
long sourceID, String sourceName, Optional<Language> language, Map<String, String> contentFields,
boolean hasNext)
throws IngesterException {
379 Map<String, Object> fields =
new HashMap<>(contentFields);
// Composite id: source id + 1-based chunk number.
380 String chunkId = Server.getChunkIdString(sourceID, chunk.getChunkId());
381 fields.put(Server.Schema.ID.toString(), chunkId);
382 fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
385 language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
388 indexChunk(chunk.toString(), chunk.getLowerCasedChunk(), sourceName, fields);
// Mini-chunks only for non-final chunks with a detected language.
390 if (hasNext && language.isPresent()) {
391 languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName,
new HashMap<>(contentFields), chunkId, language.get());
394 }
catch (Ingester.IngesterException ingEx) {
// Indexing failures for a single chunk are logged, not propagated here.
395 logger.log(Level.WARNING,
"Ingester had a problem with extracted string from file '"
396 + sourceName +
"' (id: " + sourceID +
").", ingEx);
// Low-level indexer: turns a field map plus (optionally null) chunk text
// into a SolrInputDocument and sends it to the server, timing the call.
// NOTE(review): lossy extraction — the else-branch brace of the field loop,
// the try opening, and the schema-version else are on missing lines.
416 private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields)
throws IngesterException {
// A missing IMAGE_ID means getContentFields() was never applied — refuse
// to index rather than write an orphaned document.
417 if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
422 String msg = NbBundle.getMessage(Ingester.class,
423 "Ingester.ingest.exception.unknownImgId.msg", sourceName);
424 logger.log(Level.SEVERE, msg);
425 throw new IngesterException(msg);
429 SolrInputDocument updateDoc =
new SolrInputDocument();
// String-valued fields are sanitized before being added; other values
// (presumably already safe) are added verbatim.
430 for (String key : fields.keySet()) {
431 if (fields.get(key).getClass() == String.class) {
432 updateDoc.addField(key, Chunker.sanitize((String)fields.get(key)).toString());
434 updateDoc.addField(key, fields.get(key));
443 updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
// Schema 2.1+ also stores a lower-cased copy for case-insensitive search.
447 double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
448 if (indexSchemaVersion >= 2.1) {
449 updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ?
"" : lowerCasedChunk));
// Health-monitor timing around the actual Solr round-trip.
452 TimingMetric metric = HealthMonitor.getTimingMetric(
"Solr: Index chunk");
454 solrServer.addDocument(updateDoc);
455 HealthMonitor.submitTimingMetric(metric);
// Remember that the index now has uncommitted documents.
456 uncommitedIngests =
true;
458 }
catch (KeywordSearchModuleException | NoOpenCoreException ex) {
// Wrap module/core failures in the caller-facing IngesterException.
460 throw new IngesterException(
461 NbBundle.getMessage(Ingester.class,
"Ingester.ingest.exception.err.msg", sourceName), ex);
// Interior of commit() — the method header (original lines ~465-471, where
// solrServer.commit() is presumably called) is on missing lines.
// On success the uncommitted flag is cleared; commit failures are logged
// but not rethrown.
472 uncommitedIngests =
false;
473 }
catch (NoOpenCoreException | SolrServerException ex) {
474 logger.log(Level.WARNING,
"Error commiting index", ex);
// Visitor that produces the Solr metadata field map for each kind of
// Sleuthkit item. Unknown item types get an empty map (defaultVisit).
// NOTE(review): lossy extraction — the bodies of the file-type visit
// overloads (original lines ~490-545, presumably delegating to the common
// getCommonFields/getCommonAndMACTimeFields helpers) are on missing lines.
482 static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
// Fallback for item types with no specific handling: no fields.
485 protected Map<String, String>
defaultVisit(SleuthkitVisitableItem svi) {
486 return new HashMap<>();
490 public Map<String, String>
visit(File f) {
495 public Map<String, String>
visit(DerivedFile df) {
500 public Map<String, String>
visit(Directory d) {
505 public Map<String, String>
visit(LocalDirectory ld) {
510 public Map<String, String>
visit(LayoutFile lf) {
516 public Map<String, String>
visit(LocalFile lf) {
521 public Map<String, String>
visit(SlackFile f) {
// Common per-file fields: object id, data-source (image) id, file name.
// A failed data-source lookup indexes IMAGE_ID = -1 instead of aborting.
552 Map<String, String> params =
new HashMap<>();
553 params.put(
Server.
Schema.ID.toString(), Long.toString(file.getId()));
555 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(file.getDataSource().getId()));
556 }
catch (TskCoreException ex) {
557 logger.log(Level.SEVERE,
"Could not get data source id to properly index the file " + file.getId(), ex);
558 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
560 params.put(
Server.
Schema.FILE_NAME.toString(), file.getName().toLowerCase());
// Artifact fields: artifact id + data-source id (-1 on lookup failure).
572 public Map<String, String>
visit(BlackboardArtifact artifact) {
573 Map<String, String> params =
new HashMap<>();
574 params.put(
Server.
Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
576 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
577 }
catch (TskCoreException ex) {
578 logger.log(Level.SEVERE,
"Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex);
579 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
// Report fields: report id + data-source id; reports may legitimately
// have no data source, in which case IMAGE_ID is -1.
592 public Map<String, String>
visit(Report report) {
593 Map<String, String> params =
new HashMap<>();
594 params.put(
Server.
Schema.ID.toString(), Long.toString(report.getId()));
596 Content dataSource = report.getDataSource();
597 if (null == dataSource) {
598 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
600 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
602 }
catch (TskCoreException ex) {
603 logger.log(Level.SEVERE,
"Could not get data source id to properly index the report, using default value. Id: " + report.getId(), ex);
604 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
614 static class IngesterException
extends Exception {
616 private static final long serialVersionUID = 1L;
618 IngesterException(String message, Throwable ex) {
622 IngesterException(String message) {
Map< String, String > visit(Report report)
Map< String, String > visit(LayoutFile lf)
Map< String, String > visit(File f)
Map< String, String > visit(LocalDirectory ld)
Map< String, String > getCommonAndMACTimeFields(AbstractFile file)
Map< String, String > visit(SlackFile f)
Map< String, String > visit(Directory d)
Map< String, String > getCommonFields(AbstractFile file)
Map< String, String > visit(DerivedFile df)
Map< String, String > visit(BlackboardArtifact artifact)
static String getFormattedTimeISO8601(long epochTime)
Map< String, String > visit(LocalFile lf)
Map< String, String > defaultVisit(SleuthkitVisitableItem svi)