19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import java.io.BufferedReader;
 
   22 import java.io.Reader;
 
   23 import java.util.Collections;
 
   24 import java.util.HashMap;
 
   26 import java.util.Optional;
 
   27 import java.util.logging.Level;
 
   28 import org.apache.commons.lang3.math.NumberUtils;
 
   29 import org.apache.solr.client.solrj.SolrServerException;
 
   30 import org.apache.solr.common.SolrInputDocument;
 
   31 import org.openide.util.NbBundle;
 
   // Logger for this class, named after Ingester.
   60     private static final Logger logger = Logger.getLogger(Ingester.class.getName());
 
   // Set to true by indexChunk after a document has been added to Solr and
   // checked by finalize() to warn about documents that were never committed.
   61     private volatile boolean uncommitedIngests = 
false;
 
   // Server handle obtained from KeywordSearch; used to add documents and to
   // read the index schema version.
   62     private final Server solrServer = KeywordSearch.getServer();
 
   // Single shared visitor that getContentFields passes to item.accept(...).
   63     private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = 
new SolrFieldsVisitor();
 
   // Shared instance, created on first use by getDefault().
   64     private static Ingester instance;
 
   // Helper used by indexText for language detection, language-specific
   // fields and "mini chunk" indexing.
   65     private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
 
   66         = 
new LanguageSpecificContentIndexingHelper();
 
   // Upper bound on the number of leading characters of a chunk that indexText
   // passes to language detection.
   67     private static final int LANGUAGE_DETECTION_STRING_SIZE = 4096;
 
   /**
    * Accessor for the shared Ingester. The method is synchronized and creates
    * the instance the first time it is called while the static field is null.
    *
    * NOTE(review): the return statement is not visible in this listing;
    * presumably it returns the static instance - confirm against the full file.
    */
   72     public static synchronized Ingester getDefault() {
 
   73         if (instance == null) {
 
   74             instance = 
new Ingester();
 
   /**
    * Finalizer that logs a warning when documents were added through this
    * Ingester (uncommitedIngests is true) but the flag was never cleared.
    * The "FinalizeDeclaration" warning is suppressed on purpose.
    *
    * NOTE(review): lines between the signature and the check are not visible
    * in this listing.
    *
    * @throws Throwable as declared by the signature
    */
   81     @SuppressWarnings(
"FinalizeDeclaration")
 
   82     protected 
void finalize() throws Throwable {
 
   86         if (uncommitedIngests) {
 
   87             logger.warning(
"Ingester was used to add files that it never committed."); 
 
  /**
   * Indexes only the metadata fields of a file, with no text content.
   *
   * Calls indexChunk with empty strings for both the chunk and its lower-cased
   * form, the lower-cased file name as the source name, and a fresh HashMap
   * copy of the fields returned by getContentFields(file).
   *
   * @param file the file whose metadata fields are indexed
   *
   * @throws IngesterException propagated from indexChunk
   */
  101     void indexMetaDataOnly(AbstractFile file) 
throws IngesterException {
 
  102         indexChunk(
"", 
"", file.getName().toLowerCase(), 
new HashMap<>(getContentFields(file)));
 
  /**
   * Indexes only the metadata fields of an artifact, with no text content.
   *
   * Calls indexChunk with empty strings for both the chunk and its lower-cased
   * form, the given source name (passed through unchanged), and a fresh
   * HashMap copy of the fields returned by getContentFields(artifact).
   *
   * @param artifact   the artifact whose metadata fields are indexed
   * @param sourceName name handed to indexChunk as the source name
   *
   * @throws IngesterException propagated from indexChunk
   */
  115     void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) 
throws IngesterException {
 
  116         indexChunk(
"", 
"", sourceName, 
new HashMap<>(getContentFields(artifact)));
 
  /**
   * Builds the Solr field map for an item by dispatching it to the shared
   * SOLR_FIELDS_VISITOR.
   *
   * @param item the item to describe
   *
   * @return whatever map the visitor returns for the item's concrete type
   */
  127     private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
 
  128         return item.accept(SOLR_FIELDS_VISITOR);
 
  /**
   * Indexes the text from the reader with language detection enabled.
   *
   * Thin wrapper: forwards every argument to the six-argument indexText with
   * doLanguageDetection set to true and returns its result.
   *
   * @param <T>          type of the item being indexed
   * @param sourceReader reader over the text to index
   * @param sourceID     id of the source, forwarded unchanged
   * @param sourceName   name of the source, forwarded unchanged
   * @param source       the item whose content fields are attached
   * @param context      ingest job context, forwarded unchanged
   *
   * @return the value returned by the six-argument indexText
   *
   * @throws Ingester.IngesterException propagated from the delegate
   */
  149     < T extends SleuthkitVisitableItem> 
boolean indexText(Reader sourceReader, 
long sourceID, String sourceName, T source, IngestJobContext context) 
throws Ingester.IngesterException {
 
  150         boolean doLanguageDetection = 
true;
 
  151         return indexText(sourceReader, sourceID, sourceName, source, context, doLanguageDetection);
 
  /**
   * Indexes the text from the reader with language detection disabled.
   *
   * Thin wrapper: forwards every argument to the six-argument indexText with
   * doLanguageDetection set to false and returns its result.
   *
   * @param <T>          type of the item being indexed
   * @param sourceReader reader over the text to index
   * @param sourceID     id of the source, forwarded unchanged
   * @param sourceName   name of the source, forwarded unchanged
   * @param source       the item whose content fields are attached
   * @param context      ingest job context, forwarded unchanged
   *
   * @return the value returned by the six-argument indexText
   *
   * @throws Ingester.IngesterException propagated from the delegate
   */
  173     < T extends SleuthkitVisitableItem> 
boolean indexStrings(Reader sourceReader, 
long sourceID, String sourceName, T source, IngestJobContext context) 
throws Ingester.IngesterException {
 
  175         boolean doLanguageDetection = 
false;
 
  176         return indexText(sourceReader, sourceID, sourceName, source, context, doLanguageDetection);
 
  /**
   * Splits the text from sourceReader into chunks with a Chunker and passes
   * each chunk to indexChunk; afterwards builds one more document keyed by
   * the source id itself.
   *
   * For every chunk the field map is a copy of the source's content fields
   * plus the chunk id (Server.getChunkIdString(sourceID, numChunks + 1)) and
   * the chunk's base length. When doLanguageDetection is true, at most
   * LANGUAGE_DETECTION_STRING_SIZE leading characters of the chunk are passed
   * to the language helper and the flag is cleared in the same branch.
   *
   * NOTE(review): this listing omits several original lines (among them the
   * declaration and update of numChunks, the bodies of the cancellation
   * branches and the return statements), so the return value and the exact
   * branch structure at the end of the method are not documented here.
   *
   * @param <T>                 type of the item being indexed
   * @param sourceReader        reader over the text; wrapped in a
   *                            BufferedReader opened in try-with-resources
   * @param sourceID            id used to build chunk ids and the final
   *                            document id
   * @param sourceName          name used in log messages and passed to
   *                            indexChunk
   * @param source              item whose content fields are attached to
   *                            every document
   * @param context             ingest job context; may be null (it is
   *                            null-checked before the cancellation test)
   * @param doLanguageDetection whether to attempt language detection
   *
   * @throws Ingester.IngesterException declared by the signature; an
   *                                    IngesterException raised while
   *                                    indexing a chunk is caught and logged
   *                                    in the visible code
   */
  198     private < T extends SleuthkitVisitableItem> 
boolean indexText(Reader sourceReader, 
long sourceID, String sourceName, T source, IngestJobContext context, 
boolean doLanguageDetection) 
throws Ingester.IngesterException {
 
  // Read-only view of the source's fields; each document gets its own copy.
  201         Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
 
  202         Optional<Language> language = Optional.empty();
 
  204         try (BufferedReader reader = 
new BufferedReader(sourceReader)) {
 
  205             Chunker chunker = 
new Chunker(reader);
 
  206             while (chunker.hasNext()) {
 
  // Check for cancellation before each chunk is processed.
  207                 if (context != null && context.fileIngestIsCancelled()) {
 
  208                     logger.log(Level.INFO, 
"File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
 
  212                 Chunk chunk = chunker.next();
 
  213                 Map<String, Object> fields = 
new HashMap<>(contentFields);
 
  214                 String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
 
  215                 fields.put(Server.Schema.ID.toString(), chunkId);
 
  216                 fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
 
  218                 if (doLanguageDetection) {
 
  // Only the leading part of the chunk is handed to language detection.
  219                     int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
 
  220                     language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));
 
  223                     doLanguageDetection = 
false;
 
  // When a language is known, let the helper update the fields for this chunk.
  225                 language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
 
  228                     indexChunk(chunk.toString(), chunk.geLowerCasedChunk(), sourceName, fields);
 
  // A mini chunk is indexed only when another chunk follows and a language is known.
  230                     if (chunker.hasNext() && language.isPresent()) {
 
  231                         languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, 
new HashMap<>(contentFields), chunkId, language.get());
 
  234                 } 
catch (Ingester.IngesterException ingEx) {
 
  235                     logger.log(Level.WARNING, 
"Ingester had a problem with extracted string from file '"  
  236                             + sourceName + 
"' (id: " + sourceID + 
").", ingEx);
 
  // The chunker records read errors instead of throwing; report them here.
  241             if (chunker.hasException()) {
 
  242                 logger.log(Level.WARNING, 
"Error chunking content from " + sourceID + 
": " + sourceName, chunker.getException());
 
  245         } 
catch (Exception ex) {
 
  246             logger.log(Level.WARNING, 
"Unexpected error, can't read content stream from " + sourceID + 
": " + sourceName, ex);
 
  249             if (context != null && context.fileIngestIsCancelled()) {
 
  // Document for the source itself: carries the chunk count and the plain
  // source id, has no CHUNK_SIZE field and no content.
  252                 Map<String, Object> fields = 
new HashMap<>(contentFields);
 
  254                 fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
 
  256                 fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
 
  258                 fields.remove(Server.Schema.CHUNK_SIZE.toString());
 
  259                 indexChunk(null, null, sourceName, fields);
 
  /**
   * Builds a SolrInputDocument from the given fields and chunk text and adds
   * it to the Solr server.
   *
   * The call is rejected when the field map has no IMAGE_ID entry. String
   * field values are passed through Chunker.sanitize before being added; the
   * chunk is stored in the CONTENT field and, when the index schema version
   * is at least 2.1, the lower-cased chunk (or "" when chunk is null) is
   * stored in CONTENT_STR. The addDocument call is timed with a HealthMonitor
   * metric, and uncommitedIngests is set to true afterwards.
   *
   * NOTE(review): some original lines (else branches, the opening of the try
   * block, closing braces) are not visible in this listing.
   *
   * @param chunk           text of the chunk; callers in this file also pass
   *                        null or ""
   * @param lowerCasedChunk lower-cased form of the chunk
   * @param sourceName      name of the source, used in error messages
   * @param fields          field names and values to add to the document
   *
   * @throws IngesterException if IMAGE_ID is missing, or wrapping a
   *                           KeywordSearchModuleException or
   *                           NoOpenCoreException
   */
  279     private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields) 
throws IngesterException {
 
  280         if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
 
  285             String msg = NbBundle.getMessage(Ingester.class,
 
  286                     "Ingester.ingest.exception.unknownImgId.msg", sourceName); 
 
  287             logger.log(Level.SEVERE, msg);
 
  288             throw new IngesterException(msg);
 
  292         SolrInputDocument updateDoc = 
new SolrInputDocument();
 
  // NOTE(review): a null field value would throw NullPointerException at
  // getClass() below - confirm that callers never store null values.
  293         for (String key : fields.keySet()) {
 
  294             if (fields.get(key).getClass() == String.class) {
 
  295                 updateDoc.addField(key, Chunker.sanitize((String)fields.get(key)).toString());
 
  297                 updateDoc.addField(key, fields.get(key));
 
  306             updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
 
  // CONTENT_STR is only written for index schema version 2.1 and later.
  310             double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
 
  311             if (indexSchemaVersion >= 2.1) {
 
  312                 updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? 
"" : lowerCasedChunk));
 
  // Time the Solr add call for health monitoring.
  315             TimingMetric metric = HealthMonitor.getTimingMetric(
"Solr: Index chunk");
 
  317             solrServer.addDocument(updateDoc);
 
  318             HealthMonitor.submitTimingMetric(metric);
 
  319             uncommitedIngests = 
true;
 
  321         } 
catch (KeywordSearchModuleException | NoOpenCoreException ex) {
 
  323             throw new IngesterException(
 
  324                     NbBundle.getMessage(Ingester.class, 
"Ingester.ingest.exception.err.msg", sourceName), ex);
 
  // NOTE(review): fragment of a method whose signature and preceding statements
  // are not visible in this listing (judging by the log message it is the
  // index commit - confirm). The visible part clears the uncommitted flag and
  // logs, rather than rethrows, NoOpenCoreException and SolrServerException.
  335             uncommitedIngests = 
false;
 
  336         } 
catch (NoOpenCoreException | SolrServerException ex) {
 
  337             logger.log(Level.WARNING, 
"Error commiting index", ex); 
 
  /**
   * Visitor that produces a map of Solr field names to string values for the
   * item it visits. Ingester.getContentFields uses a single shared instance.
   *
   * NOTE(review): this listing omits many original lines of the class (the
   * bodies of most visit overloads, helper method signatures, closing
   * braces); only the visible statements are described below.
   */
  345     static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
 
  // Fallback for item types without a dedicated overload: an empty, mutable map.
  348         protected Map<String, String> 
defaultVisit(SleuthkitVisitableItem svi) {
 
  349             return new HashMap<>();
 
  // The bodies of the following visit overloads are not visible in this listing.
  353         public Map<String, String> 
visit(File f) {
 
  358         public Map<String, String> 
visit(DerivedFile df) {
 
  363         public Map<String, String> 
visit(Directory d) {
 
  368         public Map<String, String> 
visit(LocalDirectory ld) {
 
  373         public Map<String, String> 
visit(LayoutFile lf) {
 
  379         public Map<String, String> 
visit(LocalFile lf) {
 
  384         public Map<String, String> 
visit(SlackFile f) {
 
  // NOTE(review): the signature of the helper containing the next statements
  // is not visible (presumably getCommonFields(AbstractFile) - confirm). It
  // records the file id, the data source id (or -1 when the lookup throws
  // TskCoreException) and the lower-cased file name.
  415             Map<String, String> params = 
new HashMap<>();
 
  416             params.put(
Server.
Schema.ID.toString(), Long.toString(file.getId()));
 
  418                 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(file.getDataSource().getId()));
 
  419             } 
catch (TskCoreException ex) {
 
  420                 logger.log(Level.SEVERE, 
"Could not get data source id to properly index the file " + file.getId(), ex); 
 
  421                 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
 
  423             params.put(
Server.
Schema.FILE_NAME.toString(), file.getName().toLowerCase());
 
  // Artifact fields: the artifact id, and the data source id (or -1 when the
  // lookup throws TskCoreException, which is logged at SEVERE).
  435         public Map<String, String> 
visit(BlackboardArtifact artifact) {
 
  436             Map<String, String> params = 
new HashMap<>();
 
  437             params.put(
Server.
Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
 
  439                 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
 
  440             } 
catch (TskCoreException ex) {
 
  441                 logger.log(Level.SEVERE, 
"Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); 
 
  442                 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
 
  // Report fields: the report id, and the data source id. The id falls back
  // to -1 both when the report has no data source (null) and when the lookup
  // throws TskCoreException.
  455         public Map<String, String> 
visit(Report report) {
 
  456             Map<String, String> params = 
new HashMap<>();
 
  457             params.put(
Server.
Schema.ID.toString(), Long.toString(report.getId()));
 
  459                 Content dataSource = report.getDataSource();
 
  460                 if (null == dataSource) {
 
  461                     params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
 
  463                     params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
 
  465             } 
catch (TskCoreException ex) {
 
  466                 logger.log(Level.SEVERE, 
"Could not get data source id to properly index the report, using default value. Id: " + report.getId(), ex); 
 
  467                 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
 
  /**
   * Checked exception declared by the Ingester's indexing methods. indexChunk
   * throws it when a document has no IMAGE_ID field and uses it to wrap
   * KeywordSearchModuleException and NoOpenCoreException.
   *
   * NOTE(review): the constructor bodies are not visible in this listing.
   */
  477     static class IngesterException 
extends Exception {
 
  479         private static final long serialVersionUID = 1L;
 
  // Constructor taking a message and the underlying cause.
  481         IngesterException(String message, Throwable ex) {
 
  // Constructor taking a message only.
  485         IngesterException(String message) {
 
Map< String, String > visit(Report report)
 
Map< String, String > visit(LayoutFile lf)
 
Map< String, String > visit(File f)
 
Map< String, String > visit(LocalDirectory ld)
 
Map< String, String > getCommonAndMACTimeFields(AbstractFile file)
 
Map< String, String > visit(SlackFile f)
 
Map< String, String > visit(Directory d)
 
Map< String, String > getCommonFields(AbstractFile file)
 
Map< String, String > visit(DerivedFile df)
 
Map< String, String > visit(BlackboardArtifact artifact)
 
static String getFormattedTimeISO8601(long epochTime)
 
Map< String, String > visit(LocalFile lf)
 
Map< String, String > defaultVisit(SleuthkitVisitableItem svi)