Autopsy  4.20.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
Ingester.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2021 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.BufferedReader;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.util.ArrayList;
27 import java.util.Collections;
28 import java.util.HashMap;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.Optional;
32 import java.util.logging.Level;
33 import org.apache.commons.lang3.math.NumberUtils;
34 import org.apache.solr.client.solrj.SolrServerException;
35 import org.apache.solr.common.SolrInputDocument;
36 import org.openide.util.NbBundle;
37 import org.openide.util.io.ReaderInputStream;
44 import org.sleuthkit.datamodel.AbstractFile;
45 import org.sleuthkit.datamodel.BlackboardArtifact;
46 import org.sleuthkit.datamodel.Content;
47 import org.sleuthkit.datamodel.DerivedFile;
48 import org.sleuthkit.datamodel.Directory;
49 import org.sleuthkit.datamodel.File;
50 import org.sleuthkit.datamodel.LayoutFile;
51 import org.sleuthkit.datamodel.LocalDirectory;
52 import org.sleuthkit.datamodel.LocalFile;
53 import org.sleuthkit.datamodel.Report;
54 import org.sleuthkit.datamodel.SlackFile;
55 import org.sleuthkit.datamodel.SleuthkitItemVisitor;
56 import org.sleuthkit.datamodel.SleuthkitVisitableItem;
57 import org.sleuthkit.datamodel.TskCoreException;
58 
62 //JMTODO: Should this class really be a singleton?
63 class Ingester {
64 
    private static final Logger logger = Logger.getLogger(Ingester.class.getName());
    // Set to true whenever a document is added to Solr and cleared by commit();
    // finalize() uses it to warn about work that was never committed.
    // (Historical spelling "uncommited" kept — renaming would touch other members.)
    private volatile boolean uncommitedIngests = false;
    // Shared Solr server handle for this process.
    private final Server solrServer = KeywordSearch.getServer();
    // Stateless visitor that maps Sleuth Kit items to their Solr metadata fields.
    private static final SolrFieldsVisitor SOLR_FIELDS_VISITOR = new SolrFieldsVisitor();
    // Lazily-created singleton instance; see getDefault().
    private static Ingester instance;
    private final LanguageSpecificContentIndexingHelper languageSpecificContentIndexingHelper
            = new LanguageSpecificContentIndexingHelper();
    // Maximum number of characters from the first chunk that are fed to
    // language detection.
    private static final int LANGUAGE_DETECTION_STRING_SIZE = 4096;
73 
    // Private to enforce the singleton pattern; obtain instances via getDefault().
    private Ingester() {
    }
76 
77  public static synchronized Ingester getDefault() {
78  if (instance == null) {
79  instance = new Ingester();
80  }
81  return instance;
82  }
83 
    //JMTODO: this is probably useless
    // NOTE(review): Object.finalize() is deprecated since Java 9 and may never
    // run; this warning is best-effort only — consider removing or replacing
    // with a Cleaner.
    @Override
    @SuppressWarnings("FinalizeDeclaration")
    protected void finalize() throws Throwable {
        super.finalize();

        // Warn if files might have been left uncommitted.
        if (uncommitedIngests) {
            logger.warning("Ingester was used to add files that it never committed."); //NON-NLS
        }
    }
95 
    /**
     * Indexes only the Solr metadata fields of the given file — the document
     * content is empty. The lower-cased file name is used as the source name
     * for error reporting.
     *
     * @param file The file whose metadata fields will be indexed.
     *
     * @throws IngesterException if the document could not be added to Solr.
     */
    void indexMetaDataOnly(AbstractFile file) throws IngesterException {
        indexChunk("", "", file.getName().toLowerCase(), new HashMap<>(getContentFields(file)));
    }
109 
    /**
     * Indexes only the Solr metadata fields of the given artifact — the
     * document content is empty.
     *
     * @param artifact   The artifact whose metadata fields will be indexed.
     * @param sourceName Name used to identify the artifact in error messages.
     *
     * @throws IngesterException if the document could not be added to Solr.
     */
    void indexMetaDataOnly(BlackboardArtifact artifact, String sourceName) throws IngesterException {
        indexChunk("", "", sourceName, new HashMap<>(getContentFields(artifact)));
    }
123 
    /**
     * Computes the map of Solr field names to values for the given item by
     * dispatching to SOLR_FIELDS_VISITOR.
     *
     * @param item The Sleuth Kit item to get fields for.
     *
     * @return Map from Solr field name to string value.
     */
    private Map<String, String> getContentFields(SleuthkitVisitableItem item) {
        return item.accept(SOLR_FIELDS_VISITOR);
    }
135 
153  // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
154 // < T extends SleuthkitVisitableItem> boolean search(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException {
155 // boolean doLanguageDetection = true;
156 // return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, keywordListNames);
157 // }
158 
177  // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
178 // < T extends SleuthkitVisitableItem> boolean searchStrings(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr) throws Ingester.IngesterException {
179 // // Per JIRA-7100, it was determined that language detection on extracted strings can take a really long time.
180 // boolean doLanguageDetection = false;
181 // return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, null);
182 // }
183 //
184 // < T extends SleuthkitVisitableItem> boolean searchStrings(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException {
185 // // Per JIRA-7100, it was determined that language detection on extracted strings can take a really long time.
186 // boolean doLanguageDetection = false;
187 // return search(sourceReader, sourceID, sourceName, source, context, doLanguageDetection, indexIntoSolr, keywordListNames);
188 // }
189 
    // TODO (JIRA-3118): Cancelled text indexing does not propagate cancellation to clients
    /**
     * Reads the source's text chunk-by-chunk, runs an inline keyword search on
     * each chunk, and indexes chunks into Solr. When indexIntoSolr is false,
     * only chunks that contain a hit — plus the immediately adjacent chunks —
     * are indexed (a sliding window kept in activeChunkList); otherwise every
     * chunk is indexed. After all chunks, a parent metadata document (with
     * NUM_CHUNKS and no CHUNK_SIZE) is indexed if any chunk was indexed.
     *
     * @param sourceReader        Reader over the source's extracted text.
     * @param sourceID            Sleuth Kit object id of the source.
     * @param sourceName          Name of the source, for ids and log messages.
     * @param source              The item being indexed (file or artifact).
     * @param context             Ingest job context, polled for cancellation.
     * @param doLanguageDetection If true, detect language on the first chunk only.
     * @param indexIntoSolr       If true, index every chunk regardless of hits.
     * @param keywordListNames    Keyword lists to search; if null, no chunks
     *                            are searched or indexed by the loop body.
     *
     * @throws Ingester.IngesterException if a document could not be added to Solr.
     * @throws Exception                  rethrown from the chunker on a read error.
     */
    < T extends SleuthkitVisitableItem> void search(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean doLanguageDetection, boolean indexIntoSolr, List<String> keywordListNames) throws Ingester.IngesterException, IOException, TskCoreException, Exception {
        int numChunks = 0; //unknown until chunking is done
        Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
        Optional<Language> language = Optional.empty();
        InlineSearcher searcher = new InlineSearcher(keywordListNames, context);
        // Window of recent chunks that may still need to be indexed, pending
        // whether a neighboring chunk gets a hit.
        List<Chunk> activeChunkList = new ArrayList<>();
        boolean fileIndexed = false;

        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(sourceReader)) {
            Chunker chunker = new Chunker(reader);
            String name = sourceName;
            // Search the source's name itself, except for artifacts.
            if (!(source instanceof BlackboardArtifact)) {
                searcher.searchString(name, sourceID, 0);
            }

            while (chunker.hasNext()) {
                if (context.fileIngestIsCancelled()) {
                    logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                    return;
                }

                Chunk chunk = chunker.next();
                // Chunk ids are 1-based; numChunks is the count of chunks
                // processed so far.
                chunk.setChunkId(numChunks + 1);

                if (doLanguageDetection) {
                    int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
                    language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));

                    // only do language detection on the first chunk of the document
                    doLanguageDetection = false;
                }

                if (keywordListNames != null) {
                    boolean hitFoundInChunk = searcher.searchChunk(chunk, sourceID, numChunks);
                    if (!indexIntoSolr) {
                        if (!hitFoundInChunk) {
                            if (!activeChunkList.isEmpty()) {
                                if (activeChunkList.get(activeChunkList.size() - 1).hasHit()) {
                                    // Previous chunk had a hit: index it plus its
                                    // neighbors (this chunk included), then reset.
                                    activeChunkList.add(chunk);
                                    // Write List
                                    for (Chunk c : activeChunkList) {
                                        indexChunk(c, sourceID, sourceName, language, contentFields, chunker.hasNext());
                                    }
                                    activeChunkList.clear();
                                } else {
                                    // No hit in the window: keep only the current
                                    // chunk as potential context for a future hit.
                                    activeChunkList.clear();
                                    activeChunkList.add(chunk);
                                }
                            } else {
                                activeChunkList.add(chunk);
                            }
                        } else {
                            fileIndexed = true;
                            chunk.setHasHit(true);
                            activeChunkList.add(chunk);
                        }
                    } else {
                        indexChunk(chunk, sourceID, sourceName, language, contentFields, chunker.hasNext());
                        fileIndexed = true;
                    }
                }

                numChunks++;

            }

            // Flush any trailing window that still contains (or follows) a hit.
            if (activeChunkList.size() > 1 || (activeChunkList.size() == 1 && activeChunkList.get(0).hasHit())) {
                for (Chunk c : activeChunkList) {
                    indexChunk(c, sourceID, sourceName, language, contentFields, true);
                }
            }


            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                throw chunker.getException();
            }

        } finally {
            if (context.fileIngestIsCancelled()) {
                // NOTE(review): returning from a finally block discards any
                // exception currently propagating from the try block — confirm
                // this is the intended behavior on cancellation.
                return ;
            }

            if (fileIndexed) {
                Map<String, Object> fields = new HashMap<>(contentFields);
                //after all chunks, index just the meta data, including the numChunks, of the parent file
                fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
                //reset id field to base document id
                fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
                //"parent" docs don't have chunk_size
                fields.remove(Server.Schema.CHUNK_SIZE.toString());
                indexChunk(null, null, sourceName, fields);
            }
        }
    }
305 
    /**
     * Reads the source's text chunk-by-chunk and indexes every chunk into
     * Solr, followed by a parent metadata document carrying NUM_CHUNKS (and no
     * CHUNK_SIZE). Language detection, when requested, runs only on the first
     * chunk; if a language is detected, a language-specific "mini chunk" is
     * also indexed for every non-final chunk.
     *
     * @param sourceReader        Reader over the source's extracted text.
     * @param sourceID            Sleuth Kit object id of the source.
     * @param sourceName          Name of the source, for ids and log messages.
     * @param source              The item being indexed (file or artifact).
     * @param context             Ingest job context, polled for cancellation.
     * @param doLanguageDetection If true, detect language on the first chunk only.
     *
     * @return true if all chunks were indexed; false on cancellation, a
     *         chunking error, or a read error.
     *
     * @throws Ingester.IngesterException if a chunk could not be added to Solr
     *                                    (but see the NOTE in the finally block).
     */
    < T extends SleuthkitVisitableItem> boolean indexFile(Reader sourceReader, long sourceID, String sourceName, T source, IngestJobContext context, boolean doLanguageDetection) throws Ingester.IngesterException {
        int numChunks = 0; //unknown until chunking is done
        Map<String, String> contentFields = Collections.unmodifiableMap(getContentFields(source));
        Optional<Language> language = Optional.empty();
        //Get a reader for the content of the given source
        try (BufferedReader reader = new BufferedReader(sourceReader)) {
            Chunker chunker = new Chunker(reader);
            while (chunker.hasNext()) {
                if ( context.fileIngestIsCancelled()) {
                    logger.log(Level.INFO, "File ingest cancelled. Cancelling keyword search indexing of {0}", sourceName);
                    return false;
                }

                Chunk chunk = chunker.next();

                if (doLanguageDetection) {
                    int size = Math.min(chunk.getBaseChunkLength(), LANGUAGE_DETECTION_STRING_SIZE);
                    language = languageSpecificContentIndexingHelper.detectLanguageIfNeeded(chunk.toString().substring(0, size));

                    // only do language detection on the first chunk of the document
                    doLanguageDetection = false;
                }

                // Per-chunk fields: chunk document id and chunk size, layered
                // over the source's common metadata fields.
                Map<String, Object> fields = new HashMap<>(contentFields);
                String chunkId = Server.getChunkIdString(sourceID, numChunks + 1);
                fields.put(Server.Schema.ID.toString(), chunkId);
                fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));

                language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
                try {
                    //add the chunk text to Solr index
                    indexChunk(chunk.toString(), chunk.getLowerCasedChunk(), sourceName, fields);
                    // add mini chunk when there's a language specific field
                    if (chunker.hasNext() && language.isPresent()) {
                        languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
                    }
                    numChunks++;

                } catch (Ingester.IngesterException ingEx) {
                    logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
                            + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS

                    throw ingEx; //need to rethrow to signal error and move on
                }
            }
            if (chunker.hasException()) {
                logger.log(Level.WARNING, "Error chunking content from " + sourceID + ": " + sourceName, chunker.getException());
                return false;
            }

        } catch (Exception ex) {
            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceID + ": " + sourceName, ex);//NON-NLS
            return false;
        } finally {
            if (context.fileIngestIsCancelled()) {
                // NOTE(review): returning from a finally block discards any
                // exception propagating from the try block (including the
                // rethrown IngesterException above) — confirm this is intended.
                return false;
            } else {
                // Index the parent metadata document even when the body above
                // returned false, so partial indexing is still discoverable.
                Map<String, Object> fields = new HashMap<>(contentFields);
                //after all chunks, index just the meta data, including the numChunks, of the parent file
                fields.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(numChunks));
                //reset id field to base document id
                fields.put(Server.Schema.ID.toString(), Long.toString(sourceID));
                //"parent" docs don't have chunk_size
                fields.remove(Server.Schema.CHUNK_SIZE.toString());
                indexChunk(null, null, sourceName, fields);
            }
        }


        return true;
    }
377 
378  private void indexChunk(Chunk chunk, long sourceID, String sourceName, Optional<Language> language, Map<String, String> contentFields, boolean hasNext) throws IngesterException {
379  Map<String, Object> fields = new HashMap<>(contentFields);
380  String chunkId = Server.getChunkIdString(sourceID, chunk.getChunkId());
381  fields.put(Server.Schema.ID.toString(), chunkId);
382  fields.put(Server.Schema.CHUNK_SIZE.toString(), String.valueOf(chunk.getBaseChunkLength()));
383 
384 
385  language.ifPresent(lang -> languageSpecificContentIndexingHelper.updateLanguageSpecificFields(fields, chunk, lang));
386  try {
387  //add the chunk text to Solr index
388  indexChunk(chunk.toString(), chunk.getLowerCasedChunk(), sourceName, fields);
389  // add mini chunk when there's a language specific field
390  if (hasNext && language.isPresent()) {
391  languageSpecificContentIndexingHelper.indexMiniChunk(chunk, sourceName, new HashMap<>(contentFields), chunkId, language.get());
392  }
393 
394  } catch (Ingester.IngesterException ingEx) {
395  logger.log(Level.WARNING, "Ingester had a problem with extracted string from file '" //NON-NLS
396  + sourceName + "' (id: " + sourceID + ").", ingEx);//NON-NLS
397 
398  throw ingEx; //need to rethrow to signal error and move on
399  }
400  }
401 
416  private void indexChunk(String chunk, String lowerCasedChunk, String sourceName, Map<String, Object> fields) throws IngesterException {
417  if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
418  //JMTODO: actually if the we couldn't get the image id it is set to -1,
419  // but does this really mean we don't want to index it?
420 
421  //skip the file, image id unknown
422  String msg = NbBundle.getMessage(Ingester.class,
423  "Ingester.ingest.exception.unknownImgId.msg", sourceName); //JMTODO: does this need to ne internationalized?
424  logger.log(Level.SEVERE, msg);
425  throw new IngesterException(msg);
426  }
427 
428  //Make a SolrInputDocument out of the field map
429  SolrInputDocument updateDoc = new SolrInputDocument();
430  for (String key : fields.keySet()) {
431  if (fields.get(key).getClass() == String.class) {
432  updateDoc.addField(key, Chunker.sanitize((String)fields.get(key)).toString());
433  } else {
434  updateDoc.addField(key, fields.get(key));
435  }
436  }
437 
438  try {
439  //TODO: consider timeout thread, or vary socket timeout based on size of indexed content
440 
441  //add the content to the SolrInputDocument
442  //JMTODO: can we just add it to the field map before passing that in?
443  updateDoc.addField(Server.Schema.CONTENT.toString(), chunk);
444 
445  // We also add the content (if present) in lowercase form to facilitate case
446  // insensitive substring/regular expression search.
447  double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
448  if (indexSchemaVersion >= 2.1) {
449  updateDoc.addField(Server.Schema.CONTENT_STR.toString(), ((chunk == null) ? "" : lowerCasedChunk));
450  }
451 
452  TimingMetric metric = HealthMonitor.getTimingMetric("Solr: Index chunk");
453 
454  solrServer.addDocument(updateDoc);
455  HealthMonitor.submitTimingMetric(metric);
456  uncommitedIngests = true;
457 
458  } catch (KeywordSearchModuleException | NoOpenCoreException ex) {
459  //JMTODO: does this need to be internationalized?
460  throw new IngesterException(
461  NbBundle.getMessage(Ingester.class, "Ingester.ingest.exception.err.msg", sourceName), ex);
462  }
463  }
464 
469  void commit() {
470  try {
471  solrServer.commit();
472  uncommitedIngests = false;
473  } catch (NoOpenCoreException | SolrServerException ex) {
474  logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
475 
476  }
477  }
478 
482  static private class SolrFieldsVisitor extends SleuthkitItemVisitor.Default<Map<String, String>> {
483 
484  @Override
485  protected Map<String, String> defaultVisit(SleuthkitVisitableItem svi) {
486  return new HashMap<>();
487  }
488 
489  @Override
490  public Map<String, String> visit(File f) {
491  return getCommonAndMACTimeFields(f);
492  }
493 
494  @Override
495  public Map<String, String> visit(DerivedFile df) {
496  return getCommonAndMACTimeFields(df);
497  }
498 
499  @Override
500  public Map<String, String> visit(Directory d) {
501  return getCommonAndMACTimeFields(d);
502  }
503 
504  @Override
505  public Map<String, String> visit(LocalDirectory ld) {
506  return getCommonAndMACTimeFields(ld);
507  }
508 
509  @Override
510  public Map<String, String> visit(LayoutFile lf) {
511  // layout files do not have times
512  return getCommonFields(lf);
513  }
514 
515  @Override
516  public Map<String, String> visit(LocalFile lf) {
517  return getCommonAndMACTimeFields(lf);
518  }
519 
520  @Override
521  public Map<String, String> visit(SlackFile f) {
522  return getCommonAndMACTimeFields(f);
523  }
524 
534  private Map<String, String> getCommonAndMACTimeFields(AbstractFile file) {
535  Map<String, String> params = getCommonFields(file);
536  params.put(Server.Schema.CTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getCtime()));
537  params.put(Server.Schema.ATIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getAtime()));
538  params.put(Server.Schema.MTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getMtime()));
539  params.put(Server.Schema.CRTIME.toString(), TimeZoneUtils.getFormattedTimeISO8601(file.getCrtime()));
540  return params;
541  }
542 
551  private Map<String, String> getCommonFields(AbstractFile file) {
552  Map<String, String> params = new HashMap<>();
553  params.put(Server.Schema.ID.toString(), Long.toString(file.getId()));
554  try {
555  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(file.getDataSource().getId()));
556  } catch (TskCoreException ex) {
557  logger.log(Level.SEVERE, "Could not get data source id to properly index the file " + file.getId(), ex); //NON-NLS
558  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
559  }
560  params.put(Server.Schema.FILE_NAME.toString(), file.getName().toLowerCase());
561  return params;
562  }
563 
571  @Override
572  public Map<String, String> visit(BlackboardArtifact artifact) {
573  Map<String, String> params = new HashMap<>();
574  params.put(Server.Schema.ID.toString(), Long.toString(artifact.getArtifactID()));
575  try {
576  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(artifact.getDataSource().getId()));
577  } catch (TskCoreException ex) {
578  logger.log(Level.SEVERE, "Could not get data source id to properly index the artifact " + artifact.getArtifactID(), ex); //NON-NLS
579  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
580  }
581  return params;
582  }
583 
591  @Override
592  public Map<String, String> visit(Report report) {
593  Map<String, String> params = new HashMap<>();
594  params.put(Server.Schema.ID.toString(), Long.toString(report.getId()));
595  try {
596  Content dataSource = report.getDataSource();
597  if (null == dataSource) {
598  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
599  } else {
600  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSource.getId()));
601  }
602  } catch (TskCoreException ex) {
603  logger.log(Level.SEVERE, "Could not get data source id to properly index the report, using default value. Id: " + report.getId(), ex); //NON-NLS
604  params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
605  }
606  return params;
607  }
608  }
609 
    /**
     * Indicates that there was an error adding a document (a chunk of text or
     * metadata) to the Solr index.
     */
    static class IngesterException extends Exception {

        private static final long serialVersionUID = 1L;

        IngesterException(String message, Throwable ex) {
            super(message, ex);
        }

        IngesterException(String message) {
            super(message);
        }
    }
626 }
Map< String, String > visit(LocalDirectory ld)
Definition: Ingester.java:505
Map< String, String > getCommonAndMACTimeFields(AbstractFile file)
Definition: Ingester.java:534
Map< String, String > getCommonFields(AbstractFile file)
Definition: Ingester.java:551
Map< String, String > visit(BlackboardArtifact artifact)
Definition: Ingester.java:572
static String getFormattedTimeISO8601(long epochTime)
Map< String, String > defaultVisit(SleuthkitVisitableItem svi)
Definition: Ingester.java:485

Copyright © 2012-2022 Basis Technology. Generated on: Tue Aug 1 2023
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.