19 package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
/**
 * Extracts raw strings from an AbstractFile's content and sends the text to
 * the keyword-search (Solr) index in fixed-size chunks. Supports optional
 * UTF-8 / UTF-16 extraction and a configurable set of scripts.
 */
class StringsTextExtractor implements TextExtractor {

    // Shared ingester used to push extracted text to the Solr server.
    private static Ingester ingester;
    private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
    // Maximum size of a single text chunk sent to the index (1 MB).
    private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
    // Bytes reserved at the front of each chunk for a byte-order mark;
    // 0 means BOM prepending is disabled.
    private static final int BOM_LEN = 0;
    private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    // Script used when the caller has not configured any explicitly.
    private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
    private KeywordSearchIngestModule module;
    private AbstractFile sourceFile;   // file currently being indexed
    private int numChunks = 0;         // chunks successfully indexed so far
    private final List<SCRIPT> extractScripts = new ArrayList<>();
    private Map<String, String> extractOptions = new HashMap<>();
60 public StringsTextExtractor(KeywordSearchIngestModule module) {
62 ingester = Server.getIngester();
63 extractScripts.add(DEFAULT_SCRIPT);
67 public boolean setScripts(List<SCRIPT> extractScripts) {
68 this.extractScripts.clear();
69 this.extractScripts.addAll(extractScripts);
74 public List<SCRIPT> getScripts() {
75 return new ArrayList<>(extractScripts);
79 public int getNumChunks() {
80 return this.numChunks;
84 public AbstractFile getSourceFile() {
89 public Map<String, String> getOptions() {
90 return extractOptions;
94 public void setOptions(Map<String, String> options) {
95 this.extractOptions = options;
99 public boolean index(AbstractFile sourceFile)
throws IngesterException {
100 this.sourceFile = sourceFile;
102 boolean success =
false;
105 final boolean extractUTF8 =
106 Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
108 final boolean extractUTF16 =
109 Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
111 if (extractUTF8 ==
false && extractUTF16 ==
false) {
116 InputStream stringStream;
118 if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
120 stringStream =
new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
122 stringStream =
new AbstractFileStringIntStream(
123 sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
131 final byte[] stringChunkBuf =
new byte[(int) MAX_STRING_CHUNK_SIZE];
133 while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (
int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
137 AbstractFileChunk chunk =
new AbstractFileChunk(
this, this.numChunks + 1);
140 chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
142 }
catch (IngesterException ingEx) {
144 logger.log(Level.WARNING,
"Ingester had a problem with extracted strings from file '" + sourceFile.getName() +
"' (id: " + sourceFile.getId() +
").", ingEx);
153 ingester.ingest(
this);
155 }
catch (IOException ex) {
156 logger.log(Level.WARNING,
"Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex);
160 stringStream.close();
161 }
catch (IOException ex) {
162 logger.log(Level.WARNING,
"Error closing input stream stream, file: " + sourceFile.getName(), ex);
171 public boolean isContentTypeSpecific() {
176 public boolean isSupported(AbstractFile file, String detectedFormat) {