api-docs/4.20.0/_strings_text_extractor_8java_source.html

 /*

  * Autopsy Forensic Browser

  *

  * Copyright 2011-2019 Basis Technology Corp.

  * Contact: carrier <at> sleuthkit <dot> org

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 package org.sleuthkit.autopsy.textextractors;


 import java.io.IOException;

 import java.io.InputStream;

 import java.io.InputStreamReader;

 import java.nio.charset.Charset;

 import java.util.ArrayList;

 import java.util.List;

 import java.util.Objects;

 import org.openide.util.Lookup;

 import org.sleuthkit.autopsy.coreutils.StringExtract;

 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;

 import org.sleuthkit.autopsy.textextractors.configs.StringsConfig;

 import org.sleuthkit.datamodel.Content;

 import org.sleuthkit.datamodel.TskCoreException;

 import org.sleuthkit.datamodel.TskException;


 final class StringsTextExtractor implements TextExtractor {


     private boolean extractUTF8;

     private boolean extractUTF16;

     private final Content content;

     private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";


     private final List<SCRIPT> extractScripts = new ArrayList<>();


     public StringsTextExtractor(Content content) {

         //LATIN_2 is the default script

         extractScripts.add(SCRIPT.LATIN_2);

         extractUTF8 = true;

         this.content = content;

     }


     public final void setScripts(List<SCRIPT> extractScripts) {

         if (extractScripts == null) {

             return;

         }


         this.extractScripts.clear();

         this.extractScripts.addAll(extractScripts);

     }


     @Override

     public InputStreamReader getReader() {

         InputStream stringStream = getInputStream(content);

         return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));

     }


     InputStream getInputStream(Content content) {

         //check which extract stream to use

         if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {

             return new EnglishOnlyStream(content);//optimal for english, english only

         } else {

             return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);

         }

     }


     @Override

     public void setExtractionSettings(Lookup context) {

         if (context != null) {

             StringsConfig configInstance = context.lookup(StringsConfig.class);

             if (configInstance == null) {

                 return;

             }

             if (Objects.nonNull(configInstance.getExtractUTF8())) {

                 extractUTF8 = configInstance.getExtractUTF8();

             }

             if (Objects.nonNull(configInstance.getExtractUTF16())) {

                 extractUTF16 = configInstance.getExtractUTF16();

             }

             if (Objects.nonNull(configInstance.getLanguageScripts())) {

                 setScripts(configInstance.getLanguageScripts());

             }

         }

     }


     @Override

     public boolean isSupported() {

         return extractUTF8 || extractUTF16;

     }


     private static class EnglishOnlyStream extends InputStream {


         private static final String NLS = Character.toString((char) 10); //new line

         private static final int READ_BUF_SIZE = 65536;

         private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string


         //args

         private final Content content;


         //internal working data

         private long contentOffset = 0; //offset in fscontent read into curReadBuf

         private final byte[] curReadBuf = new byte[READ_BUF_SIZE];

         private int bytesInReadBuf = 0;

         private int readBufOffset = 0; //offset in read buf processed

         private StringBuilder curString = new StringBuilder();

         private int curStringLen = 0;

         private StringBuilder tempString = new StringBuilder();

         private int tempStringLen = 0;

         private boolean isEOF = false;

         private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()

         private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()

         private boolean inString = false; //if current temp has min chars required

         private final byte[] oneCharBuf = new byte[1];


         private EnglishOnlyStream(Content content) {

             this.content = content;

         }


         @Override

         public int read(byte[] b, int off, int len) throws IOException {

             if (b == null) {

                 throw new NullPointerException();

             } else if (off < 0 || len < 0 || len > b.length - off) {

                 throw new IndexOutOfBoundsException();

             } else if (len == 0) {

                 return 0;

             }

             long fileSize = content.getSize();

             if (fileSize == 0) {

                 return -1;

             }

             if (isEOF) {

                 return -1;

             }

             if (stringAtTempBoundary) {

                 //append entire temp string residual from previous read()

                 //because qualified string was broken down into 2 parts

                 appendResetTemp();

                 stringAtTempBoundary = false;

                 //there could be more to this string in fscontent/buffer

             }

             boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char

             int newCurLen = curStringLen + tempStringLen;

             while (newCurLen < len) {

                 //need to extract more strings

                 if (readBufOffset > bytesInReadBuf - 1) {

                     //no more bytes to process into strings, read them

                     try {

                         bytesInReadBuf = 0;

                         bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);

                     } catch (TskException ex) {

                         if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {

                             appendResetTemp();

                             //have some extracted string, return that, and fail next time

                             isEOF = true;

                             int copied = copyToReturn(b, off, len);

                             return copied;

                         } else {

                             return -1; //EOF

                         }

                     }

                     if (bytesInReadBuf < 1) {

                         if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {

                             appendResetTemp();

                             //have some extracted string, return that, and fail next time

                             isEOF = true;

                             int copied = copyToReturn(b, off, len);

                             return copied;

                         } else {

                             return -1; //EOF

                         }

                     }

                     //increment content offset for next read

                     contentOffset += bytesInReadBuf;

                     //reset read buf position

                     readBufOffset = 0;

                 }

                 //get char from cur read buf

                 char c = (char) curReadBuf[readBufOffset++];

                 singleConsecZero = c == 0 && singleConsecZero == false; //preserve the current sequence if max consec. 1 zero char

                 if (StringExtract.isPrintableAscii(c)) {

                     tempString.append(c);

                     ++tempStringLen;

                     if (tempStringLen >= MIN_PRINTABLE_CHARS) {

                         inString = true;

                     }

                     //boundary case when temp has still chars - handled after the loop

                 } else if (!singleConsecZero) {

                     //break the string, clear temp

                     if (tempStringLen >= MIN_PRINTABLE_CHARS || stringAtBufBoundary) {

                         //append entire temp string with new line

                         tempString.append(NLS);

                         ++tempStringLen;

                         curString.append(tempString);

                         curStringLen += tempStringLen;

                         stringAtBufBoundary = false;

                     }

                     //reset temp

                     tempString = new StringBuilder();

                     tempStringLen = 0;

                 }

                 newCurLen = curStringLen + tempStringLen;

             }

             //check if still in string state, so that next chars in read buf bypass min chars check

             //and qualify as string even if less < min chars required

             if (inString) {

                 inString = false; //reset

                 stringAtBufBoundary = true; //will bypass the check

             }

             //check if temp still has chars to qualify as a string

             //we might need to break up temp into 2 parts for next read() call

             //consume as many as possible to fill entire user buffer

             if (tempStringLen >= MIN_PRINTABLE_CHARS) {

                 if (newCurLen > len) {

                     int appendChars = len - curStringLen;

                     //save part for next user read(), need to break up temp string

                     //do not append new line

                     String toAppend = tempString.substring(0, appendChars);

                     String newTemp = tempString.substring(appendChars);

                     curString.append(toAppend);

                     curStringLen += appendChars;

                     tempString = new StringBuilder(newTemp);

                     tempStringLen = newTemp.length();

                     stringAtTempBoundary = true;

                 } else {

                     //append entire temp

                     curString.append(tempString);

                     curStringLen += tempStringLen;

                     //reset temp

                     tempString = new StringBuilder();

                     tempStringLen = 0;

                 }

             } else {

                 //if temp has a few chars, not qualified as string for now,

                 //will be processed during next read() call

             }

             //copy current strings to user

             final int copied = copyToReturn(b, off, len);

             //there may be still chars in read buffer or  tempString, for next read()

             return copied;

         }


         //append temp buffer to cur string buffer and reset temp, if enough chars

         //does not append new line

         private void appendResetTemp() {

             if (tempStringLen >= MIN_PRINTABLE_CHARS) {

                 curString.append(tempString);

                 curStringLen += tempStringLen;

                 tempString = new StringBuilder();

                 tempStringLen = 0;

             }

         }


         //copy currently extracted string to user buffer

         //and reset for next read() call

         private int copyToReturn(byte[] b, int off, long len) {

             final String curStringS = curString.toString();

             //logger.log(Level.INFO, curStringS);

             byte[] stringBytes = curStringS.getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));

             System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));

             //logger.log(Level.INFO, curStringS);

             //copied all string, reset

             curString = new StringBuilder();

             int ret = curStringLen;

             curStringLen = 0;

             return ret;

         }


         @Override

         public int read() throws IOException {

             final int read = read(oneCharBuf, 0, 1);

             if (read == 1) {

                 return oneCharBuf[0];

             } else {

                 return -1;

             }

         }


         @Override

         public int available() throws IOException {

             //we don't know how many bytes in curReadBuf may end up as strings

             return 0;

         }


         @Override

         public long skip(long n) throws IOException {

             //use default implementation that reads into skip buffer

             //but it could be more efficient

             return super.skip(n);

         }

     }


     private static class InternationalStream extends InputStream {


         private static final int FILE_BUF_SIZE = 1024 * 1024;

         private final Content content;

         private final byte[] oneCharBuf = new byte[1];

         private final StringExtract stringExtractor;

         private final boolean nothingToDo;

         private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];

         private long fileReadOffset = 0L;

         private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user

         private int convertBuffOffset = 0; //offset to start returning data to user on next read()

         private int bytesInConvertBuff = 0; //amount of data currently in the buffer

         private boolean fileEOF = false; //if file has more bytes to read

         private StringExtract.StringExtractResult lastExtractResult;


         private InternationalStream(Content content, List<SCRIPT> scripts, boolean extractUTF8, boolean extractUTF16) {

             this.content = content;

             this.stringExtractor = new StringExtract();

             this.stringExtractor.setEnabledScripts(scripts);

             this.nothingToDo = extractUTF8 == false && extractUTF16 == false;

             this.stringExtractor.setEnableUTF8(extractUTF8);

             this.stringExtractor.setEnableUTF16(extractUTF16);

         }


         @Override

         public int read() throws IOException {

             if (nothingToDo) {

                 return -1;

             }

             final int read = read(oneCharBuf, 0, 1);

             if (read == 1) {

                 return oneCharBuf[0];

             } else {

                 return -1;

             }

         }


         @Override

         public int read(byte[] b, int off, int len) throws IOException {

             if (b == null) {

                 throw new NullPointerException();

             } else if (off < 0 || len < 0 || len > b.length - off) {

                 throw new IndexOutOfBoundsException();

             } else if (len == 0) {

                 return 0;

             }

             if (nothingToDo) {

                 return -1;

             }

             long fileSize = content.getSize();

             if (fileSize == 0) {

                 return -1;

             }

             //read and convert until user buffer full

             //we have data if file can be read or when byteBuff has converted strings to return

             int bytesToUser = 0; //returned to user so far

             int offsetUser = off;

             while (bytesToUser < len && offsetUser < len) {

                 //check if we have enough converted strings

                 int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;

                 if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {

                     try {

                         //convert more strings, store in buffer

                         long toRead = 0;


                         //fill up entire fileReadBuff fresh

                         toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);

                         //}

                         int read = content.read(fileReadBuff, fileReadOffset, toRead);

                         if (read == -1 || read == 0) {

                             fileEOF = true;

                         } else {

                             fileReadOffset += read;

                             if (fileReadOffset >= fileSize) {

                                 fileEOF = true;

                             }

                             //put converted string in convertBuff

                             convert(read);

                             convertBuffRemain = bytesInConvertBuff - convertBuffOffset;

                         }

                     } catch (TskCoreException ex) {

                         fileEOF = true;

                     }

                 }

                 //nothing more to read, and no more bytes in convertBuff

                 if (convertBuff == null || convertBuffRemain == 0) {

                     if (fileEOF) {

                         return bytesToUser > 0 ? bytesToUser : -1;

                     } else {

                         //no strings extracted, try another read

                         continue;

                     }

                 }

                 //return part or all of convert buff to user

                 final int toCopy = Math.min(convertBuffRemain, len - offsetUser);

                 System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);


                 convertBuffOffset += toCopy;

                 offsetUser += toCopy;

                 bytesToUser += toCopy;

             }

             //if more string data in convertBuff, will be consumed on next read()

             return bytesToUser;

         }


         private void convert(int numBytes) {

             lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);

             convertBuff = lastExtractResult.getText().getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));

             //reset tracking vars

             if (lastExtractResult.getNumBytes() == 0) {

                 bytesInConvertBuff = 0;

             } else {

                 bytesInConvertBuff = convertBuff.length;

             }

             convertBuffOffset = 0;

         }

     }

 }

org.sleuthkit

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.fileEOF
boolean fileEOF
Definition: StringsTextExtractor.java:376

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.oneCharBuf
final byte[] oneCharBuf
Definition: StringsTextExtractor.java:168

org.sleuthkit.autopsy.coreutils.StringExtract
Definition: StringExtract.java:43

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.convertBuff
byte[] convertBuff
Definition: StringsTextExtractor.java:373

org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable
Definition: StringExtract.java:645

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.appendResetTemp
void appendResetTemp()
Definition: StringsTextExtractor.java:306

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.bytesInConvertBuff
int bytesInConvertBuff
Definition: StringsTextExtractor.java:375

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.inString
boolean inString
Definition: StringsTextExtractor.java:167

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.convert
void convert(int numBytes)
Definition: StringsTextExtractor.java:487

org

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream
Definition: StringsTextExtractor.java:360

org.sleuthkit.autopsy.coreutils.StringExtract.setEnableUTF16
void setEnableUTF16(boolean enableUTF16)
Definition: StringExtract.java:104

org.sleuthkit.autopsy.coreutils.StringExtract.extract
StringExtractResult extract(byte[] buff, int len, int offset)
Definition: StringExtract.java:181

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.EnglishOnlyStream
EnglishOnlyStream(Content content)
Definition: StringsTextExtractor.java:176

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.InternationalStream
InternationalStream(Content content, List< SCRIPT > scripts, boolean extractUTF8, boolean extractUTF16)
Definition: StringsTextExtractor.java:391

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.read
int read()
Definition: StringsTextExtractor.java:331

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.READ_BUF_SIZE
static final int READ_BUF_SIZE
Definition: StringsTextExtractor.java:149

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.NLS
static final String NLS
Definition: StringsTextExtractor.java:148

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream
Definition: StringsTextExtractor.java:146

org.sleuthkit.autopsy.coreutils
Definition: AppSQLiteDB.java:19

org.sleuthkit.autopsy.coreutils.StringExtract.setEnabledScripts
final void setEnabledScripts(List< SCRIPT > scripts)
Definition: StringExtract.java:113

org.sleuthkit.autopsy.coreutils.StringExtract.setEnableUTF8
void setEnableUTF8(boolean enableUTF8)
Definition: StringExtract.java:96

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.stringExtractor
final StringExtract stringExtractor
Definition: StringsTextExtractor.java:365

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.nothingToDo
final boolean nothingToDo
Definition: StringsTextExtractor.java:370

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.read
int read(byte[] b, int off, int len)
Definition: StringsTextExtractor.java:414

org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT
Definition: StringExtract.java:655

org.sleuthkit.autopsy.coreutils.StringExtract.isPrintableAscii
static boolean isPrintableAscii(char c)
Definition: StringExtract.java:588

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.stringAtTempBoundary
boolean stringAtTempBoundary
Definition: StringsTextExtractor.java:165

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.isEOF
boolean isEOF
Definition: StringsTextExtractor.java:164

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.bytesInReadBuf
int bytesInReadBuf
Definition: StringsTextExtractor.java:158

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.tempStringLen
int tempStringLen
Definition: StringsTextExtractor.java:163

org.sleuthkit.autopsy.textextractors.configs.StringsConfig
Definition: StringsConfig.java:37

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.fileReadBuff
final byte[] fileReadBuff
Definition: StringsTextExtractor.java:371

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.curString
StringBuilder curString
Definition: StringsTextExtractor.java:160

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.available
int available()
Definition: StringsTextExtractor.java:341

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.FILE_BUF_SIZE
static final int FILE_BUF_SIZE
Definition: StringsTextExtractor.java:362

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.read
int read(byte[] b, int off, int len)
Definition: StringsTextExtractor.java:181

org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractResult
Definition: StringExtract.java:595

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.content
final Content content
Definition: StringsTextExtractor.java:153

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.lastExtractResult
StringExtract.StringExtractResult lastExtractResult
Definition: StringsTextExtractor.java:377

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.convertBuffOffset
int convertBuffOffset
Definition: StringsTextExtractor.java:374

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.stringAtBufBoundary
boolean stringAtBufBoundary
Definition: StringsTextExtractor.java:166

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.copyToReturn
int copyToReturn(byte[] b, int off, long len)
Definition: StringsTextExtractor.java:317

org.sleuthkit.autopsy

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.contentOffset
long contentOffset
Definition: StringsTextExtractor.java:156

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.read
int read()
Definition: StringsTextExtractor.java:401

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.fileReadOffset
long fileReadOffset
Definition: StringsTextExtractor.java:372

org.sleuthkit.autopsy.textextractors
Definition: ArtifactTextExtractor.java:19

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.curReadBuf
final byte[] curReadBuf
Definition: StringsTextExtractor.java:157

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.skip
long skip(long n)
Definition: StringsTextExtractor.java:347

org.sleuthkit.autopsy.textextractors.configs
Definition: ImageConfig.java:19

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.content
final Content content
Definition: StringsTextExtractor.java:363

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.tempString
StringBuilder tempString
Definition: StringsTextExtractor.java:162

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.MIN_PRINTABLE_CHARS
static final int MIN_PRINTABLE_CHARS
Definition: StringsTextExtractor.java:150

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.curStringLen
int curStringLen
Definition: StringsTextExtractor.java:161

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.EnglishOnlyStream.readBufOffset
int readBufOffset
Definition: StringsTextExtractor.java:159

org.sleuthkit.autopsy.textextractors.StringsTextExtractor.InternationalStream.oneCharBuf
final byte[] oneCharBuf
Definition: StringsTextExtractor.java:364