api-docs/4.21.0/_chunker_8java_source.html

 /*

  * Autopsy Forensic Browser

  *

  * Copyright 2011-2018 Basis Technology Corp.

  * Contact: carrier <at> sleuthkit <dot> org

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 package org.sleuthkit.autopsy.keywordsearch;


 import java.io.IOException;

 import java.io.PushbackReader;

 import java.io.Reader;

 import java.nio.charset.Charset;

 import java.nio.charset.StandardCharsets;

 import java.text.Normalizer;

 import java.util.Iterator;

 import java.util.NoSuchElementException;

 import javax.annotation.concurrent.NotThreadSafe;

 import org.sleuthkit.autopsy.coreutils.TextUtil;

 import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;


 /**
  * Breaks the text supplied by a Reader into a sequence of {@link Chunk}s.
  *
  * Each chunk is made of a "base" portion followed by a "window" of additional
  * text. Unless the end of the reader was reached while building a chunk, the
  * window is pushed back onto the reader afterwards, so the same text is read
  * again at the start of the next chunk and consecutive chunks overlap. Chunk
  * sizes are tracked in UTF-8 bytes, both for the sanitized text and for its
  * lower-cased form.
  *
  * A Chunker is its own iterator and can therefore be iterated only once.
  * Exceptions raised while chunking are not thrown from next(); they are
  * stored, make hasNext() return false, and are available through
  * hasException() / getException().
  */
 @NotThreadSafe
 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {

     //local references to standard encodings
     private static final Charset UTF_16 = StandardCharsets.UTF_16;
     private static final Charset UTF_8 = StandardCharsets.UTF_8;

     //Chunking algorithm parameters-------------------------------------//
     /** Upper bound, in bytes, on a whole chunk (base plus window). */
     private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
     /** Bulk reading of the base chunk stops once this many bytes are reached. */
     private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
     /** Limit for the char-by-char search for white space that ends the base chunk. */
     private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
     /** Room left at the end of the window to look for white space to break at. */
     private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
     /** Number of chars requested from the reader per bulk read. */
     private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
     /** Safety margin subtracted from the byte limit while reading char by char. */
     private static final int MAX_CHAR_SIZE_INCREASE_IN_BYTES = 10; //bytes

     /** The source of text; pushback is used to "unread" text that did not fit. */
     private final PushbackReader reader;
     /** Scratch buffer shared by the read helpers. */
     private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];

     /** Size, in UTF-8 bytes, of the sanitized text added to the chunk being built. */
     private int chunkSizeBytes = 0;

     /** Size, in UTF-8 bytes, of the lower-cased text of the chunk being built. */
     private int lowerCasedChunkSizeBytes = 0;
     /** Set once the reader has reported end of input. */
     private boolean endOfReaderReached = false;
     /** The exception, if any, that interrupted chunking. */
     private Exception ex;

     /**
      * Creates a Chunker that will chunk the text supplied by the given reader.
      *
      * @param reader The source of the text to chunk.
      */
     Chunker(Reader reader) {
         //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill.
         this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
     }

     /**
      * Returns this Chunker itself, so it can be used in a for-each loop. The
      * same instance is returned every time, so the chunks can only be
      * traversed once.
      *
      * @return This Chunker.
      */
     @Override
     public Iterator<Chunk> iterator() {
         return this;
     }

     /**
      * Indicates whether chunking was interrupted by an exception.
      *
      * @return True if an exception was recorded while reading a chunk.
      */
     boolean hasException() {
         return ex != null;
     }

     /**
      * Gets the exception that interrupted chunking.
      *
      * @return The recorded exception, or null if there was none.
      */
     public Exception getException() {
         return ex;
     }

     /**
      * Indicates whether another chunk can be requested.
      *
      * @return True if no exception has been recorded and the end of the
      *         reader has not been reached yet.
      */
     @Override
     public boolean hasNext() {
         return (ex == null)
                 && (endOfReaderReached == false);
     }

     /**
      * Replaces, in place, every char that TextUtil.isValidSolrUTF8() rejects
      * with '^'.
      *
      * @param sb The text to clean up; it is modified and returned.
      *
      * @return The same StringBuilder that was passed in.
      */
     private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
         final int length = sb.length();
         for (int i = 0; i < length; i++) {
             if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
                 //one char is swapped for one char, so the length never changes.
                 sb.setCharAt(i, '^');
             }
         }
         return sb;
     }

     /**
      * Replaces invalid UTF-16 sequences (e.g. unpaired surrogates) in the
      * given string.
      *
      * @param s The string to clean up.
      *
      * @return A new StringBuilder holding the cleaned up text.
      */
     private static StringBuilder replaceInvalidUTF16(String s) {
         /* encode the string to UTF-16 which does the replacement, see
          * Charset.encode(), then decode back to a StringBuilder. */
         return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
     }

     /**
      * Cleans up a piece of text: applies NFKC normalization, replaces
      * invalid UTF-16 sequences, and finally replaces every char rejected by
      * TextUtil.isValidSolrUTF8() with '^'.
      *
      * @param s The text to sanitize.
      *
      * @return A new StringBuilder holding the sanitized text.
      */
     static StringBuilder sanitize(String s) {
         String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
         return sanitizeToUTF8(replaceInvalidUTF16(normStr));
     }

     /**
      * Reads and returns the next chunk.
      *
      * If an exception is raised while reading, it is recorded instead of
      * thrown (see getException()) and a Chunk holding whatever text was
      * gathered up to that point is returned.
      *
      * @return The next chunk.
      *
      * @throws NoSuchElementException If hasNext() is false.
      */
     @Override
     public Chunk next() {
         if (hasNext() == false) {
             throw new NoSuchElementException("There are no more chunks.");
         }

         //reset state for the next chunk
         chunkSizeBytes = 0;
         lowerCasedChunkSizeBytes = 0;
         int baseChunkSizeChars = 0;
         StringBuilder currentChunk = new StringBuilder();
         StringBuilder currentWindow = new StringBuilder();
         StringBuilder lowerCasedChunk = new StringBuilder();

         try {
             readBaseChunk(currentChunk, lowerCasedChunk);
             baseChunkSizeChars = currentChunk.length(); //save the base chunk length
             readWindow(currentWindow, lowerCasedChunk);
             //add the window text to the current chunk.
             currentChunk.append(currentWindow);
             if (endOfReaderReached) {
                 /* if we have reached the end of the content, we won't make
                  * another overlapping chunk, so the length of the base chunk
                  * can be extended to the end. */
                 baseChunkSizeChars = currentChunk.length();
             } else {
                 /* otherwise we will make another chunk, so unread the window */
                 reader.unread(currentWindow.toString().toCharArray());
             }
         } catch (Exception ioEx) {
             /* Save the exception, which will cause hasNext() to return false,
              * and break any chunking loop in client code. */
             ex = ioEx;
         }

         //the text was already sanitized while reading; wrap it in a Chunk
         //that also records the base chunk length.
         return new Chunk(currentChunk, baseChunkSizeChars, lowerCasedChunk);
     }

     /**
      * Reads the base portion of a chunk.
      *
      * @param currentChunk    Receives the sanitized text.
      * @param lowerCasedChunk Receives the lower-cased sanitized text.
      *
      * @throws IOException If reading from or unreading to the reader fails.
      */
     private void readBaseChunk(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
         //read the chunk until the minimum base chunk size
         readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);

         //keep reading until the maximum base chunk size or white space is reached.
         readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
     }

     /**
      * Reads the window portion of a chunk, i.e. the text that follows the
      * base chunk.
      *
      * @param currentChunk    Receives the sanitized window text.
      * @param lowerCasedChunk Receives the lower-cased sanitized window text.
      *
      * @throws IOException If reading from or unreading to the reader fails.
      */
     private void readWindow(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
         //read the window, leaving some room to look for white space to break at.
         readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentChunk, lowerCasedChunk);

         //keep reading until the max chunk size, or until whitespace is reached.
         readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentChunk, lowerCasedChunk);
     }

     /**
      * Reads text in blocks of up to READ_CHARS_BUFFER_SIZE chars until the
      * chunk would reach maxBytes or the reader is exhausted. A block that
      * does not fit is pushed back onto the reader.
      *
      * @param maxBytes                 Byte limit for both the chunk and its
      *                                 lower-cased form.
      * @param currentSegment           Receives the sanitized text.
      * @param currentLowerCasedSegment Receives the lower-cased sanitized text.
      *
      * @throws IOException If reading from or unreading to the reader fails.
      */
     private void readHelper(int maxBytes, StringBuilder currentSegment, StringBuilder currentLowerCasedSegment) throws IOException {
         //read chars up to maxBytes, or the end of the reader.
         while ((chunkSizeBytes < maxBytes) && (lowerCasedChunkSizeBytes < maxBytes)
                 && (endOfReaderReached == false)) {
             int charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
             if (-1 == charsRead) {
                 //this is the last chunk
                 endOfReaderReached = true;
                 return;
             }

             //the last char might be the first half of a surrogate pair.
             final char lastChar = tempChunkBuf[charsRead - 1];
             if (Character.isHighSurrogate(lastChar)) {
                 if (charsRead > 1) {
                     //leave it in the reader so it is read together with its partner.
                     charsRead--;
                     reader.unread(lastChar);
                 } else {
                     /* Only the high surrogate was read. Unreading it and
                      * looping would return the very same char again when the
                      * underlying reader is exhausted, and never terminate.
                      * Try to complete the pair instead. */
                     int extraCharsRead = reader.read(tempChunkBuf, 1, 1);
                     if (-1 == extraCharsRead) {
                         //this is the last chunk, so just drop the unpaired surrogate
                         endOfReaderReached = true;
                         return;
                     }
                     if (extraCharsRead == 1) {
                         if (Character.isHighSurrogate(tempChunkBuf[1])) {
                             /* the second char starts a pair of its own: keep
                              * it for the next read; the first one is
                              * unpaired and gets replaced by sanitize(). */
                             reader.unread(tempChunkBuf[1]);
                         } else {
                             charsRead = 2;
                         }
                     }
                 }
             }

             //cleanup any invalid utf-16 sequences
             StringBuilder chunkSegment = sanitize(new String(tempChunkBuf, 0, charsRead));

             //get the length in utf8 bytes of the read chars
             String chunkSegmentText = chunkSegment.toString();
             int segmentSize = chunkSegmentText.getBytes(UTF_8).length;

             // lower case the string and get its size. NOTE: lower casing can
             // change the size of the string!
             // NOTE(review): toLowerCase() uses the default locale - confirm
             // this matches how callers lower-case the terms they search for.
             String lowerCasedSegment = chunkSegmentText.toLowerCase();
             int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;

             //if it will put us past maxBytes
             if ((chunkSizeBytes + segmentSize >= maxBytes) || (lowerCasedChunkSizeBytes + lowerCasedSegmentSize >= maxBytes)) {
                 //unread it, and break out of read loop.
                 reader.unread(tempChunkBuf, 0, charsRead);
                 return;
             }

             //add it to the chunk
             currentSegment.append(chunkSegment);
             chunkSizeBytes += segmentSize;

             currentLowerCasedSegment.append(lowerCasedSegment);
             lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
         }
     }

     /**
      * Reads text one code point at a time until white space has been added
      * to the chunk, the chunk would reach maxBytes (less a safety margin of
      * MAX_CHAR_SIZE_INCREASE_IN_BYTES), or the reader is exhausted. Text that
      * does not fit is pushed back onto the reader.
      *
      * @param maxBytes        Byte limit for both the chunk and its
      *                        lower-cased form.
      * @param currentChunk    Receives the sanitized text.
      * @param lowerCasedChunk Receives the lower-cased sanitized text.
      *
      * @throws IOException If reading from or unreading to the reader fails.
      */
     private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
         final int byteLimit = maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES;
         boolean whitespaceFound = false;
         //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
         while ((chunkSizeBytes < byteLimit)
                 && (lowerCasedChunkSizeBytes < byteLimit)
                 && (whitespaceFound == false)
                 && (endOfReaderReached == false)) {
             int charsRead = reader.read(tempChunkBuf, 0, 1);
             if (-1 == charsRead) {
                 //this is the last chunk
                 endOfReaderReached = true;
                 return;
             }

             //if the char might be part of a surrogate pair, read another char
             final char ch = tempChunkBuf[0];
             String chunkSegment;
             if (Character.isHighSurrogate(ch)) {
                 //read another char into the buffer.
                 int surrogateCharsRead = reader.read(tempChunkBuf, 1, 1);
                 if (surrogateCharsRead == -1) {
                     //this is the last chunk, so just drop the unpaired surrogate
                     endOfReaderReached = true;
                     return;
                 }
                 //only count the second char once we know it was really read.
                 charsRead += surrogateCharsRead;
                 //try to use the pair together.
                 chunkSegment = new String(tempChunkBuf, 0, charsRead);
             } else {
                 //one char
                 chunkSegment = new String(tempChunkBuf, 0, 1);
             }

             //cleanup any invalid utf-16 sequences
             StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
             String sanitizedText = sanitizedChunkSegment.toString();

             /* get the length in utf8 bytes of the text that is actually
              * appended: sanitizing (NFKC normalization, replacements) can
              * change the byte length, so the raw chars must not be measured. */
             int segmentSize = sanitizedText.getBytes(UTF_8).length;

             // lower case the string and get its size. NOTE: lower casing can
             // change the size of the string.
             String lowerCasedSegment = sanitizedText.toLowerCase();
             int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;

             //if it will put us past the limit
             if ((chunkSizeBytes + segmentSize >= byteLimit)
                     || (lowerCasedChunkSizeBytes + lowerCasedSegmentSize >= byteLimit)) {
                 //unread it, and break out of read loop.
                 reader.unread(tempChunkBuf, 0, charsRead);
                 return;
             }

             //add read chars to the chunk and update the length.
             currentChunk.append(sanitizedChunkSegment);
             chunkSizeBytes += segmentSize;

             lowerCasedChunk.append(lowerCasedSegment);
             lowerCasedChunkSizeBytes += lowerCasedSegmentSize;

             //check for whitespace.
             whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
         }
     }

     /**
      * A piece of text produced by a Chunker: the chunk text, its lower-cased
      * form, and the length in chars of its base portion. It also carries a
      * hit flag and a chunk id that are set by client code.
      */
     static class Chunk {

         private final StringBuilder sb;
         private final int baseChunkSizeChars;
         private final StringBuilder lowerCasedChunk;
         private boolean hasHit = false;
         private int chunkId = 0;

         /**
          * Creates a chunk.
          *
          * @param sb                 The text of the chunk (base plus window).
          * @param baseChunkSizeChars The length, in chars, of the base portion.
          * @param lowerCasedChunk    The lower-cased text of the chunk.
          */
         Chunk(StringBuilder sb, int baseChunkSizeChars, StringBuilder lowerCasedChunk) {
             this.sb = sb;
             this.baseChunkSizeChars = baseChunkSizeChars;
             this.lowerCasedChunk = lowerCasedChunk;
         }

         /**
          * Gets the text of the chunk.
          *
          * @return The chunk text.
          */
         @Override
         public String toString() {
             return sb.toString();
         }

         /**
          * Gets the lower-cased text of the chunk.
          *
          * @return The lower-cased chunk text.
          */
         public String getLowerCasedChunk() {
             return lowerCasedChunk.toString();
         }

         /**
          * Gets the length of the base portion of the chunk.
          *
          * @return The base chunk length, in chars.
          */
         int getBaseChunkLength() {
             return baseChunkSizeChars;
         }

         /**
          * Gets the hit flag of this chunk.
          *
          * @return The value last passed to setHasHit(), false by default.
          */
         boolean hasHit() {
             return hasHit;
         }

         /**
          * Sets the hit flag of this chunk.
          *
          * @param b The new value of the flag.
          */
         void setHasHit(boolean b) {
             hasHit = b;
         }

         /**
          * Sets the id of this chunk.
          *
          * @param id The chunk id.
          */
         void setChunkId(int id) {
             chunkId = id;
         }

         /**
          * Gets the id of this chunk.
          *
          * @return The value last passed to setChunkId(), 0 by default.
          */
         int getChunkId() {
             return chunkId;
         }
     }
 }

org.sleuthkit

org

org.sleuthkit.autopsy.coreutils
Definition: AppSQLiteDB.java:19

org.sleuthkit.autopsy.keywordsearch.Chunker
Definition: Chunker.java:41

org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk
Definition: Chunker.java:396

org.sleuthkit.autopsy.coreutils.TextUtil
Definition: TextUtil.java:26

org.sleuthkit.autopsy

org.sleuthkit.autopsy.keywordsearch
Definition: AccountsText.java:19