19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import java.io.IOException;
 
   22 import java.io.PushbackReader;
 
   23 import java.io.Reader;
 
   24 import java.nio.charset.Charset;
 
   25 import java.nio.charset.StandardCharsets;
 
   26 import java.text.Normalizer;
 
   27 import java.util.Iterator;
 
   28 import java.util.NoSuchElementException;
 
   29 import javax.annotation.concurrent.NotThreadSafe;
 
   41 class Chunker 
implements Iterator<Chunk>, Iterable<Chunk> {
 
   // Local aliases for the standard charsets. UTF_16 drives the encode/decode
   // round trip in replaceInvalidUTF16; UTF_8 is used to measure segment sizes
   // in bytes.
   44     private static final Charset UTF_16 = StandardCharsets.UTF_16;

   45     private static final Charset UTF_8 = StandardCharsets.UTF_8;

   // Byte limit that readWindow applies to the whole chunk (base chunk plus
   // window). Also passed to the PushbackReader constructor as its pushback
   // buffer size.
   51     private static final int MAX_TOTAL_CHUNK_SIZE = 32760; 

   // Byte limit for the bulk read performed by readBaseChunk.
   56     private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; 

   // Byte limit used by readBaseChunk while it extends the base chunk one
   // character at a time looking for whitespace.
   61     private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; 

   // Subtracted from MAX_TOTAL_CHUNK_SIZE for the bulk read in readWindow, so
   // that the following whitespace search still has room below the total limit.
   66     private static final int WHITE_SPACE_BUFFER_SIZE = 512; 

   // Length of tempChunkBuf and the number of chars requested per bulk read.
   70     private static final int READ_CHARS_BUFFER_SIZE = 512; 

   // Margin subtracted from the byte limit in readToWhiteSpaceHelper.
   // NOTE(review): presumably headroom for a character whose byte size grows
   // when it is sanitized or lower-cased - confirm.
   79     private static final int MAX_CHAR_SIZE_INCREASE_IN_BYTES = 10; 

   // Source of the text; a PushbackReader so that text which does not fit the
   // current chunk can be unread and picked up again later.
   86     private final PushbackReader reader;

   // Scratch buffer that every read from the reader lands in.
   90     private final char[] tempChunkBuf = 
new char[READ_CHARS_BUFFER_SIZE];

   // Running byte count for the chunk being built; increased each time a
   // segment is appended.
   95     private int chunkSizeBytes = 0;

   // Running UTF-8 byte count of the lower-cased text of the chunk being built;
   // reset to 0 in next().
  102     private int lowerCasedChunkSizeBytes = 0;

   // Set to true as soon as a read from the reader returns -1.
  107     private boolean endOfReaderReached = 
false;

   // NOTE(review): presumably holds the exception caught in next(); the
   // assignment is not visible in this excerpt - confirm.
  111     private Exception ex;
 
  /**
   * Creates a chunker over the given reader. The reader is wrapped in a
   * PushbackReader whose pushback buffer holds MAX_TOTAL_CHUNK_SIZE chars, so
   * text that was read but not kept can be unread.
   *
   * @param reader the source of the text to split into chunks
   */
  118     Chunker(Reader reader) {

  120         this.reader = 
new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
 
  /**
   * Iterable entry point, which lets a Chunker be used in a for-each loop.
   * NOTE(review): the body is not visible in this excerpt; since the class
   * itself implements Iterator of Chunk it presumably returns this - confirm.
   */
  124     public Iterator<Chunk> iterator() {
 
  /**
   * NOTE(review): the body is not visible in this excerpt; presumably reports
   * whether the ex field has been set - confirm.
   */
  134     boolean hasException() {
 
  /**
   * NOTE(review): the body is not visible in this excerpt; presumably returns
   * the ex field, which may be null - confirm.
   */
  143     public Exception getException() {
 
  /**
   * Reports whether another chunk can be produced. The visible part of the
   * condition requires that the end of the reader has not been reached.
   * NOTE(review): the leading operand of the condition is not visible in this
   * excerpt - confirm what else is checked.
   */
  148     public boolean hasNext() {

  150                 && (endOfReaderReached == 
false);
 
  162     private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
 
  163         final int length = sb.length();
 
  164         for (
int i = 0; i < length; i++) {
 
  165             if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == 
false) {
 
  166                 sb.replace(i, i + 1, 
"^");
 
  181     private static StringBuilder replaceInvalidUTF16(String s) {
 
  184         return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
 
  194     static StringBuilder sanitize(String s) {
 
  195         String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
 
  196         return sanitizeToUTF8(replaceInvalidUTF16(normStr));
 
  /**
   * Builds and returns the next chunk: a base chunk followed by a window of
   * additional text. The lower-cased text of both parts is collected in
   * parallel and handed to the Chunk as well.
   *
   * NOTE(review): this excerpt omits several lines of the method (the try
   * opener, the branch structure around the final unread, and the body of the
   * catch block) - read the comments below with that in mind.
   *
   * @return the next chunk
   *
   * @throws NoSuchElementException if hasNext() is false
   */
  200     public Chunk next() {

  201         if (hasNext() == 
false) {

  202             throw new NoSuchElementException(
"There are no more chunks.");

              // Fresh state for the chunk about to be built.
  207         lowerCasedChunkSizeBytes = 0;

  208         int baseChunkSizeChars = 0;

  209         StringBuilder currentChunk = 
new StringBuilder();

  210         StringBuilder currentWindow = 
new StringBuilder();

  211         StringBuilder lowerCasedChunk = 
new StringBuilder();

                  // Base chunk first; its length in chars is remembered before
                  // the window is added.
  214             readBaseChunk(currentChunk, lowerCasedChunk);

  215             baseChunkSizeChars = currentChunk.length(); 

                  // The window goes into its own builder so it can be unread
                  // below, but its lower-cased text joins the same
                  // lowerCasedChunk as the base chunk.
  216             readWindow(currentWindow, lowerCasedChunk);

  218             currentChunk.append(currentWindow);

  219             if (endOfReaderReached) {

                      // Nothing is left to read, so the base length is
                      // extended to cover the window as well.
  223                 baseChunkSizeChars = currentChunk.length();

                      // Pushes the window text back onto the reader so it can
                      // be read again. NOTE(review): presumably this sits in an
                      // else branch that is not visible here - confirm.
  226                 reader.unread(currentWindow.toString().toCharArray());

  228         } 
catch (Exception ioEx) {

              // Reached in the visible code whether or not an exception was
              // caught; the chunk holds whatever was read up to that point.
  235         return new Chunk(currentChunk, baseChunkSizeChars, lowerCasedChunk);
 
  /**
   * Reads the base chunk in two phases: a bulk read up to
   * MINIMUM_BASE_CHUNK_SIZE bytes, then a char-by-char read that stops at
   * whitespace or near MAXIMUM_BASE_CHUNK_SIZE bytes.
   *
   * @param currentChunk    receives the text that was read
   * @param lowerCasedChunk receives the lower-cased form of the same text
   *
   * @throws IOException if reading from the reader fails
   */
  243     private void readBaseChunk(StringBuilder currentChunk, StringBuilder lowerCasedChunk) 
throws IOException {

  245         readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);

  248         readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
 
  /**
   * Reads the window that follows the base chunk in two phases: a bulk read
   * up to MAX_TOTAL_CHUNK_SIZE minus WHITE_SPACE_BUFFER_SIZE bytes, then a
   * char-by-char read that stops at whitespace or near MAX_TOTAL_CHUNK_SIZE
   * bytes. The limits apply to the running byte counts, which already include
   * the base chunk.
   *
   * @param currentChunk    receives the window text
   * @param lowerCasedChunk receives the lower-cased form of the window text
   *
   * @throws IOException if reading from the reader fails
   */
  256     private void readWindow(StringBuilder currentChunk, StringBuilder lowerCasedChunk) 
throws IOException {

  258         readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentChunk, lowerCasedChunk);

  261         readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentChunk, lowerCasedChunk);
 
  /**
   * Bulk read: pulls up to READ_CHARS_BUFFER_SIZE chars at a time from the
   * reader, sanitizes each segment, and appends it (and its lower-cased form)
   * while both running byte counts stay below maxBytes.
   *
   * NOTE(review): this excerpt omits several lines of the method (the
   * declaration of charsRead, branch keywords, and any statements on the
   * omitted lines) - read the comments below with that in mind.
   *
   * @param maxBytes                 byte limit for both the chunk and its
   *                                 lower-cased form
   * @param currentSegment           receives the sanitized text
   * @param currentLowerCasedSegment receives the lower-cased sanitized text
   *
   * @throws IOException if reading from the reader fails
   */
  272     private void readHelper(
int maxBytes, StringBuilder currentSegment, StringBuilder currentLowerCasedSegment) 
throws IOException {

  275         while ((chunkSizeBytes < maxBytes) && (lowerCasedChunkSizeBytes < maxBytes)

  276                 && (endOfReaderReached == 
false)) {

  277             charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);

  278             if (-1 == charsRead) {

                      // Nothing more to read.
  280                 endOfReaderReached = 
true;

                      // A high surrogate at the end of the buffer is pushed
                      // back so it can be read together with the char that
                      // follows it.
                      // NOTE(review): if that surrogate is the very last char
                      // of the input, every later read returns it again and it
                      // is pushed back again, so the reader never reports -1 -
                      // confirm how this case terminates.
  284                 final char lastChar = tempChunkBuf[charsRead - 1];

  285                 if (Character.isHighSurrogate(lastChar)) {

  287                     reader.unread(lastChar);

  291                 StringBuilder chunkSegment = sanitize(
new String(tempChunkBuf, 0, charsRead));

                      // Sizes are measured on the sanitized text, in UTF-8 bytes.
  294                 int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;

                      // NOTE(review): toLowerCase() without a Locale uses the
                      // JVM default locale, so results can differ by machine.
  298                 String lowerCasedSegment = chunkSegment.toString().toLowerCase();

  299                 int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;

                      // Keep the segment only if both forms stay under the limit.
  302                 if ((chunkSizeBytes + segmentSize < maxBytes) && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes)) {

  304                     currentSegment.append(chunkSegment);

  305                     chunkSizeBytes += segmentSize;

  307                     currentLowerCasedSegment.append(lowerCasedSegment);

  308                     lowerCasedChunkSizeBytes += lowerCasedSegmentSize;

                          // Pushes the raw chars of this segment back onto the
                          // reader. NOTE(review): presumably the else branch
                          // for a segment that does not fit - confirm.
  311                     reader.unread(tempChunkBuf, 0, charsRead);
 
  /**
   * Char-by-char read: pulls one char (or one surrogate pair) at a time from
   * the reader and appends its sanitized form until whitespace is found, the
   * reader is exhausted, or a running byte count comes within
   * MAX_CHAR_SIZE_INCREASE_IN_BYTES of maxBytes.
   *
   * NOTE(review): this excerpt omits several lines of the method (the
   * declarations of charsRead and chunkSegment, branch keywords, and any
   * statements on the omitted lines) - read the comments below with that in
   * mind.
   *
   * @param maxBytes        byte limit for both the chunk and its lower-cased
   *                        form
   * @param currentChunk    receives the sanitized text
   * @param lowerCasedChunk receives the lower-cased sanitized text
   *
   * @throws IOException if reading from the reader fails
   */
  326     private void readToWhiteSpaceHelper(
int maxBytes, StringBuilder currentChunk, StringBuilder lowerCasedChunk) 
throws IOException {

  328         boolean whitespaceFound = 
false;

  330         while ((chunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES) 

  331                 && (lowerCasedChunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)

  332                 && (whitespaceFound == 
false)

  333                 && (endOfReaderReached == 
false)) {

  334             charsRead = reader.read(tempChunkBuf, 0, 1);

  335             if (-1 == charsRead) {

                      // Nothing more to read.
  337                 endOfReaderReached = 
true;

  341                 final char ch = tempChunkBuf[0];

                      // A high surrogate needs the next char to form a pair.
  343                 if (Character.isHighSurrogate(ch)) {

  345                     int surrogateCharsRead = reader.read(tempChunkBuf, 1, 1);

                          // Added before the -1 check below, so at end of
                          // input charsRead drops to 0.
  346                     charsRead += surrogateCharsRead;

  347                     if (surrogateCharsRead == -1) {

                              // The input ended right after the high surrogate.
  349                         endOfReaderReached = 
true;

                              // Two chars are in the buffer: use them together.
  353                         chunkSegment = 
new String(tempChunkBuf, 0, 2);

                          // Ordinary single char.
  357                     chunkSegment = 
new String(tempChunkBuf, 0, 1);

  361                 StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);

                      // NOTE(review): the size is measured on the raw segment,
                      // while the sanitized segment is what gets appended
                      // below; readHelper measures the sanitized text instead.
                      // Confirm whether this difference is intended.
  363                 int segmentSize = chunkSegment.getBytes(UTF_8).length;

                      // NOTE(review): toLowerCase() without a Locale uses the
                      // JVM default locale, so results can differ by machine.
  367                 String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();

  368                 int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;

                      // Keep the segment only if both forms stay under the
                      // limit minus the safety margin.
  371                 if ((chunkSizeBytes + segmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)

  372                         && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)) {

  375                     currentChunk.append(sanitizedChunkSegment);

  376                     chunkSizeBytes += segmentSize;

  378                     lowerCasedChunk.append(lowerCasedSegment);

  379                     lowerCasedChunkSizeBytes += lowerCasedSegmentSize;

                          // Whitespace is appended first and then ends the loop.
  382                     whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));

                          // Pushes the raw chars of this segment back onto the
                          // reader. NOTE(review): presumably the else branch
                          // for a segment that does not fit - confirm.
  385                     reader.unread(tempChunkBuf, 0, charsRead);
 
  // Text of the chunk, as returned by toString().
  398         private final StringBuilder sb;

  // Length in chars of the base part of the chunk, as returned by
  // getBaseChunkLength().
  399         private final int baseChunkSizeChars;

  // Lower-cased text of the chunk, as returned by geLowerCasedChunk().
  400         private final StringBuilder lowerCasedChunk;
 
  /**
   * Creates a chunk from its text, the length of its base part, and its
   * lower-cased text.
   * NOTE(review): the assignment of the sb field is not visible in this
   * excerpt - confirm it is set from the sb parameter.
   *
   * @param sb                 the chunk text
   * @param baseChunkSizeChars the length, in chars, of the base chunk
   * @param lowerCasedChunk    the lower-cased chunk text
   */
  402         Chunk(StringBuilder sb, 
int baseChunkSizeChars, StringBuilder lowerCasedChunk) {

  404             this.baseChunkSizeChars = baseChunkSizeChars;

  405             this.lowerCasedChunk = lowerCasedChunk;
 
  /**
   * Returns the text of this chunk.
   *
   * @return the contents of sb as a String
   */
  414         public String toString() {

  415             return sb.toString();
 
  /**
   * Returns the lower-cased text of this chunk.
   * NOTE(review): the method name looks like a typo for getLowerCasedChunk;
   * it is public, so check callers before renaming.
   *
   * @return the contents of lowerCasedChunk as a String
   */
  423         public String geLowerCasedChunk() {

  424             return lowerCasedChunk.toString();
 
  /**
   * Returns the length of the base part of this chunk.
   *
   * @return the value of baseChunkSizeChars, a length in chars
   */
  432         int getBaseChunkLength() {

  433             return baseChunkSizeChars;