19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import java.io.IOException;
 
   22 import java.io.PushbackReader;
 
   23 import java.io.Reader;
 
   24 import java.nio.charset.Charset;
 
   25 import java.nio.charset.StandardCharsets;
 
   26 import java.util.Iterator;
 
   27 import java.util.NoSuchElementException;
 
   28 import javax.annotation.concurrent.NotThreadSafe;
 
   40 class Chunker 
implements Iterator<Chunk>, Iterable<Chunk> {
 
   43     private static final Charset UTF_16 = StandardCharsets.UTF_16;
 
   44     private static final Charset UTF_8 = StandardCharsets.UTF_8;
 
   48     private static final int MAX_TOTAL_CHUNK_SIZE = 32760; 
 
   51     private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; 
 
   54     private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; 
 
   57     private static final int WHITE_SPACE_BUFFER_SIZE = 512; 
 
   59     private static final int READ_CHARS_BUFFER_SIZE = 512; 
 
   64     private final PushbackReader reader;
 
   66     private final char[] tempChunkBuf = 
new char[READ_CHARS_BUFFER_SIZE];
 
   69     private int chunkSizeBytes = 0;
 
   72     private boolean endOfReaderReached = 
false;
 
   81     Chunker(Reader reader) {
 
   83         this.reader = 
new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
 
   87     public Iterator<Chunk> iterator() {
 
   97     boolean hasException() {
 
  106     public Exception getException() {
 
  111     public boolean hasNext() {
 
  113                 && (endOfReaderReached == 
false);
 
  125     private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
 
  126         final int length = sb.length();
 
  127         for (
int i = 0; i < length; i++) {
 
  128             if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == 
false) {
 
  129                 sb.replace(i, i + 1, 
"^");
 
  144     private static StringBuilder replaceInvalidUTF16(String s) {
 
  147         return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
 
  150     private static StringBuilder sanitize(String s) {
 
  151         return sanitizeToUTF8(replaceInvalidUTF16(s));
 
  155     public Chunk next() {
 
  156         if (hasNext() == 
false) {
 
  157             throw new NoSuchElementException(
"There are no more chunks.");
 
  162         int baseChunkSizeChars = 0;
 
  163         StringBuilder currentChunk = 
new StringBuilder();
 
  164         StringBuilder currentWindow = 
new StringBuilder();
 
  167             currentChunk.append(readBaseChunk());
 
  168             baseChunkSizeChars = currentChunk.length(); 
 
  169             currentWindow.append(readWindow());
 
  171         currentChunk.append(currentWindow);
 
  172             if (endOfReaderReached) {
 
  176                 baseChunkSizeChars = currentChunk.length();
 
  179                 reader.unread(currentWindow.toString().toCharArray());
 
  181         } 
catch (Exception ioEx) {
 
  188         return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
 
  196     private StringBuilder readBaseChunk() throws IOException {
 
  197         StringBuilder currentChunk = 
new StringBuilder();
 
  199         readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk);
 
  202         readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk);
 
  211     private StringBuilder readWindow() throws IOException {
 
  212         StringBuilder currentWindow = 
new StringBuilder();
 
  214         readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow);
 
  217         readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow);
 
  218         return currentWindow;
 
  229     private void readHelper(
int maxBytes, StringBuilder currentSegment) 
throws IOException {
 
  232         while ((chunkSizeBytes < maxBytes)
 
  233                 && (endOfReaderReached == 
false)) {
 
  234             charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
 
  235             if (-1 == charsRead) {
 
  237                 endOfReaderReached = 
true;
 
  241                 final char lastChar = tempChunkBuf[charsRead - 1];
 
  242                 if (Character.isHighSurrogate(lastChar)) {
 
  244                     reader.unread(lastChar);
 
  248                 StringBuilder chunkSegment = sanitize(
new String(tempChunkBuf, 0, charsRead));
 
  251                 int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
 
  254                 if (chunkSizeBytes + segmentSize < maxBytes) {
 
  256                     currentSegment.append(chunkSegment);
 
  257                     chunkSizeBytes += segmentSize;
 
  260                     reader.unread(tempChunkBuf, 0, charsRead);
 
  275     private void readToWhiteSpaceHelper(
int maxBytes, StringBuilder currentChunk) 
throws IOException {
 
  277         boolean whitespaceFound = 
false;
 
  279         while ((chunkSizeBytes < maxBytes)
 
  280                 && (whitespaceFound == 
false)
 
  281                 && (endOfReaderReached == 
false)) {
 
  282             charsRead = reader.read(tempChunkBuf, 0, 1);
 
  283             if (-1 == charsRead) {
 
  285                 endOfReaderReached = 
true;
 
  289                 final char ch = tempChunkBuf[0];
 
  291                 if (Character.isHighSurrogate(ch)) {
 
  293                     charsRead = reader.read(tempChunkBuf, 1, 1);
 
  294                     if (charsRead == -1) {
 
  296                         endOfReaderReached = 
true;
 
  300                         chunkSegment = 
new String(tempChunkBuf, 0, 2);
 
  304                     chunkSegment = 
new String(tempChunkBuf, 0, 1);
 
  308                 StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
 
  310                 whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
 
  312                 currentChunk.append(sanitizedChunkSegment);
 
  313                 chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
 
  324         private final StringBuilder sb;
 
  325         private final int baseChunkSizeChars;
 
  326         private final int chunkSizeBytes;
 
  328         Chunk(StringBuilder sb, 
int baseChunkSizeChars, 
int chunkSizeBytes) {
 
  330             this.baseChunkSizeChars = baseChunkSizeChars;
 
  331             this.chunkSizeBytes = chunkSizeBytes;
 
  340         public String toString() {
 
  341             return sb.toString();
 
  349         public int getChunkSizeBytes() {
 
  350             return chunkSizeBytes;
 
  358         int getBaseChunkLength() {
 
  359             return baseChunkSizeChars;