19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.PushbackReader;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.nio.charset.StandardCharsets;
26 import java.text.Normalizer;
27 import java.util.Iterator;
28 import java.util.NoSuchElementException;
29 import javax.annotation.concurrent.NotThreadSafe;
/**
 * Splits the text supplied by a Reader into Chunk objects. The class is both
 * an Iterator and an Iterable of Chunk. Each chunk is read in two stages
 * (a base chunk and a trailing window), and text can be pushed back onto the
 * underlying PushbackReader so it is read again later.
 */
41 class Chunker
implements Iterator<Chunk>, Iterable<Chunk> {
// Local aliases for the standard charsets used by this class.
44 private static final Charset UTF_16 = StandardCharsets.UTF_16;
45 private static final Charset UTF_8 = StandardCharsets.UTF_8;
// Upper bound for a whole chunk (base + window). It is compared against
// UTF-8 byte counts in the read helpers and is also used as the pushback
// capacity of the reader.
51 private static final int MAX_TOTAL_CHUNK_SIZE = 32760;
// Byte limit for the bulk-read stage of the base chunk (see readBaseChunk).
56 private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024;
// Byte limit for the read-to-whitespace stage of the base chunk.
61 private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024;
// Subtracted from MAX_TOTAL_CHUNK_SIZE for the bulk-read stage of the
// window, leaving this much room for the read-to-whitespace stage.
66 private static final int WHITE_SPACE_BUFFER_SIZE = 512;
// Length of tempChunkBuf and the number of chars requested per bulk read.
70 private static final int READ_CHARS_BUFFER_SIZE = 512;
// Safety margin subtracted from the byte limit in readToWhiteSpaceHelper.
// NOTE(review): presumably headroom for a character growing in byte size
// after normalization / lower-casing -- confirm.
79 private static final int MAX_CHAR_SIZE_INCREASE_IN_BYTES = 10;
// Source of text; a PushbackReader so that over-read text can be unread.
86 private final PushbackReader reader;
// Scratch buffer that reads from the reader are placed into.
90 private final char[] tempChunkBuf =
new char[READ_CHARS_BUFFER_SIZE];
// Running UTF-8 byte size of the text accumulated for the current chunk.
95 private int chunkSizeBytes = 0;
// Running UTF-8 byte size of the lower-cased text for the current chunk.
102 private int lowerCasedChunkSizeBytes = 0;
// Set to true once a read from the reader returns -1.
107 private boolean endOfReaderReached =
false;
// NOTE(review): presumably holds an exception caught in next() and is what
// hasException()/getException() report -- the assignments and accessor
// bodies are not visible in this excerpt; confirm.
111 private Exception ex;
/**
 * Creates a chunker over the given reader.
 *
 * @param reader the source of text; it is wrapped in a PushbackReader whose
 *               pushback buffer holds MAX_TOTAL_CHUNK_SIZE chars so that
 *               text read ahead can be unread
 */
118 Chunker(Reader reader) {
120 this.reader =
new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
/**
 * Iterable entry point, allowing a Chunker to be used in a for-each loop.
 * NOTE(review): the body is not visible in this excerpt; since Chunker itself
 * implements Iterator it presumably returns this instance -- confirm.
 */
124 public Iterator<Chunk> iterator() {
/**
 * Reports whether an exception has been recorded by this chunker.
 * NOTE(review): the body is not visible in this excerpt; presumably it tests
 * the ex field for non-null -- confirm.
 */
134 boolean hasException() {
/**
 * Accessor for the recorded exception.
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * returns the ex field, which may be null when nothing failed -- confirm.
 */
143 public Exception getException() {
/**
 * Reports whether another chunk can be produced. The visible part of the
 * condition requires that the end of the reader has not been reached.
 * NOTE(review): the first operand of the conjunction is on a line not shown
 * in this excerpt; presumably it checks that no exception was recorded --
 * confirm.
 */
148 public boolean hasNext() {
150 && (endOfReaderReached ==
false);
/**
 * Replaces, in place, every char of the given builder that
 * TextUtil.isValidSolrUTF8 rejects with a caret. Each replacement swaps one
 * char for one char, so the length captured before the loop stays valid.
 *
 * @param sb the text to clean; it is modified directly
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the same builder is returned -- confirm.
 */
162 private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
163 final int length = sb.length();
164 for (
int i = 0; i < length; i++) {
165 if (TextUtil.isValidSolrUTF8(sb.charAt(i)) ==
false) {
166 sb.replace(i, i + 1,
"^");
/**
 * Round-trips the string through the UTF-16 charset (encode, then decode) and
 * wraps the result in a new StringBuilder. Per the Charset.encode contract,
 * malformed input such as unpaired surrogates is replaced with the charset's
 * default replacement during encoding.
 *
 * @param s the string to clean
 * @return a new StringBuilder holding the round-tripped text
 */
181 private static StringBuilder replaceInvalidUTF16(String s) {
184 return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
/**
 * Cleans a string in three steps: Unicode NFKC normalization, replacement of
 * invalid UTF-16 via replaceInvalidUTF16, and replacement of chars rejected by
 * TextUtil.isValidSolrUTF8 via sanitizeToUTF8.
 *
 * @param s the raw text
 * @return the result of sanitizeToUTF8 applied to the normalized text
 */
194 static StringBuilder sanitize(String s) {
// NFKC can change the number of chars, so the result length may differ from s.
195 String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
196 return sanitizeToUTF8(replaceInvalidUTF16(normStr));
/**
 * Builds and returns the next chunk: a base chunk followed by a window.
 *
 * @return a Chunk built from the accumulated text, the base-chunk length in
 *         chars, and the lower-cased text
 * @throws NoSuchElementException if hasNext() reports false
 */
200 public Chunk next() {
201 if (hasNext() ==
false) {
202 throw new NoSuchElementException(
"There are no more chunks.");
// Per-chunk state is reset here (only the lower-cased counter's reset is
// visible in this excerpt).
207 lowerCasedChunkSizeBytes = 0;
208 int baseChunkSizeChars = 0;
209 StringBuilder currentChunk =
new StringBuilder();
// The window is collected separately so it can be pushed back afterwards.
210 StringBuilder currentWindow =
new StringBuilder();
// Lower-cased text of both the base chunk and the window goes in here.
211 StringBuilder lowerCasedChunk =
new StringBuilder();
214 readBaseChunk(currentChunk, lowerCasedChunk);
// Record the base length before the window is appended.
215 baseChunkSizeChars = currentChunk.length();
216 readWindow(currentWindow, lowerCasedChunk);
218 currentChunk.append(currentWindow);
219 if (endOfReaderReached) {
// No further chunk will follow, so the whole text counts as base chunk.
223 baseChunkSizeChars = currentChunk.length();
// Push the window back so it is read again at the start of the next chunk.
// NOTE(review): presumably this sits in the else branch of the check
// above; the else line is not visible in this excerpt -- confirm.
226 reader.unread(currentWindow.toString().toCharArray());
228 }
// NOTE(review): the handler body is not visible here; presumably the
// exception is stored in the ex field -- confirm.
catch (Exception ioEx) {
235 return new Chunk(currentChunk, baseChunkSizeChars, lowerCasedChunk);
/**
 * Reads the base part of a chunk in two stages: a bulk read limited by
 * MINIMUM_BASE_CHUNK_SIZE, then a read-to-whitespace stage limited by
 * MAXIMUM_BASE_CHUNK_SIZE.
 *
 * @param currentChunk    receives the sanitized text
 * @param lowerCasedChunk receives the lower-cased text
 * @throws IOException if reading from the underlying reader fails
 */
243 private void readBaseChunk(StringBuilder currentChunk, StringBuilder lowerCasedChunk)
throws IOException {
245 readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
248 readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
/**
 * Reads the window part of a chunk in two stages: a bulk read limited by
 * MAX_TOTAL_CHUNK_SIZE minus WHITE_SPACE_BUFFER_SIZE, then a
 * read-to-whitespace stage limited by MAX_TOTAL_CHUNK_SIZE. The limits apply
 * to the running byte counters, which already include the base chunk.
 *
 * @param currentChunk    receives the sanitized window text
 * @param lowerCasedChunk receives the lower-cased window text
 * @throws IOException if reading from the underlying reader fails
 */
256 private void readWindow(StringBuilder currentChunk, StringBuilder lowerCasedChunk)
throws IOException {
258 readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentChunk, lowerCasedChunk);
261 readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentChunk, lowerCasedChunk);
/**
 * Bulk-read stage: repeatedly reads up to READ_CHARS_BUFFER_SIZE chars,
 * sanitizes them, and appends them while both running UTF-8 byte counters
 * stay below maxBytes.
 *
 * @param maxBytes                 byte limit for both the plain and the
 *                                 lower-cased running sizes
 * @param currentSegment           receives the sanitized text
 * @param currentLowerCasedSegment receives the lower-cased text
 * @throws IOException if reading from or unreading to the reader fails
 */
272 private void readHelper(
int maxBytes, StringBuilder currentSegment, StringBuilder currentLowerCasedSegment)
throws IOException {
// Continue until either counter reaches the limit or the input ends.
275 while ((chunkSizeBytes < maxBytes) && (lowerCasedChunkSizeBytes < maxBytes)
276 && (endOfReaderReached ==
false)) {
// NOTE(review): the declaration of charsRead is not visible in this excerpt.
277 charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
278 if (-1 == charsRead) {
280 endOfReaderReached =
true;
// Avoid splitting a surrogate pair across two reads: a trailing high
// surrogate is pushed back.
// NOTE(review): the matching adjustment of charsRead, if any, is not
// visible here -- confirm it exists, and check the case where the input
// ends with a lone high surrogate.
284 final char lastChar = tempChunkBuf[charsRead - 1];
285 if (Character.isHighSurrogate(lastChar)) {
287 reader.unread(lastChar);
291 StringBuilder chunkSegment = sanitize(
new String(tempChunkBuf, 0, charsRead));
// Size of the sanitized segment in UTF-8 bytes.
294 int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
// NOTE(review): toLowerCase() with no argument uses the default locale, so
// results can vary by platform locale -- consider an explicit locale.
298 String lowerCasedSegment = chunkSegment.toString().toLowerCase();
299 int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
// Append only if both the plain and lower-cased totals stay under the limit.
302 if ((chunkSizeBytes + segmentSize < maxBytes) && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes)) {
304 currentSegment.append(chunkSegment);
305 chunkSizeBytes += segmentSize;
307 currentLowerCasedSegment.append(lowerCasedSegment);
308 lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
// Push the raw (unsanitized) chars back so they can be read again.
// NOTE(review): presumably the else branch for a segment that does not
// fit; the else line and any exit from the loop are not visible -- confirm.
311 reader.unread(tempChunkBuf, 0, charsRead);
/**
 * Read-to-whitespace stage: reads one code point at a time (one char, or two
 * for a surrogate pair), sanitizes it, and appends it until whitespace is
 * appended, the byte limit less MAX_CHAR_SIZE_INCREASE_IN_BYTES is reached, or
 * the input ends.
 *
 * @param maxBytes        byte limit for both running sizes, before the margin
 *                        is subtracted
 * @param currentChunk    receives the sanitized text
 * @param lowerCasedChunk receives the lower-cased text
 * @throws IOException if reading from or unreading to the reader fails
 */
326 private void readToWhiteSpaceHelper(
int maxBytes, StringBuilder currentChunk, StringBuilder lowerCasedChunk)
throws IOException {
328 boolean whitespaceFound =
false;
330 while ((chunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
331 && (lowerCasedChunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
332 && (whitespaceFound ==
false)
333 && (endOfReaderReached ==
false)) {
// Read a single char into slot 0 of the scratch buffer.
// NOTE(review): the declarations of charsRead and chunkSegment are not
// visible in this excerpt.
334 charsRead = reader.read(tempChunkBuf, 0, 1);
335 if (-1 == charsRead) {
337 endOfReaderReached =
true;
341 final char ch = tempChunkBuf[0];
// A high surrogate needs its partner: read one more char into slot 1.
343 if (Character.isHighSurrogate(ch)) {
345 int surrogateCharsRead = reader.read(tempChunkBuf, 1, 1);
// NOTE(review): this addition runs before the -1 check below, so at end of
// input charsRead becomes 0 -- confirm the code that follows tolerates that.
346 charsRead += surrogateCharsRead;
347 if (surrogateCharsRead == -1) {
349 endOfReaderReached =
true;
// Surrogate pair: the segment is both chars.
353 chunkSegment =
new String(tempChunkBuf, 0, 2);
// Single char: the segment is just slot 0.
357 chunkSegment =
new String(tempChunkBuf, 0, 1);
361 StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
// NOTE(review): the size is taken from the unsanitized chunkSegment, while
// the sanitized text is what gets appended (readHelper measures the
// sanitized text) -- confirm this is intended and covered by the
// MAX_CHAR_SIZE_INCREASE_IN_BYTES margin.
363 int segmentSize = chunkSegment.getBytes(UTF_8).length;
// NOTE(review): toLowerCase() with no argument uses the default locale.
367 String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
368 int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
// Append only if both totals stay under the limit less the margin.
371 if ((chunkSizeBytes + segmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
372 && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)) {
375 currentChunk.append(sanitizedChunkSegment);
376 chunkSizeBytes += segmentSize;
378 lowerCasedChunk.append(lowerCasedSegment);
379 lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
// The loop stops once the code point just appended is whitespace.
382 whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
// Push the raw chars back so they can be read again.
// NOTE(review): presumably the else branch for a segment that does not
// fit; the else line and any exit from the loop are not visible -- confirm.
385 reader.unread(tempChunkBuf, 0, charsRead);
// State of a Chunk (the enclosing class header is not visible in this excerpt).
// Full text of the chunk, returned by toString().
398 private final StringBuilder sb;
// Length, in chars, of the base part of the chunk.
399 private final int baseChunkSizeChars;
// Lower-cased text of the chunk.
400 private final StringBuilder lowerCasedChunk;
/**
 * Creates a chunk.
 *
 * @param sb                 the chunk text
 * @param baseChunkSizeChars length in chars of the base part of the chunk
 * @param lowerCasedChunk    the lower-cased chunk text
 * NOTE(review): the assignment of the sb field is not visible in this
 * excerpt; the builders appear to be stored by reference, not copied --
 * confirm.
 */
402 Chunk(StringBuilder sb,
int baseChunkSizeChars, StringBuilder lowerCasedChunk) {
404 this.baseChunkSizeChars = baseChunkSizeChars;
405 this.lowerCasedChunk = lowerCasedChunk;
/**
 * Returns the text of this chunk.
 *
 * @return the contents of the sb builder as a String
 */
414 public String toString() {
415 return sb.toString();
/**
 * Returns the lower-cased text of this chunk.
 * NOTE(review): the method name lacks the t of get; it is part of the public
 * interface, so renaming it would require updating callers.
 *
 * @return the contents of the lowerCasedChunk builder as a String
 */
423 public String geLowerCasedChunk() {
424 return lowerCasedChunk.toString();
/**
 * Returns the length of the base part of this chunk.
 *
 * @return the base chunk length in chars, as passed to the constructor
 */
432 int getBaseChunkLength() {
433 return baseChunkSizeChars;