19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.PushbackReader;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.nio.charset.StandardCharsets;
26 import java.text.Normalizer;
27 import java.util.Iterator;
28 import java.util.NoSuchElementException;
29 import javax.annotation.concurrent.NotThreadSafe;
/**
 * Splits the text supplied by a Reader into Chunk objects. The class is both
 * an Iterator<Chunk> and an Iterable<Chunk>. Each call to next() reads a base
 * chunk followed by a window of additional text, tracking the UTF-8 byte size
 * of both the text and its lower-cased form against the limits below.
 *
 * Instances carry mutable per-chunk state (size counters, end-of-reader flag,
 * stored exception) with no synchronization visible in this excerpt.
 */
41 class Chunker
implements Iterator<Chunk>, Iterable<Chunk> {
// Charset used for the encode/decode round trip in replaceInvalidUTF16().
44 private static final Charset UTF_16 = StandardCharsets.UTF_16;
// Charset used to measure the encoded byte length of each text segment.
45 private static final Charset UTF_8 = StandardCharsets.UTF_8;
// Byte limit for a whole chunk (base chunk plus window); also used as the
// capacity, in chars, of the PushbackReader's pushback buffer.
51 private static final int MAX_TOTAL_CHUNK_SIZE = 32760;
// Byte limit for the bulk-read phase of the base chunk.
56 private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024;
// Byte limit for the read-to-whitespace phase of the base chunk.
61 private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024;
// Bytes held back from MAX_TOTAL_CHUNK_SIZE during the bulk read of the
// window, leaving room for the read-to-whitespace phase that follows.
66 private static final int WHITE_SPACE_BUFFER_SIZE = 512;
// Size of the scratch buffer and the maximum number of chars per bulk read.
70 private static final int READ_CHARS_BUFFER_SIZE = 512;
// Safety margin subtracted from the byte limit in readToWhiteSpaceHelper().
// NOTE(review): presumably the most one sanitized/lower-cased character can
// grow in UTF-8 bytes - confirm against the original comment.
79 private static final int MAX_CHAR_SIZE_INCREASE_IN_BYTES = 10;
// Pushback-capable wrapper around the source reader; text that does not fit
// in the current chunk is unread back into it.
86 private final PushbackReader reader;
// Scratch buffer that receives the chars of each read() call.
90 private final char[] tempChunkBuf =
new char[READ_CHARS_BUFFER_SIZE];
// Running UTF-8 byte size of the text accumulated for the current chunk.
95 private int chunkSizeBytes = 0;
// Running UTF-8 byte size of the lower-cased text accumulated for the current
// chunk; tracked separately because it is measured on the lower-cased string.
102 private int lowerCasedChunkSizeBytes = 0;
// Set to true once a read() on the reader returns -1.
107 private boolean endOfReaderReached =
false;
// Exception exposed through hasException()/getException(). The assignment is
// not part of this excerpt; presumably set in next()'s catch block - confirm.
111 private Exception ex;
118 Chunker(Reader reader) {
120 this.reader =
new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
/**
 * Supplies the Iterator required by the Iterable<Chunk> interface this class
 * declares. The method body is not part of this excerpt; presumably it
 * returns this instance, since the class is itself an Iterator<Chunk> -
 * TODO confirm.
 */
124 public Iterator<Chunk> iterator() {
/**
 * Reports whether this Chunker has an exception to hand out. The method body
 * is not part of this excerpt; presumably it tests whether the ex field is
 * non-null - TODO confirm.
 */
134 boolean hasException() {
/**
 * Accessor for the exception associated with this Chunker. The method body
 * is not part of this excerpt; presumably it returns the ex field, which
 * would be null when nothing has been recorded - TODO confirm.
 */
143 public Exception getException() {
/**
 * Iterator contract: tells the caller whether next() may be called.
 *
 * The visible part of the condition requires that the end of the reader has
 * not been reached. The first operand of the && is on a line that is not part
 * of this excerpt; presumably it requires that no exception has been stored -
 * TODO confirm.
 */
148 public boolean hasNext() {
150 && (endOfReaderReached ==
false);
162 private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
163 final int length = sb.length();
164 for (
int i = 0; i < length; i++) {
165 if (TextUtil.isValidSolrUTF8(sb.charAt(i)) ==
false) {
166 sb.replace(i, i + 1,
"^");
181 private static StringBuilder replaceInvalidUTF16(String s) {
184 return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
187 private static StringBuilder sanitize(String s) {
188 String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
189 return sanitizeToUTF8(replaceInvalidUTF16(normStr));
/**
 * Reads and returns the next Chunk: a base chunk followed by a window of
 * additional text. The returned Chunk carries the full text (base + window),
 * the length in chars of the base portion, and the lower-cased text.
 *
 * Several lines of this method (the try opener, the else branch and the body
 * of the catch block) are not part of this excerpt; comments about them are
 * marked as assumptions.
 *
 * @return The next Chunk.
 *
 * @throws NoSuchElementException if hasNext() returns false.
 */
194 public Chunk next() {
195 if (hasNext() ==
false) {
196 throw new NoSuchElementException(
"There are no more chunks.");
// Reset the per-chunk byte counter for the lower-cased text.
// NOTE(review): no reset of chunkSizeBytes is visible in this excerpt, yet
// the read loops also test that counter - confirm it is reset here as well.
201 lowerCasedChunkSizeBytes = 0;
202 int baseChunkSizeChars = 0;
203 StringBuilder currentChunk =
new StringBuilder();
204 StringBuilder currentWindow =
new StringBuilder();
205 StringBuilder lowerCasedChunk =
new StringBuilder();
// Read the base chunk and remember how many chars it holds.
208 readBaseChunk(currentChunk, lowerCasedChunk);
209 baseChunkSizeChars = currentChunk.length();
// Read the window into its own builder; its lower-cased form is appended to
// the same lowerCasedChunk builder as the base chunk.
210 readWindow(currentWindow, lowerCasedChunk);
// Append the window text to the chunk text.
212 currentChunk.append(currentWindow);
213 if (endOfReaderReached) {
// The base length is extended to cover everything that was read.
217 baseChunkSizeChars = currentChunk.length();
// Push the window text back into the reader so it can be read again.
// NOTE(review): presumably this sits in an else branch whose opening line is
// not part of this excerpt - confirm.
220 reader.unread(currentWindow.toString().toCharArray());
// NOTE(review): the catch body is not part of this excerpt; presumably the
// exception is stored in the ex field rather than rethrown - confirm.
222 }
catch (Exception ioEx) {
// Package the text, the base length in chars, and the lower-cased text.
229 return new Chunk(currentChunk, baseChunkSizeChars, lowerCasedChunk);
237 private void readBaseChunk(StringBuilder currentChunk, StringBuilder lowerCasedChunk)
throws IOException {
239 readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
242 readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
250 private void readWindow(StringBuilder currentChunk, StringBuilder lowerCasedChunk)
throws IOException {
252 readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentChunk, lowerCasedChunk);
255 readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentChunk, lowerCasedChunk);
/**
 * Bulk-reads text from the reader in blocks of up to READ_CHARS_BUFFER_SIZE
 * chars, appending each sanitized block and its lower-cased form to the given
 * builders while both running UTF-8 byte counts stay under maxBytes.
 *
 * Some lines of this method (the declaration of charsRead, the branch and
 * loop exits, and part of the surrogate handling) are not part of this
 * excerpt; comments about them are marked as assumptions.
 *
 * @param maxBytes                 Byte limit applied to both the text and its
 *                                 lower-cased form.
 * @param currentSegment           Receives the sanitized text.
 * @param currentLowerCasedSegment Receives the lower-cased sanitized text.
 *
 * @throws IOException If reading from, or unreading to, the reader fails.
 */
266 private void readHelper(
int maxBytes, StringBuilder currentSegment, StringBuilder currentLowerCasedSegment)
throws IOException {
// Keep reading while both byte counts are under the limit and input remains.
269 while ((chunkSizeBytes < maxBytes) && (lowerCasedChunkSizeBytes < maxBytes)
270 && (endOfReaderReached ==
false)) {
271 charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
272 if (-1 == charsRead) {
// read() returned -1: the reader is exhausted.
274 endOfReaderReached =
true;
// A trailing high surrogate may be the first half of a pair whose second half
// has not been read yet, so it is pushed back into the reader.
// NOTE(review): presumably charsRead is also decremented on a line that is
// not part of this excerpt, so the pushed-back char is left out of the
// segment built below - confirm.
278 final char lastChar = tempChunkBuf[charsRead - 1];
279 if (Character.isHighSurrogate(lastChar)) {
281 reader.unread(lastChar);
// Sanitize the chars that were read.
285 StringBuilder chunkSegment = sanitize(
new String(tempChunkBuf, 0, charsRead));
// Size of the sanitized segment in UTF-8 bytes.
288 int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
// The lower-cased form is measured separately from the original segment.
// NOTE(review): toLowerCase() without a Locale argument uses the JVM default
// locale, so results can vary by platform - confirm this is intended.
292 String lowerCasedSegment = chunkSegment.toString().toLowerCase();
293 int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
// Accept the segment only if both running totals stay under maxBytes.
296 if ((chunkSizeBytes + segmentSize < maxBytes) && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes)) {
298 currentSegment.append(chunkSegment);
299 chunkSizeBytes += segmentSize;
301 currentLowerCasedSegment.append(lowerCasedSegment);
302 lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
// Push the raw (unsanitized) chars back into the reader.
// NOTE(review): presumably this is the else branch, followed by an exit from
// the loop; neither line is part of this excerpt - confirm.
305 reader.unread(tempChunkBuf, 0, charsRead);
/**
 * Reads from the reader one character at a time (two chars when the first is
 * a high surrogate), appending the sanitized text and its lower-cased form to
 * the given builders. Reading stops when whitespace is found, when either
 * running UTF-8 byte count comes within MAX_CHAR_SIZE_INCREASE_IN_BYTES of
 * maxBytes, or when the reader is exhausted.
 *
 * Some lines of this method (the declarations of charsRead and chunkSegment,
 * and the branch structure around the surrogate handling) are not part of
 * this excerpt; comments about them are marked as assumptions.
 *
 * @param maxBytes        Byte limit applied to both the text and its
 *                        lower-cased form.
 * @param currentChunk    Receives the sanitized text.
 * @param lowerCasedChunk Receives the lower-cased sanitized text.
 *
 * @throws IOException If reading from the reader fails.
 */
320 private void readToWhiteSpaceHelper(
int maxBytes, StringBuilder currentChunk, StringBuilder lowerCasedChunk)
throws IOException {
322 boolean whitespaceFound =
false;
// The limit is reduced by a safety margin so that the next character's bytes
// cannot push either count past maxBytes.
324 while ((chunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
325 && (lowerCasedChunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
326 && (whitespaceFound ==
false)
327 && (endOfReaderReached ==
false)) {
// Read a single char into the first slot of the scratch buffer.
328 charsRead = reader.read(tempChunkBuf, 0, 1);
329 if (-1 == charsRead) {
// read() returned -1: the reader is exhausted.
331 endOfReaderReached =
true;
335 final char ch = tempChunkBuf[0];
// A high surrogate needs the following char to form a full code point, so a
// second char is read into the next slot of the scratch buffer.
337 if (Character.isHighSurrogate(ch)) {
339 charsRead = reader.read(tempChunkBuf, 1, 1);
340 if (charsRead == -1) {
// The reader ended before the second half of the pair arrived.
// NOTE(review): what happens to the lone high surrogate here is on lines that
// are not part of this excerpt - confirm.
342 endOfReaderReached =
true;
// Two-char segment: the surrogate pair.
346 chunkSegment =
new String(tempChunkBuf, 0, 2);
// One-char segment.
// NOTE(review): presumably the else branch of the high-surrogate test; the
// else line is not part of this excerpt - confirm.
350 chunkSegment =
new String(tempChunkBuf, 0, 1);
354 StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
// The whitespace test uses the first code point of the sanitized segment.
356 whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
358 currentChunk.append(sanitizedChunkSegment);
359 chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
// The lower-cased form is measured separately from the original segment.
// NOTE(review): toLowerCase() without a Locale argument uses the JVM default
// locale, so results can vary by platform - confirm this is intended.
363 String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
364 lowerCasedChunk.append(lowerCasedSegment);
365 lowerCasedChunkSizeBytes += lowerCasedSegment.getBytes(UTF_8).length;
// State and constructor of the Chunk type that Chunker.next() returns. The
// type's header line is not part of this excerpt.

// Text returned by toString().
376 private final StringBuilder sb;
// Value returned by getBaseChunkLength(); Chunker.next() passes in the char
// length of the base portion of the chunk.
377 private final int baseChunkSizeChars;
// Lower-cased form of the chunk text, returned by geLowerCasedChunk().
378 private final StringBuilder lowerCasedChunk;
/**
 * Stores the pieces of a chunk. The builders are kept by reference; no copy
 * is made in the lines visible here.
 *
 * NOTE(review): the assignment of the sb field is not part of this excerpt;
 * presumably it is this.sb = sb, since the field is final - confirm.
 *
 * @param sb                 The chunk text.
 * @param baseChunkSizeChars The base chunk length, in chars.
 * @param lowerCasedChunk    The lower-cased chunk text.
 */
380 Chunk(StringBuilder sb,
int baseChunkSizeChars, StringBuilder lowerCasedChunk) {
382 this.baseChunkSizeChars = baseChunkSizeChars;
383 this.lowerCasedChunk = lowerCasedChunk;
392 public String toString() {
393 return sb.toString();
401 public String geLowerCasedChunk() {
402 return lowerCasedChunk.toString();
410 int getBaseChunkLength() {
411 return baseChunkSizeChars;