19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.PushbackReader;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.nio.charset.StandardCharsets;
26 import java.text.Normalizer;
27 import java.util.Iterator;
28 import java.util.NoSuchElementException;
29 import javax.annotation.concurrent.NotThreadSafe;
41 class Chunker
implements Iterator<Chunk>, Iterable<Chunk> {
// Local aliases for the standard charsets: UTF_16 is used for the encode/decode
// round trip in replaceInvalidUTF16, UTF_8 for measuring the encoded size of text.
44 private static final Charset UTF_16 = StandardCharsets.UTF_16;
45 private static final Charset UTF_8 = StandardCharsets.UTF_8;
// Byte limit passed to the read helpers when reading the window (compared against
// chunkSizeBytes); also used as the pushback capacity of the PushbackReader.
49 private static final int MAX_TOTAL_CHUNK_SIZE = 32760;
// Byte limit for the bulk read (readHelper) performed by readBaseChunk.
52 private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024;
// Byte limit for the read-to-whitespace step (readToWhiteSpaceHelper) of readBaseChunk.
55 private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024;
// Subtracted from MAX_TOTAL_CHUNK_SIZE to obtain the bulk-read limit in readWindow,
// leaving this much room for the read-to-whitespace step that follows.
58 private static final int WHITE_SPACE_BUFFER_SIZE = 512;
// Length of tempChunkBuf and the maximum number of chars requested per bulk read.
60 private static final int READ_CHARS_BUFFER_SIZE = 512;
// The reader the chunks are read from; it supports pushing chars back (unread).
65 private final PushbackReader reader;
// Scratch buffer that the read helpers read into.
67 private final char[] tempChunkBuf =
new char[READ_CHARS_BUFFER_SIZE];
// Running total of the UTF-8 encoded size of the sanitized text appended by the
// read helpers; compared against their maxBytes limit.
70 private int chunkSizeBytes = 0;
// Set to true by the read helpers once reader.read(...) returns -1.
73 private boolean endOfReaderReached =
false;
/**
 * Creates a Chunker that reads its text from the given Reader.
 *
 * @param reader the source of characters; it is wrapped in a PushbackReader
 *               so that chars already read can be pushed back and read again
 */
82 Chunker(Reader reader) {
// The pushback buffer is sized to MAX_TOTAL_CHUNK_SIZE chars.
84 this.reader =
new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
88 public Iterator<Chunk> iterator() {
98 boolean hasException() {
107 public Exception getException() {
112 public boolean hasNext() {
114 && (endOfReaderReached ==
false);
/**
 * Replaces, in place, every char of {@code sb} that
 * {@code TextUtil.isValidSolrUTF8} rejects with the single character '^'.
 *
 * @param sb the text to sanitize; it is modified directly
 */
126 private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
// The length can be captured once: every replacement swaps one char for one char.
127 final int length = sb.length();
128 for (
int i = 0; i < length; i++) {
129 if (TextUtil.isValidSolrUTF8(sb.charAt(i)) ==
false) {
130 sb.replace(i, i + 1,
"^");
/**
 * Round-trips {@code s} through the UTF-16 charset and returns the result in
 * a new StringBuilder.
 *
 * @param s the string to process
 *
 * @return a new StringBuilder holding the decoded text
 */
145 private static StringBuilder replaceInvalidUTF16(String s) {
// Charset.encode() substitutes the charset's replacement for malformed or
// unmappable input, so encoding and then decoding yields well-formed text.
148 return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
/**
 * Sanitizes {@code s} in three steps: Unicode NFKC normalization, then
 * replaceInvalidUTF16, then sanitizeToUTF8.
 *
 * @param s the string to sanitize
 *
 * @return the result of sanitizeToUTF8 applied to the normalized,
 *         UTF-16 round-tripped text
 */
151 private static StringBuilder sanitize(String s) {
152 String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
153 return sanitizeToUTF8(replaceInvalidUTF16(normStr));
/**
 * Reads the next chunk from the reader: a base chunk followed by a window,
 * both appended to a single text buffer.
 *
 * @return a Chunk built from the combined text, the base chunk length in
 *         chars, and the current value of chunkSizeBytes
 *
 * @throws NoSuchElementException if hasNext() returns false
 */
158 public Chunk next() {
159 if (hasNext() ==
false) {
160 throw new NoSuchElementException(
"There are no more chunks.");
165 int baseChunkSizeChars = 0;
166 StringBuilder currentChunk =
new StringBuilder();
167 StringBuilder currentWindow =
new StringBuilder();
170 currentChunk.append(readBaseChunk());
// Record the length of the base chunk before the window text is appended.
171 baseChunkSizeChars = currentChunk.length();
172 currentWindow.append(readWindow());
// The returned text is the base chunk followed by the window.
174 currentChunk.append(currentWindow);
175 if (endOfReaderReached) {
// At the end of the input the base length is extended to cover the whole text.
179 baseChunkSizeChars = currentChunk.length();
// Push the window text back onto the reader so that it can be read again.
182 reader.unread(currentWindow.toString().toCharArray());
184 }
catch (Exception ioEx) {
191 return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
/**
 * Reads the base chunk: a bulk read bounded by MINIMUM_BASE_CHUNK_SIZE
 * followed by a read-to-whitespace step bounded by MAXIMUM_BASE_CHUNK_SIZE.
 *
 * @return the text of the base chunk
 *
 * @throws IOException if reading from the underlying reader fails
 */
199 private StringBuilder readBaseChunk() throws IOException {
200 StringBuilder currentChunk =
new StringBuilder();
// Bulk read while chunkSizeBytes is below the minimum base chunk size.
202 readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk);
// Then continue until whitespace is found or the maximum base chunk size is reached.
205 readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk);
/**
 * Reads the window: a bulk read bounded by
 * MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE followed by a
 * read-to-whitespace step bounded by MAX_TOTAL_CHUNK_SIZE.
 *
 * @return the text read for the window
 *
 * @throws IOException if reading from the underlying reader fails
 */
214 private StringBuilder readWindow() throws IOException {
215 StringBuilder currentWindow =
new StringBuilder();
// Both limits are compared against chunkSizeBytes, the running size of the
// text read so far, not against the size of the window alone.
217 readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow);
220 readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow);
221 return currentWindow;
/**
 * Reads blocks of up to READ_CHARS_BUFFER_SIZE chars from the reader,
 * sanitizes each block and appends it to {@code currentSegment}, adding the
 * block's UTF-8 encoded size to chunkSizeBytes.
 *
 * @param maxBytes       the limit, in bytes, that chunkSizeBytes is compared
 *                       against
 * @param currentSegment the buffer the sanitized text is appended to
 *
 * @throws IOException if reading from or pushing back to the reader fails
 */
232 private void readHelper(
int maxBytes, StringBuilder currentSegment)
throws IOException {
235 while ((chunkSizeBytes < maxBytes)
236 && (endOfReaderReached ==
false)) {
237 charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
238 if (-1 == charsRead) {
// read() returned -1: the reader is exhausted.
240 endOfReaderReached =
true;
244 final char lastChar = tempChunkBuf[charsRead - 1];
245 if (Character.isHighSurrogate(lastChar)) {
// A high surrogate at the end of the block is pushed back so that it is
// not separated from the char that follows it.
// NOTE(review): verify the loop still makes progress when the pushed-back
// surrogate is the only char the read returned.
247 reader.unread(lastChar);
251 StringBuilder chunkSegment = sanitize(
new String(tempChunkBuf, 0, charsRead));
// Size is measured on the sanitized text, encoded as UTF-8.
254 int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
// The block is appended only when the running total stays below maxBytes.
257 if (chunkSizeBytes + segmentSize < maxBytes) {
259 currentSegment.append(chunkSegment);
260 chunkSizeBytes += segmentSize;
// Push the raw (unsanitized) chars of this block back onto the reader.
263 reader.unread(tempChunkBuf, 0, charsRead);
/**
 * Reads one char at a time (two when the first is a high surrogate),
 * sanitizes it and appends it to {@code currentChunk}, until whitespace is
 * found, chunkSizeBytes reaches {@code maxBytes}, or the reader is exhausted.
 *
 * @param maxBytes     the limit, in bytes, that chunkSizeBytes is compared
 *                     against
 * @param currentChunk the buffer the sanitized text is appended to
 *
 * @throws IOException if reading from the reader fails
 */
278 private void readToWhiteSpaceHelper(
int maxBytes, StringBuilder currentChunk)
throws IOException {
280 boolean whitespaceFound =
false;
282 while ((chunkSizeBytes < maxBytes)
283 && (whitespaceFound ==
false)
284 && (endOfReaderReached ==
false)) {
// Read a single char into the first slot of the buffer.
285 charsRead = reader.read(tempChunkBuf, 0, 1);
286 if (-1 == charsRead) {
// read() returned -1: the reader is exhausted.
288 endOfReaderReached =
true;
292 final char ch = tempChunkBuf[0];
294 if (Character.isHighSurrogate(ch)) {
// Read the following char into the second slot of the buffer.
296 charsRead = reader.read(tempChunkBuf, 1, 1);
297 if (charsRead == -1) {
// The reader ended directly after the high surrogate.
299 endOfReaderReached =
true;
// Use both chars together as one segment.
303 chunkSegment =
new String(tempChunkBuf, 0, 2);
// Use the single char as the segment.
307 chunkSegment =
new String(tempChunkBuf, 0, 1);
311 StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
// The whitespace test is applied to the first code point of the sanitized text.
// NOTE(review): codePointAt(0) throws if the sanitized segment is empty -
// confirm sanitize() cannot return an empty result here.
313 whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
315 currentChunk.append(sanitizedChunkSegment);
// Add the UTF-8 encoded size of the sanitized segment to the running total.
316 chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
// The text of the chunk; returned as a String by toString().
327 private final StringBuilder sb;
// Value returned by getBaseChunkLength().
328 private final int baseChunkSizeChars;
// Value returned by getChunkSizeBytes().
329 private final int chunkSizeBytes;
/**
 * Creates a Chunk.
 *
 * @param sb                 the text of the chunk
 * @param baseChunkSizeChars the base chunk length, in chars
 * @param chunkSizeBytes     the chunk size, in bytes
 */
331 Chunk(StringBuilder sb,
int baseChunkSizeChars,
int chunkSizeBytes) {
333 this.baseChunkSizeChars = baseChunkSizeChars;
334 this.chunkSizeBytes = chunkSizeBytes;
/**
 * Gets the text of this chunk.
 *
 * @return the content of the underlying StringBuilder as a String
 */
343 public String toString() {
344 return sb.toString();
/**
 * Gets the byte size this chunk was constructed with.
 *
 * @return the value of chunkSizeBytes
 */
352 public int getChunkSizeBytes() {
353 return chunkSizeBytes;
/**
 * Gets the base chunk length, in chars, this chunk was constructed with.
 *
 * @return the value of baseChunkSizeChars
 */
361 int getBaseChunkLength() {
362 return baseChunkSizeChars;