Autopsy  4.21.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
Chunker.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.PushbackReader;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.nio.charset.StandardCharsets;
26 import java.text.Normalizer;
27 import java.util.Iterator;
28 import java.util.NoSuchElementException;
29 import javax.annotation.concurrent.NotThreadSafe;
32 
40 @NotThreadSafe
41 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
42 
43  //local references to standard encodings
44  private static final Charset UTF_16 = StandardCharsets.UTF_16;
45  private static final Charset UTF_8 = StandardCharsets.UTF_8;
46 
47  //Chunking algorithm paramaters-------------------------------------//
51  private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
56  private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
61  private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
66  private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
70  private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
79  private static final int MAX_CHAR_SIZE_INCREASE_IN_BYTES = 10; //bytes
80 
82 
86  private final PushbackReader reader;
90  private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
91 
95  private int chunkSizeBytes = 0;
96 
102  private int lowerCasedChunkSizeBytes = 0;
107  private boolean endOfReaderReached = false;
111  private Exception ex;
112 
118  Chunker(Reader reader) {
119  //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill.
120  this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
121  }
122 
123  @Override
124  public Iterator<Chunk> iterator() {
125  return this;
126  }
127 
134  boolean hasException() {
135  return ex != null;
136  }
137 
143  public Exception getException() {
144  return ex;
145  }
146 
147  @Override
148  public boolean hasNext() {
149  return (ex == null)
150  && (endOfReaderReached == false);
151  }
152 
162  private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
163  final int length = sb.length();
164  for (int i = 0; i < length; i++) {
165  if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
166  sb.replace(i, i + 1, "^");
167  }
168  }
169  return sb;
170  }
171 
181  private static StringBuilder replaceInvalidUTF16(String s) {
182  /* encode the string to UTF-16 which does the replcement, see
183  * Charset.encode(), then decode back to a StringBuilder. */
184  return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
185  }
186 
194  static StringBuilder sanitize(String s) {
195  String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);
196  return sanitizeToUTF8(replaceInvalidUTF16(normStr));
197  }
198 
199  @Override
200  public Chunk next() {
201  if (hasNext() == false) {
202  throw new NoSuchElementException("There are no more chunks.");
203  }
204  //reset state for the next chunk
205 
206  chunkSizeBytes = 0;
207  lowerCasedChunkSizeBytes = 0;
208  int baseChunkSizeChars = 0;
209  StringBuilder currentChunk = new StringBuilder();
210  StringBuilder currentWindow = new StringBuilder();
211  StringBuilder lowerCasedChunk = new StringBuilder();
212 
213  try {
214  readBaseChunk(currentChunk, lowerCasedChunk);
215  baseChunkSizeChars = currentChunk.length(); //save the base chunk length
216  readWindow(currentWindow, lowerCasedChunk);
217  //add the window text to the current chunk.
218  currentChunk.append(currentWindow);
219  if (endOfReaderReached) {
220  /* if we have reached the end of the content,we won't make
221  * another overlapping chunk, so the length of the base chunk
222  * can be extended to the end. */
223  baseChunkSizeChars = currentChunk.length();
224  } else {
225  /* otherwise we will make another chunk, so unread the window */
226  reader.unread(currentWindow.toString().toCharArray());
227  }
228  } catch (Exception ioEx) {
229  /* Save the exception, which will cause hasNext() to return false,
230  * and break any chunking loop in client code. */
231  ex = ioEx;
232  }
233 
234  //sanitize the text and return a Chunk object, that includes the base chunk length.
235  return new Chunk(currentChunk, baseChunkSizeChars, lowerCasedChunk);
236  }
237 
243  private void readBaseChunk(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
244  //read the chunk until the minimum base chunk size
245  readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
246 
247  //keep reading until the maximum base chunk size or white space is reached.
248  readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);
249  }
250 
256  private void readWindow(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
257  //read the window, leaving some room to look for white space to break at.
258  readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentChunk, lowerCasedChunk);
259 
260  //keep reading until the max chunk size, or until whitespace is reached.
261  readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentChunk, lowerCasedChunk);
262  }
263 
272  private void readHelper(int maxBytes, StringBuilder currentSegment, StringBuilder currentLowerCasedSegment) throws IOException {
273  int charsRead = 0;
274  //read chars up to maxBytes, or the end of the reader.
275  while ((chunkSizeBytes < maxBytes) && (lowerCasedChunkSizeBytes < maxBytes)
276  && (endOfReaderReached == false)) {
277  charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
278  if (-1 == charsRead) {
279  //this is the last chunk
280  endOfReaderReached = true;
281  return;
282  } else {
283  //if the last char might be part of a surroate pair, unread it.
284  final char lastChar = tempChunkBuf[charsRead - 1];
285  if (Character.isHighSurrogate(lastChar)) {
286  charsRead--;
287  reader.unread(lastChar);
288  }
289 
290  //cleanup any invalid utf-16 sequences
291  StringBuilder chunkSegment = sanitize(new String(tempChunkBuf, 0, charsRead));
292 
293  //get the length in utf8 bytes of the read chars
294  int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
295 
296  // lower case the string and get it's size. NOTE: lower casing can
297  // change the size of the string!
298  String lowerCasedSegment = chunkSegment.toString().toLowerCase();
299  int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
300 
301  //if it will not put us past maxBytes
302  if ((chunkSizeBytes + segmentSize < maxBytes) && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes)) {
303  //add it to the chunk
304  currentSegment.append(chunkSegment);
305  chunkSizeBytes += segmentSize;
306 
307  currentLowerCasedSegment.append(lowerCasedSegment);
308  lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
309  } else {
310  //unread it, and break out of read loop.
311  reader.unread(tempChunkBuf, 0, charsRead);
312  return;
313  }
314  }
315  }
316  }
317 
326  private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {
327  int charsRead = 0;
328  boolean whitespaceFound = false;
329  //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
330  while ((chunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
331  && (lowerCasedChunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
332  && (whitespaceFound == false)
333  && (endOfReaderReached == false)) {
334  charsRead = reader.read(tempChunkBuf, 0, 1);
335  if (-1 == charsRead) {
336  //this is the last chunk
337  endOfReaderReached = true;
338  return;
339  } else {
340  //if the last charcter might be part of a surroate pair, read another char
341  final char ch = tempChunkBuf[0];
342  String chunkSegment;
343  if (Character.isHighSurrogate(ch)) {
344  //read another char into the buffer.
345  int surrogateCharsRead = reader.read(tempChunkBuf, 1, 1);
346  charsRead += surrogateCharsRead;
347  if (surrogateCharsRead == -1) {
348  //this is the last chunk, so just drop the unpaired surrogate
349  endOfReaderReached = true;
350  return;
351  } else {
352  //try to use the pair together.
353  chunkSegment = new String(tempChunkBuf, 0, 2);
354  }
355  } else {
356  //one char
357  chunkSegment = new String(tempChunkBuf, 0, 1);
358  }
359 
360  //cleanup any invalid utf-16 sequences
361  StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
362  //get the length in utf8 bytes of the read chars
363  int segmentSize = chunkSegment.getBytes(UTF_8).length;
364 
365  // lower case the string and get it's size. NOTE: lower casing can
366  // change the size of the string.
367  String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();
368  int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;
369 
370  //if it will not put us past maxBytes
371  if ((chunkSizeBytes + segmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)
372  && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)) {
373 
374  //add read chars to the chunk and update the length.
375  currentChunk.append(sanitizedChunkSegment);
376  chunkSizeBytes += segmentSize;
377 
378  lowerCasedChunk.append(lowerCasedSegment);
379  lowerCasedChunkSizeBytes += lowerCasedSegmentSize;
380 
381  //check for whitespace.
382  whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
383  } else {
384  //unread it, and break out of read loop.
385  reader.unread(tempChunkBuf, 0, charsRead);
386  return;
387  }
388  }
389  }
390  }
391 
396  static class Chunk {
397 
398  private final StringBuilder sb;
399  private final int baseChunkSizeChars;
400  private final StringBuilder lowerCasedChunk;
401  private boolean hasHit = false;
402  private int chunkId = 0;
403 
404  Chunk(StringBuilder sb, int baseChunkSizeChars, StringBuilder lowerCasedChunk) {
405  this.sb = sb;
406  this.baseChunkSizeChars = baseChunkSizeChars;
407  this.lowerCasedChunk = lowerCasedChunk;
408  }
409 
415  @Override
416  public String toString() {
417  return sb.toString();
418  }
419 
425  public String getLowerCasedChunk() {
426  return lowerCasedChunk.toString();
427  }
428 
434  int getBaseChunkLength() {
435  return baseChunkSizeChars;
436  }
437 
438  boolean hasHit() {
439  return hasHit;
440  }
441 
442  void setHasHit(boolean b) {
443  hasHit = b;
444  }
445 
446  void setChunkId(int id) {
447  chunkId = id;
448  }
449 
450  int getChunkId() {
451  return chunkId;
452  }
453  }
454 }

Copyright © 2012-2022 Basis Technology. Generated on: Tue Feb 6 2024
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.