Autopsy  4.9.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringsTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.textreaders;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.InputStreamReader;
24 import java.nio.charset.Charset;
25 import java.util.ArrayList;
26 import java.util.List;
27 import java.util.Objects;
28 import org.openide.util.Lookup;
32 import org.sleuthkit.datamodel.Content;
33 import org.sleuthkit.datamodel.TskCoreException;
34 import org.sleuthkit.datamodel.TskException;
35 
39 final class StringsTextExtractor {
40 
41  private boolean extractUTF8;
42  private boolean extractUTF16;
43  private final Content content;
44  private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
45 
46  private final List<SCRIPT> extractScripts = new ArrayList<>();
47 
/**
 * Creates a strings extractor for the given content.
 *
 * A new extractor starts with UTF-8 extraction switched on and with
 * LATIN_2 as its only script.
 *
 * @param content The content that strings will be extracted from.
 */
public StringsTextExtractor(Content content) {
    this.content = content;
    this.extractUTF8 = true;
    // LATIN_2 is the script used until the caller supplies another list.
    this.extractScripts.add(SCRIPT.LATIN_2);
}
59 
65  public final void setScripts(List<SCRIPT> extractScripts) {
66  if (extractScripts == null) {
67  return;
68  }
69 
70  this.extractScripts.clear();
71  this.extractScripts.addAll(extractScripts);
72  }
73 
84  public InputStreamReader getReader() {
85  InputStream stringStream = getInputStream(content);
86  return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
87  }
88 
89  InputStream getInputStream(Content content) {
90  //check which extract stream to use
91  if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
92  return new EnglishOnlyStream(content);//optimal for english, english only
93  } else {
94  return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
95  }
96  }
97 
107  public void setExtractionSettings(Lookup context) {
108  if (context != null) {
109  StringsConfig configInstance = context.lookup(StringsConfig.class);
110  if (configInstance == null) {
111  return;
112  }
113  if (Objects.nonNull(configInstance.getExtractUTF8())) {
114  extractUTF8 = configInstance.getExtractUTF8();
115  }
116  if (Objects.nonNull(configInstance.getExtractUTF16())) {
117  extractUTF16 = configInstance.getExtractUTF16();
118  }
119  if (Objects.nonNull(configInstance.getLanguageScripts())) {
120  setScripts(configInstance.getLanguageScripts());
121  }
122  }
123  }
124 
129  public boolean isEnabled() {
130  return extractUTF8 || extractUTF16;
131  }
132 
/**
 * Placeholder that has not been implemented: every call throws.
 *
 * @param file           Unused by the current body.
 * @param detectedFormat Unused by the current body.
 *
 * @return Never returns normally.
 *
 * @throws UnsupportedOperationException Always.
 */
133  boolean isSupported(Content file, String detectedFormat) {
134  throw new UnsupportedOperationException("Not supported yet."); //IDE-generated stub body; no support check has been written
135  }
136 
149  private static class EnglishOnlyStream extends InputStream {
150 
151  private static final String NLS = Character.toString((char) 10); //new line
152  private static final int READ_BUF_SIZE = 65536;
153  private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
154 
155  //args
156  private final Content content;
157 
158  //internal working data
159  private long contentOffset = 0; //offset in fscontent read into curReadBuf
160  private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
161  private int bytesInReadBuf = 0;
162  private int readBufOffset = 0; //offset in read buf processed
163  private StringBuilder curString = new StringBuilder();
164  private int curStringLen = 0;
165  private StringBuilder tempString = new StringBuilder();
166  private int tempStringLen = 0;
167  private boolean isEOF = false;
168  private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
169  private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()
170  private boolean inString = false; //if current temp has min chars required
171  private final byte[] oneCharBuf = new byte[1];
172 
/**
 * Creates a stream that extracts strings from the given content.
 *
 * @param content The content whose bytes are read and turned into strings.
 */
179  private EnglishOnlyStream(Content content) {
180  this.content = content;
181  }
182 
/**
 * Fills the caller's buffer with extracted strings, reading more of the
 * content in READ_BUF_SIZE pieces as needed.
 *
 * Characters are collected in tempString; once a run is broken and is long
 * enough (MIN_PRINTABLE_CHARS, or it continues a string from the previous
 * call) it is moved to curString followed by a new line. Whatever is in
 * curString at the end of the call is handed to copyToReturn().
 *
 * @param b   Destination buffer.
 * @param off Offset in b at which to start writing.
 * @param len Maximum number of bytes to write.
 *
 * @return The value returned by copyToReturn(), 0 when len is 0, or -1
 *         when the content size is 0, when end of content was flagged by
 *         an earlier call, or when no more bytes can be read and nothing
 *         qualifying is buffered.
 *
 * @throws IOException Declared by the InputStream contract; the body shown
 *                     here does not throw it itself.
 */
183  @Override
184  public int read(byte[] b, int off, int len) throws IOException {
     //argument validation, same checks as the InputStream contract
185  if (b == null) {
186  throw new NullPointerException();
187  } else if (off < 0 || len < 0 || len > b.length - off) {
188  throw new IndexOutOfBoundsException();
189  } else if (len == 0) {
190  return 0;
191  }
192  long fileSize = content.getSize();
193  if (fileSize == 0) {
194  return -1;
195  }
     //a previous call already returned the last strings
196  if (isEOF) {
197  return -1;
198  }
199  if (stringAtTempBoundary) {
200  //append entire temp string residual from previous read()
201  //because qualified string was broken down into 2 parts
202  appendResetTemp();
203  stringAtTempBoundary = false;
204  //there could be more to this string in fscontent/buffer
205  }
206  boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
207  int newCurLen = curStringLen + tempStringLen;
208  while (newCurLen < len) {
209  //need to extract more strings
210  if (readBufOffset > bytesInReadBuf - 1) {
211  //no more bytes to process into strings, read them
212  try {
213  bytesInReadBuf = 0;
214  bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
215  } catch (TskException ex) {
     //read failure is handled the same way as running out of bytes below
216  if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
217  appendResetTemp();
218  //have some extracted string, return that, and fail next time
219  isEOF = true;
220  int copied = copyToReturn(b, off, len);
221  return copied;
222  } else {
223  return -1; //EOF
224  }
225  }
226  if (bytesInReadBuf < 1) {
227  if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
228  appendResetTemp();
229  //have some extracted string, return that, and fail next time
230  isEOF = true;
231  int copied = copyToReturn(b, off, len);
232  return copied;
233  } else {
234  return -1; //EOF
235  }
236  }
237  //increment content offset for next read
238  contentOffset += bytesInReadBuf;
239  //reset read buf position
240  readBufOffset = 0;
241  }
242  //get char from cur read buf
243  char c = (char) curReadBuf[readBufOffset++];
244  singleConsecZero = c == 0 && singleConsecZero == false; //preserve the current sequence if max consec. 1 zero char
     //NOTE(review): this listing jumps from line 244 to line 246, and the
     //"} else if" a few lines below has no visible opening "if". The
     //condition deciding whether c is appended to tempString is therefore
     //not shown here - confirm it against the original source file.
246  tempString.append(c);
247  ++tempStringLen;
248  if (tempStringLen >= MIN_PRINTABLE_CHARS) {
249  inString = true;
250  }
251  //boundary case when temp has still chars - handled after the loop
252  } else if (!singleConsecZero) {
253  //break the string, clear temp
254  if (tempStringLen >= MIN_PRINTABLE_CHARS || stringAtBufBoundary) {
255  //append entire temp string with new line
256  tempString.append(NLS);
257  ++tempStringLen;
258  curString.append(tempString);
259  curStringLen += tempStringLen;
260  stringAtBufBoundary = false;
261  }
262  //reset temp
263  tempString = new StringBuilder();
264  tempStringLen = 0;
265  }
266  newCurLen = curStringLen + tempStringLen;
267  }
268  //check if still in string state, so that next chars in read buf bypass min chars check
269  //and qualify as string even if less < min chars required
270  if (inString) {
271  inString = false; //reset
272  stringAtBufBoundary = true; //will bypass the check
273  }
274  //check if temp still has chars to qualify as a string
275  //we might need to break up temp into 2 parts for next read() call
276  //consume as many as possible to fill entire user buffer
277  if (tempStringLen >= MIN_PRINTABLE_CHARS) {
278  if (newCurLen > len) {
     //only the part of temp that still fits into the user buffer is taken
279  int appendChars = len - curStringLen;
280  //save part for next user read(), need to break up temp string
281  //do not append new line
282  String toAppend = tempString.substring(0, appendChars);
283  String newTemp = tempString.substring(appendChars);
284  curString.append(toAppend);
285  curStringLen += appendChars;
286  tempString = new StringBuilder(newTemp);
287  tempStringLen = newTemp.length();
288  stringAtTempBoundary = true;
289  } else {
290  //append entire temp
291  curString.append(tempString);
292  curStringLen += tempStringLen;
293  //reset temp
294  tempString = new StringBuilder();
295  tempStringLen = 0;
296  }
297  } else {
298  //if temp has a few chars, not qualified as string for now,
299  //will be processed during next read() call
300  }
301  //copy current strings to user
302  final int copied = copyToReturn(b, off, len);
303  //there may be still chars in read buffer or tempString, for next read()
304  return copied;
305  }
306 
307  //append temp buffer to cur string buffer and reset temp, if enough chars
308  //does not append new line
309  private void appendResetTemp() {
310  if (tempStringLen >= MIN_PRINTABLE_CHARS) {
311  curString.append(tempString);
312  curStringLen += tempStringLen;
313  tempString = new StringBuilder();
314  tempStringLen = 0;
315  }
316  }
317 
318  //copy currently extracted string to user buffer
319  //and reset for next read() call
320  private int copyToReturn(byte[] b, int off, long len) {
321  final String curStringS = curString.toString();
322  //logger.log(Level.INFO, curStringS);
323  byte[] stringBytes = curStringS.getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
324  System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
325  //logger.log(Level.INFO, curStringS);
326  //copied all string, reset
327  curString = new StringBuilder();
328  int ret = curStringLen;
329  curStringLen = 0;
330  return ret;
331  }
332 
333  @Override
334  public int read() throws IOException {
335  final int read = read(oneCharBuf, 0, 1);
336  if (read == 1) {
337  return oneCharBuf[0];
338  } else {
339  return -1;
340  }
341  }
342 
343  @Override
344  public int available() throws IOException {
345  //we don't know how many bytes in curReadBuf may end up as strings
346  return 0;
347  }
348 
349  @Override
350  public long skip(long n) throws IOException {
351  //use default implementation that reads into skip buffer
352  //but it could be more efficient
353  return super.skip(n);
354  }
355  }
356 
363  private static class InternationalStream extends InputStream {
364 
365  private static final int FILE_BUF_SIZE = 1024 * 1024;
366  private final Content content;
367  private final byte[] oneCharBuf = new byte[1];
373  private final boolean nothingToDo;
374  private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
375  private long fileReadOffset = 0L;
376  private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
377  private int convertBuffOffset = 0; //offset to start returning data to user on next read()
378  private int bytesInConvertBuff = 0; //amount of data currently in the buffer
379  private boolean fileEOF = false; //if file has more bytes to read
381 
/**
 * Creates a stream that extracts strings from the given content using a
 * StringExtract instance configured with the given scripts and encodings.
 *
 * When both encodings are switched off the stream is flagged as having
 * nothing to do.
 *
 * @param content      The content to extract strings from.
 * @param scripts      The scripts handed to the string extractor.
 * @param extractUTF8  Whether UTF-8 extraction is enabled.
 * @param extractUTF16 Whether UTF-16 extraction is enabled.
 */
private InternationalStream(Content content, List<SCRIPT> scripts, boolean extractUTF8, boolean extractUTF16) {
    this.content = content;
    this.nothingToDo = !(extractUTF8 || extractUTF16);

    this.stringExtractor = new StringExtract();
    this.stringExtractor.setEnabledScripts(scripts);
    this.stringExtractor.setEnableUTF8(extractUTF8);
    this.stringExtractor.setEnableUTF16(extractUTF16);
}
402 
403  @Override
404  public int read() throws IOException {
405  if (nothingToDo) {
406  return -1;
407  }
408  final int read = read(oneCharBuf, 0, 1);
409  if (read == 1) {
410  return oneCharBuf[0];
411  } else {
412  return -1;
413  }
414  }
415 
416  @Override
417  public int read(byte[] b, int off, int len) throws IOException {
418  if (b == null) {
419  throw new NullPointerException();
420  } else if (off < 0 || len < 0 || len > b.length - off) {
421  throw new IndexOutOfBoundsException();
422  } else if (len == 0) {
423  return 0;
424  }
425  if (nothingToDo) {
426  return -1;
427  }
428  long fileSize = content.getSize();
429  if (fileSize == 0) {
430  return -1;
431  }
432  //read and convert until user buffer full
433  //we have data if file can be read or when byteBuff has converted strings to return
434  int bytesToUser = 0; //returned to user so far
435  int offsetUser = off;
436  while (bytesToUser < len && offsetUser < len) {
437  //check if we have enough converted strings
438  int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
439  if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
440  try {
441  //convert more strings, store in buffer
442  long toRead = 0;
443 
444  //fill up entire fileReadBuff fresh
445  toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
446  //}
447  int read = content.read(fileReadBuff, fileReadOffset, toRead);
448  if (read == -1 || read == 0) {
449  fileEOF = true;
450  } else {
451  fileReadOffset += read;
452  if (fileReadOffset >= fileSize) {
453  fileEOF = true;
454  }
455  //put converted string in convertBuff
456  convert(read);
457  convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
458  }
459  } catch (TskCoreException ex) {
460  //Exceptions.printStackTrace(ex);
461  fileEOF = true;
462  }
463  }
464  //nothing more to read, and no more bytes in convertBuff
465  if (convertBuff == null || convertBuffRemain == 0) {
466  if (fileEOF) {
467  return bytesToUser > 0 ? bytesToUser : -1;
468  } else {
469  //no strings extracted, try another read
470  continue;
471  }
472  }
473  //return part or all of convert buff to user
474  final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
475  System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
476 
477  convertBuffOffset += toCopy;
478  offsetUser += toCopy;
479  bytesToUser += toCopy;
480  }
481  //if more string data in convertBuff, will be consumed on next read()
482  return bytesToUser;
483  }
484 
491  private void convert(int numBytes) {
492  lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
493  convertBuff = lastExtractResult.getText().getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
494  //reset tracking vars
495  if (lastExtractResult.getNumBytes() == 0) {
496  bytesInConvertBuff = 0;
497  } else {
498  bytesInConvertBuff = convertBuff.length;
499  }
500  convertBuffOffset = 0;
501  }
502  }
503 }
StringExtractResult extract(byte[] buff, int len, int offset)
final void setEnabledScripts(List< SCRIPT > scripts)
InternationalStream(Content content, List< SCRIPT > scripts, boolean extractUTF8, boolean extractUTF16)

Copyright © 2012-2018 Basis Technology. Generated on: Tue Dec 18 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.