Autopsy  4.20.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringsTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.textextractors;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.InputStreamReader;
24 import java.nio.charset.Charset;
25 import java.util.ArrayList;
26 import java.util.List;
27 import java.util.Objects;
28 import org.openide.util.Lookup;
32 import org.sleuthkit.datamodel.Content;
33 import org.sleuthkit.datamodel.TskCoreException;
34 import org.sleuthkit.datamodel.TskException;
35 
39 final class StringsTextExtractor implements TextExtractor {
40 
41  private boolean extractUTF8;
42  private boolean extractUTF16;
43  private final Content content;
44  private final static String DEFAULT_INDEXED_TEXT_CHARSET = "UTF-8";
45 
46  private final List<SCRIPT> extractScripts = new ArrayList<>();
47 
/**
 * Creates an extractor over the given content. UTF-8 extraction is switched
 * on and LATIN_2 is registered as the only script until the settings are
 * changed.
 *
 * @param content the content strings will be extracted from
 */
public StringsTextExtractor(Content content) {
    this.content = content;
    this.extractUTF8 = true;
    // LATIN_2 is the default script
    this.extractScripts.add(SCRIPT.LATIN_2);
}
59 
65  public final void setScripts(List<SCRIPT> extractScripts) {
66  if (extractScripts == null) {
67  return;
68  }
69 
70  this.extractScripts.clear();
71  this.extractScripts.addAll(extractScripts);
72  }
73 
82  @Override
83  public InputStreamReader getReader() {
84  InputStream stringStream = getInputStream(content);
85  return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
86  }
87 
88  InputStream getInputStream(Content content) {
89  //check which extract stream to use
90  if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
91  return new EnglishOnlyStream(content);//optimal for english, english only
92  } else {
93  return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
94  }
95  }
96 
106  @Override
107  public void setExtractionSettings(Lookup context) {
108  if (context != null) {
109  StringsConfig configInstance = context.lookup(StringsConfig.class);
110  if (configInstance == null) {
111  return;
112  }
113  if (Objects.nonNull(configInstance.getExtractUTF8())) {
114  extractUTF8 = configInstance.getExtractUTF8();
115  }
116  if (Objects.nonNull(configInstance.getExtractUTF16())) {
117  extractUTF16 = configInstance.getExtractUTF16();
118  }
119  if (Objects.nonNull(configInstance.getLanguageScripts())) {
120  setScripts(configInstance.getLanguageScripts());
121  }
122  }
123  }
124 
129  @Override
130  public boolean isSupported() {
131  return extractUTF8 || extractUTF16;
132  }
133 
146  private static class EnglishOnlyStream extends InputStream {
147 
148  private static final String NLS = Character.toString((char) 10); //new line
149  private static final int READ_BUF_SIZE = 65536;
150  private static final int MIN_PRINTABLE_CHARS = 4; //num. of chars needed to qualify as a char string
151 
152  //args
153  private final Content content;
154 
155  //internal working data
156  private long contentOffset = 0; //offset in fscontent read into curReadBuf
157  private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
158  private int bytesInReadBuf = 0;
159  private int readBufOffset = 0; //offset in read buf processed
160  private StringBuilder curString = new StringBuilder();
161  private int curStringLen = 0;
162  private StringBuilder tempString = new StringBuilder();
163  private int tempStringLen = 0;
164  private boolean isEOF = false;
165  private boolean stringAtTempBoundary = false; //if temp has part of string that didn't make it in previous read()
166  private boolean stringAtBufBoundary = false; //if read buffer has string being processed, continue as string from prev read() in next read()
167  private boolean inString = false; //if current temp has min chars required
168  private final byte[] oneCharBuf = new byte[1];
169 
176  private EnglishOnlyStream(Content content) {
177  this.content = content;
178  }
179 
180  @Override
181  public int read(byte[] b, int off, int len) throws IOException {
182  if (b == null) {
183  throw new NullPointerException();
184  } else if (off < 0 || len < 0 || len > b.length - off) {
185  throw new IndexOutOfBoundsException();
186  } else if (len == 0) {
187  return 0;
188  }
189  long fileSize = content.getSize();
190  if (fileSize == 0) {
191  return -1;
192  }
193  if (isEOF) {
194  return -1;
195  }
196  if (stringAtTempBoundary) {
197  //append entire temp string residual from previous read()
198  //because qualified string was broken down into 2 parts
199  appendResetTemp();
200  stringAtTempBoundary = false;
201  //there could be more to this string in fscontent/buffer
202  }
203  boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
204  int newCurLen = curStringLen + tempStringLen;
205  while (newCurLen < len) {
206  //need to extract more strings
207  if (readBufOffset > bytesInReadBuf - 1) {
208  //no more bytes to process into strings, read them
209  try {
210  bytesInReadBuf = 0;
211  bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
212  } catch (TskException ex) {
213  if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
214  appendResetTemp();
215  //have some extracted string, return that, and fail next time
216  isEOF = true;
217  int copied = copyToReturn(b, off, len);
218  return copied;
219  } else {
220  return -1; //EOF
221  }
222  }
223  if (bytesInReadBuf < 1) {
224  if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
225  appendResetTemp();
226  //have some extracted string, return that, and fail next time
227  isEOF = true;
228  int copied = copyToReturn(b, off, len);
229  return copied;
230  } else {
231  return -1; //EOF
232  }
233  }
234  //increment content offset for next read
235  contentOffset += bytesInReadBuf;
236  //reset read buf position
237  readBufOffset = 0;
238  }
239  //get char from cur read buf
240  char c = (char) curReadBuf[readBufOffset++];
241  singleConsecZero = c == 0 && singleConsecZero == false; //preserve the current sequence if max consec. 1 zero char
243  tempString.append(c);
244  ++tempStringLen;
245  if (tempStringLen >= MIN_PRINTABLE_CHARS) {
246  inString = true;
247  }
248  //boundary case when temp has still chars - handled after the loop
249  } else if (!singleConsecZero) {
250  //break the string, clear temp
251  if (tempStringLen >= MIN_PRINTABLE_CHARS || stringAtBufBoundary) {
252  //append entire temp string with new line
253  tempString.append(NLS);
254  ++tempStringLen;
255  curString.append(tempString);
256  curStringLen += tempStringLen;
257  stringAtBufBoundary = false;
258  }
259  //reset temp
260  tempString = new StringBuilder();
261  tempStringLen = 0;
262  }
263  newCurLen = curStringLen + tempStringLen;
264  }
265  //check if still in string state, so that next chars in read buf bypass min chars check
266  //and qualify as string even if less < min chars required
267  if (inString) {
268  inString = false; //reset
269  stringAtBufBoundary = true; //will bypass the check
270  }
271  //check if temp still has chars to qualify as a string
272  //we might need to break up temp into 2 parts for next read() call
273  //consume as many as possible to fill entire user buffer
274  if (tempStringLen >= MIN_PRINTABLE_CHARS) {
275  if (newCurLen > len) {
276  int appendChars = len - curStringLen;
277  //save part for next user read(), need to break up temp string
278  //do not append new line
279  String toAppend = tempString.substring(0, appendChars);
280  String newTemp = tempString.substring(appendChars);
281  curString.append(toAppend);
282  curStringLen += appendChars;
283  tempString = new StringBuilder(newTemp);
284  tempStringLen = newTemp.length();
285  stringAtTempBoundary = true;
286  } else {
287  //append entire temp
288  curString.append(tempString);
289  curStringLen += tempStringLen;
290  //reset temp
291  tempString = new StringBuilder();
292  tempStringLen = 0;
293  }
294  } else {
295  //if temp has a few chars, not qualified as string for now,
296  //will be processed during next read() call
297  }
298  //copy current strings to user
299  final int copied = copyToReturn(b, off, len);
300  //there may be still chars in read buffer or tempString, for next read()
301  return copied;
302  }
303 
304  //append temp buffer to cur string buffer and reset temp, if enough chars
305  //does not append new line
306  private void appendResetTemp() {
307  if (tempStringLen >= MIN_PRINTABLE_CHARS) {
308  curString.append(tempString);
309  curStringLen += tempStringLen;
310  tempString = new StringBuilder();
311  tempStringLen = 0;
312  }
313  }
314 
315  //copy currently extracted string to user buffer
316  //and reset for next read() call
317  private int copyToReturn(byte[] b, int off, long len) {
318  final String curStringS = curString.toString();
319  //logger.log(Level.INFO, curStringS);
320  byte[] stringBytes = curStringS.getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
321  System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (int) len));
322  //logger.log(Level.INFO, curStringS);
323  //copied all string, reset
324  curString = new StringBuilder();
325  int ret = curStringLen;
326  curStringLen = 0;
327  return ret;
328  }
329 
330  @Override
331  public int read() throws IOException {
332  final int read = read(oneCharBuf, 0, 1);
333  if (read == 1) {
334  return oneCharBuf[0];
335  } else {
336  return -1;
337  }
338  }
339 
340  @Override
341  public int available() throws IOException {
342  //we don't know how many bytes in curReadBuf may end up as strings
343  return 0;
344  }
345 
346  @Override
347  public long skip(long n) throws IOException {
348  //use default implementation that reads into skip buffer
349  //but it could be more efficient
350  return super.skip(n);
351  }
352  }
353 
360  private static class InternationalStream extends InputStream {
361 
362  private static final int FILE_BUF_SIZE = 1024 * 1024;
363  private final Content content;
364  private final byte[] oneCharBuf = new byte[1];
370  private final boolean nothingToDo;
371  private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
372  private long fileReadOffset = 0L;
373  private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
374  private int convertBuffOffset = 0; //offset to start returning data to user on next read()
375  private int bytesInConvertBuff = 0; //amount of data currently in the buffer
376  private boolean fileEOF = false; //if file has more bytes to read
378 
391  private InternationalStream(Content content, List<SCRIPT> scripts, boolean extractUTF8, boolean extractUTF16) {
392  this.content = content;
393  this.stringExtractor = new StringExtract();
394  this.stringExtractor.setEnabledScripts(scripts);
395  this.nothingToDo = extractUTF8 == false && extractUTF16 == false;
396  this.stringExtractor.setEnableUTF8(extractUTF8);
397  this.stringExtractor.setEnableUTF16(extractUTF16);
398  }
399 
400  @Override
401  public int read() throws IOException {
402  if (nothingToDo) {
403  return -1;
404  }
405  final int read = read(oneCharBuf, 0, 1);
406  if (read == 1) {
407  return oneCharBuf[0];
408  } else {
409  return -1;
410  }
411  }
412 
413  @Override
414  public int read(byte[] b, int off, int len) throws IOException {
415  if (b == null) {
416  throw new NullPointerException();
417  } else if (off < 0 || len < 0 || len > b.length - off) {
418  throw new IndexOutOfBoundsException();
419  } else if (len == 0) {
420  return 0;
421  }
422  if (nothingToDo) {
423  return -1;
424  }
425  long fileSize = content.getSize();
426  if (fileSize == 0) {
427  return -1;
428  }
429  //read and convert until user buffer full
430  //we have data if file can be read or when byteBuff has converted strings to return
431  int bytesToUser = 0; //returned to user so far
432  int offsetUser = off;
433  while (bytesToUser < len && offsetUser < len) {
434  //check if we have enough converted strings
435  int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
436  if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
437  try {
438  //convert more strings, store in buffer
439  long toRead = 0;
440 
441  //fill up entire fileReadBuff fresh
442  toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
443  //}
444  int read = content.read(fileReadBuff, fileReadOffset, toRead);
445  if (read == -1 || read == 0) {
446  fileEOF = true;
447  } else {
448  fileReadOffset += read;
449  if (fileReadOffset >= fileSize) {
450  fileEOF = true;
451  }
452  //put converted string in convertBuff
453  convert(read);
454  convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
455  }
456  } catch (TskCoreException ex) {
457  fileEOF = true;
458  }
459  }
460  //nothing more to read, and no more bytes in convertBuff
461  if (convertBuff == null || convertBuffRemain == 0) {
462  if (fileEOF) {
463  return bytesToUser > 0 ? bytesToUser : -1;
464  } else {
465  //no strings extracted, try another read
466  continue;
467  }
468  }
469  //return part or all of convert buff to user
470  final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
471  System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
472 
473  convertBuffOffset += toCopy;
474  offsetUser += toCopy;
475  bytesToUser += toCopy;
476  }
477  //if more string data in convertBuff, will be consumed on next read()
478  return bytesToUser;
479  }
480 
487  private void convert(int numBytes) {
488  lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
489  convertBuff = lastExtractResult.getText().getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
490  //reset tracking vars
491  if (lastExtractResult.getNumBytes() == 0) {
492  bytesInConvertBuff = 0;
493  } else {
494  bytesInConvertBuff = convertBuff.length;
495  }
496  convertBuffOffset = 0;
497  }
498  }
499 }
StringExtractResult extract(byte[] buff, int len, int offset)
InternationalStream(Content content, List< SCRIPT > scripts, boolean extractUTF8, boolean extractUTF16)
final void setEnabledScripts(List< SCRIPT > scripts)

Copyright © 2012-2022 Basis Technology. Generated on: Tue Aug 1 2023
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.