19 package org.sleuthkit.autopsy.textextractors;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.InputStreamReader;
24 import java.nio.charset.Charset;
25 import java.util.ArrayList;
26 import java.util.List;
27 import java.util.Objects;
28 import org.openide.util.Lookup;
39 final class StringsTextExtractor
implements TextExtractor {
// Enables UTF-8 extraction. Updated by setExtractionSettings(), read by
// isSupported(), and handed to InternationalStream by getInputStream().
41 private boolean extractUTF8;
// Enables UTF-16 extraction. Updated by setExtractionSettings(), read by
// isSupported(), and handed to InternationalStream by getInputStream().
42 private boolean extractUTF16;
// The content whose strings are extracted; assigned in the constructor.
43 private final Content content;
// Name of the charset used both to encode the extracted text into bytes and
// to decode it again in getReader().
44 private final static String DEFAULT_INDEXED_TEXT_CHARSET =
"UTF-8";
// Scripts to extract. The constructor adds LATIN_2; setScripts() replaces
// the contents.
46 private final List<SCRIPT> extractScripts =
new ArrayList<>();
/**
 * Creates an extractor bound to the given content.
 *
 * @param content the content to extract strings from
 */
53 public StringsTextExtractor(Content content) {
// LATIN_2 is the only script present until setScripts() replaces the list.
55 extractScripts.add(SCRIPT.LATIN_2);
57 this.content = content;
/**
 * Replaces the configured scripts with the contents of the given list. The
 * elements are copied into the internal list, so the caller's list is not
 * retained.
 *
 * @param extractScripts the scripts to use
 */
65 public final void setScripts(List<SCRIPT> extractScripts) {
// NOTE(review): the body of this null check is not visible in this listing;
// confirm how a null argument is handled before relying on it.
66 if (extractScripts == null) {
// Drop the previous selection, then copy in the new one.
70 this.extractScripts.clear();
71 this.extractScripts.addAll(extractScripts);
83 public InputStreamReader getReader() {
84 InputStream stringStream = getInputStream(content);
85 return new InputStreamReader(stringStream, Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
/**
 * Chooses the stream implementation that performs the extraction.
 *
 * @param content the content to extract strings from
 * @return an EnglishOnlyStream when LATIN_1 is the only configured script,
 *         otherwise an InternationalStream configured with the current
 *         scripts and encoding flags
 */
88 InputStream getInputStream(Content content) {
// NOTE(review): the visible constructor seeds the script list with LATIN_2,
// while this test requires LATIN_1, so an extractor whose scripts were never
// changed takes the InternationalStream path. Confirm that is intended.
90 if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
91 return new EnglishOnlyStream(content);
93 return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
/**
 * Applies extraction settings found in the given lookup. A StringsConfig
 * instance is looked up and each of its non-null values overrides the
 * corresponding setting of this extractor.
 *
 * @param context lookup that may contain a StringsConfig; a null context is
 *                skipped by the outer check
 */
107 public void setExtractionSettings(Lookup context) {
108 if (context != null) {
109 StringsConfig configInstance = context.lookup(StringsConfig.class);
// NOTE(review): the body of this branch is not visible in this listing; the
// getter calls below would fail on a null configInstance, so presumably it
// exits early. Confirm.
110 if (configInstance == null) {
// Each getter is null-checked before its value is assigned, so settings that
// are absent from the config keep their current value.
113 if (Objects.nonNull(configInstance.getExtractUTF8())) {
114 extractUTF8 = configInstance.getExtractUTF8();
116 if (Objects.nonNull(configInstance.getExtractUTF16())) {
117 extractUTF16 = configInstance.getExtractUTF16();
119 if (Objects.nonNull(configInstance.getLanguageScripts())) {
120 setScripts(configInstance.getLanguageScripts());
130 public boolean isSupported() {
131 return extractUTF8 || extractUTF16;
148 private static final String
NLS = Character.toString((
char) 10);
/**
 * Reads the underlying content in chunks of up to READ_BUF_SIZE bytes,
 * accumulates extracted strings, and copies them into {@code b} starting at
 * {@code off}, encoded with DEFAULT_INDEXED_TEXT_CHARSET.
 *
 * NOTE(review): only part of this method is visible in this listing (some
 * conditions, closing braces and return statements are missing). The
 * comments below describe the visible statements only.
 *
 * @param b   destination buffer
 * @param off index in {@code b} at which the output is written
 * @param len maximum number of bytes requested
 * @throws IndexOutOfBoundsException if {@code off} or {@code len} is
 *                                   negative, or {@code len} exceeds
 *                                   {@code b.length - off}
 * @throws NullPointerException      from the first visible throw; its guard
 *                                   is not visible in this listing
 */
181 public int read(byte[] b,
int off,
int len)
throws IOException {
// Argument checks; the condition guarding this first throw is not visible.
183 throw new NullPointerException();
184 }
else if (off < 0 || len < 0 || len > b.length - off) {
185 throw new IndexOutOfBoundsException();
186 }
else if (len == 0) {
189 long fileSize = content.getSize();
// stringAtTempBoundary is raised further down when a string had to be split
// because it did not fit in the request; it is cleared again here.
196 if (stringAtTempBoundary) {
200 stringAtTempBoundary =
false;
// Starts false for every call; updated per character in the loop below.
203 boolean singleConsecZero =
false;
// NOTE(review): newCurLen is computed in statements not visible here.
205 while (newCurLen < len) {
// The read buffer has been consumed: fetch the next chunk of content.
207 if (readBufOffset > bytesInReadBuf - 1) {
211 bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
212 }
catch (TskException ex) {
// The read failed: check whether text is still pending, either already in
// curString or as a long-enough candidate in tempString. What is done with
// it is not visible in this listing.
213 if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
// Nothing was read: same pending-text check as in the catch block above.
223 if (bytesInReadBuf < 1) {
224 if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
// NOTE(review): assuming curReadBuf is a byte[], this cast sign-extends, so
// bytes >= 0x80 become chars 0xFF80..0xFFFF rather than 0x80..0xFF. Confirm
// the printable-character test (not visible here) rejects those values.
240 char c = (char) curReadBuf[readBufOffset++];
// True only when c is 0 and the flag was false for the previous character.
// Presumably this tolerates isolated zero bytes between characters, as in
// UTF-16-encoded ASCII. TODO confirm.
241 singleConsecZero = c == 0 && singleConsecZero ==
false;
// The condition under which c is appended is not visible in this listing.
243 tempString.append(c);
245 if (tempStringLen >= MIN_PRINTABLE_CHARS) {
249 }
else if (!singleConsecZero) {
// The candidate string ends here. It is kept (terminated with NLS and
// appended to curString) when it is long enough or continues a string
// flagged by stringAtBufBoundary.
251 if (tempStringLen >= MIN_PRINTABLE_CHARS || stringAtBufBoundary) {
253 tempString.append(NLS);
255 curString.append(tempString);
257 stringAtBufBoundary =
false;
// Start a fresh candidate.
260 tempString =
new StringBuilder();
// The condition enclosing this assignment is not visible in this listing.
269 stringAtBufBoundary =
true;
274 if (tempStringLen >= MIN_PRINTABLE_CHARS) {
// The pending output would exceed the request: move only the first
// appendChars characters to curString, keep the remainder in tempString,
// and record the split in stringAtTempBoundary.
275 if (newCurLen > len) {
279 String toAppend = tempString.substring(0, appendChars);
280 String newTemp = tempString.substring(appendChars);
281 curString.append(toAppend);
282 curStringLen += appendChars;
283 tempString =
new StringBuilder(newTemp);
284 tempStringLen = newTemp.length();
285 stringAtTempBoundary =
true;
// Whole candidate is appended and tempString reset; the branch header for
// this path is not visible in this listing.
288 curString.append(tempString);
291 tempString =
new StringBuilder();
// Append a remaining long-enough candidate and reset tempString.
307 if (tempStringLen >= MIN_PRINTABLE_CHARS) {
308 curString.append(tempString);
310 tempString =
new StringBuilder();
// Encode the accumulated text and copy it into the caller's buffer.
318 final String curStringS = curString.toString();
320 byte[] stringBytes = curStringS.getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
// NOTE(review): the copy length is min(curStringLen, len), a character count
// applied to a byte array. That is only correct while every extracted
// character encodes to exactly one byte. Confirm. The (int) cast is
// redundant because len is already an int.
321 System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (
int) len));
// Reset the accumulator.
324 curString =
new StringBuilder();
/**
 * Reads one byte by delegating to {@code read(byte[], int, int)} with the
 * one-element buffer {@code oneCharBuf}.
 *
 * NOTE(review): the statement between the delegate call and the return, and
 * everything after the return, is not visible in this listing; those lines
 * are left untouched.
 *
 * @return the byte read, as an int in the range 0..255
 * @throws IOException if the delegate read fails
 */
331 public int read() throws IOException {
332 final int read =
read(oneCharBuf, 0, 1);
// InputStream.read() must return 0..255. Returning the byte directly would
// sign-extend values >= 0x80 to negative ints, which callers can mistake
// for end-of-stream.
334 return oneCharBuf[0] & 0xFF;
/**
 * Skips over bytes of the stream by delegating to the superclass.
 *
 * NOTE(review): statements between the signature and the return are not
 * visible in this listing.
 *
 * @param n number of bytes to skip
 * @return the value returned by {@code super.skip(n)}
 * @throws IOException if the superclass implementation fails
 */
347 public long skip(
long n)
throws IOException {
350 return super.skip(n);
/**
 * Creates a stream for the given content, scripts and encoding flags.
 *
 * NOTE(review): how content, scripts and the two flags are stored is not
 * visible in this listing; only the nothingToDo assignment is.
 *
 * @param content      the content to extract strings from
 * @param scripts      the scripts to extract
 * @param extractUTF8  whether UTF-8 extraction is enabled
 * @param extractUTF16 whether UTF-16 extraction is enabled
 */
391 private InternationalStream(Content content, List<SCRIPT> scripts,
boolean extractUTF8,
boolean extractUTF16) {
// True when neither encoding is enabled.
395 this.nothingToDo = extractUTF8 ==
false && extractUTF16 ==
false;
/**
 * Reads one byte by delegating to {@code read(byte[], int, int)} with the
 * one-element buffer {@code oneCharBuf}.
 *
 * NOTE(review): statements before the delegate call, between it and the
 * return, and after the return are not visible in this listing; those lines
 * are left untouched.
 *
 * @return the byte read, as an int in the range 0..255
 * @throws IOException if the delegate read fails
 */
401 public int read() throws IOException {
405 final int read =
read(oneCharBuf, 0, 1);
// InputStream.read() must return 0..255. The bytes served here are encoded
// with DEFAULT_INDEXED_TEXT_CHARSET (UTF-8), so every non-ASCII character
// yields bytes >= 0x80. Returning the byte directly would sign-extend those
// to negative ints, which callers can mistake for end-of-stream.
407 return oneCharBuf[0] & 0xFF;
/**
 * Copies extracted text, encoded with DEFAULT_INDEXED_TEXT_CHARSET, into
 * {@code b}. Content is read in chunks of up to FILE_BUF_SIZE bytes; the
 * text extracted from a chunk is held in convertBuff and served from there.
 *
 * NOTE(review): only part of this method is visible in this listing (some
 * conditions, closing braces, the extraction call and the final return are
 * missing). The comments below describe the visible statements only.
 *
 * @param b   destination buffer
 * @param off index in {@code b} at which the output is written
 * @param len maximum number of bytes requested
 * @return when no converted bytes are left to serve, the number of bytes
 *         copied so far, or -1 if none were copied; other return paths are
 *         not visible here
 * @throws IndexOutOfBoundsException if {@code off} or {@code len} is
 *                                   negative, or {@code len} exceeds
 *                                   {@code b.length - off}
 * @throws NullPointerException      from the first visible throw; its guard
 *                                   is not visible in this listing
 */
414 public int read(byte[] b,
int off,
int len)
throws IOException {
// Argument checks; the condition guarding this first throw is not visible.
416 throw new NullPointerException();
417 }
else if (off < 0 || len < 0 || len > b.length - off) {
418 throw new IndexOutOfBoundsException();
419 }
else if (len == 0) {
425 long fileSize = content.getSize();
// offsetUser is the absolute write index into b; bytesToUser is the count
// delivered so far.
432 int offsetUser = off;
// NOTE(review): `offsetUser < len` compares an absolute index against a
// count. offsetUser starts at off and advances together with bytesToUser, so
// for off > 0 the loop stops after len - off bytes, and for off >= len it
// never runs. The bound was probably meant to be off + len (or just
// bytesToUser < len). Confirm and fix together with toCopy below.
433 while (bytesToUser < len && offsetUser < len) {
// Refill only when no converted bytes are left to serve, end of file has not
// been flagged, and unread content remains.
436 if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
// Never request more than the buffer size or the bytes remaining.
442 toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
444 int read = content.read(fileReadBuff, fileReadOffset, toRead);
// -1 and 0 are treated alike; the body of this branch is not visible here.
445 if (read == -1 || read == 0) {
448 fileReadOffset +=
read;
// All content has been read; the body of this branch is not visible here.
449 if (fileReadOffset >= fileSize) {
456 }
catch (TskCoreException ex) {
// Nothing left to serve: report what was delivered in this call, or -1.
461 if (convertBuff == null || convertBuffRemain == 0) {
463 return bytesToUser > 0 ? bytesToUser : -1;
// NOTE(review): `len - offsetUser` has the same index-versus-count mix-up as
// the loop condition; the space still owed to the caller is
// len - bytesToUser.
470 final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
471 System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
// Advance the source position, destination position and delivered count.
473 convertBuffOffset += toCopy;
474 offsetUser += toCopy;
475 bytesToUser += toCopy;
// Encode the text of the latest extraction result into the convert buffer.
489 convertBuff =
lastExtractResult.getText().getBytes(Charset.forName(DEFAULT_INDEXED_TEXT_CHARSET));
// The conditions selecting between these two assignments are not visible in
// this listing.
492 bytesInConvertBuff = 0;
494 bytesInConvertBuff = convertBuff.length;
// Serve the new convert buffer from its start.
496 convertBuffOffset = 0;