19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.InputStreamReader;
24 import java.util.ArrayList;
25 import java.util.HashMap;
26 import java.util.List;
28 import java.util.logging.Level;
39 class StringsTextExtractor
extends ContentTextExtractor {
// Logger for this class, obtained from Logger.getLogger with the class name.
41 static final private Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
// Scripts to extract. The list instance itself is final; setScripts() replaces
// its contents in place and getScripts() hands out a copy.
51 private final List<SCRIPT> extractScripts =
new ArrayList<>();
// Extraction options keyed by option name. isDisabled() and getInputStream()
// look up the EXTRACT_UTF8 / EXTRACT_UTF16 entries and parse the values with
// Boolean.parseBoolean. Not final: setOptions() swaps in the caller's map.
52 private Map<String, String> extractOptions =
new HashMap<>();
/**
 * Creates an extractor whose script list starts out containing
 * SCRIPT.LATIN_2.
 */
54 public StringsTextExtractor() {
// NOTE(review): getInputStream() only selects EnglishOnlyStream for a lone
// SCRIPT.LATIN_1, so a list holding just this entry would go down the
// InternationalStream path - confirm that is the intended default.
56 extractScripts.add(SCRIPT.LATIN_2);
/**
 * Replaces the configured scripts with the contents of the given list.
 * The elements are copied into the internal list; the argument itself is
 * not retained.
 *
 * @param extractScripts scripts to use from now on
 */
64 public void setScripts(List<SCRIPT> extractScripts) {
// Clear first, then copy, so the internal (final) list instance is reused.
65 this.extractScripts.clear();
66 this.extractScripts.addAll(extractScripts);
/**
 * Returns the configured scripts.
 *
 * @return a new ArrayList holding the current scripts; modifying it does not
 *         affect this extractor
 */
74 public List<SCRIPT> getScripts() {
75 return new ArrayList<>(extractScripts);
/**
 * Returns the extraction options map.
 *
 * @return the internal options map itself (no copy is made), so changes made
 *         by the caller are seen by this extractor
 */
84 public Map<String, String> getOptions() {
// NOTE(review): unlike getScripts(), this exposes the live map - confirm
// whether callers rely on mutating it before switching to a copy.
85 return extractOptions;
/**
 * Replaces the extraction options with the given map. The reference is
 * stored as-is; the map is not copied.
 *
 * @param options option name to value map to use from now on
 */
93 public void setOptions(Map<String, String> options) {
// NOTE(review): a null argument is stored unchecked and would make the
// extractOptions.get(...) calls in isDisabled()/getInputStream() throw.
94 this.extractOptions = options;
/**
 * Writes a warning to this class's logger.
 *
 * @param msg message to log
 * @param ex  exception to attach to the log record
 */
98 public void logWarning(
final String msg, Exception ex) {
99 logger.log(Level.WARNING, msg, ex);
/**
 * Reports whether both encodings are switched off in the options.
 *
 * @return true when neither the EXTRACT_UTF8 nor the EXTRACT_UTF16 option
 *         parses to true
 */
103 public boolean isDisabled() {
// Boolean.parseBoolean(null) is false, so an option that is absent from the
// map counts as "off".
104 boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
105 boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
107 return extractUTF8 ==
false && extractUTF16 ==
false;
/**
 * Opens a character reader over the extracted strings of the given content.
 *
 * @param content content to extract strings from
 * @return an InputStreamReader wrapping getInputStream(content), decoding
 *         with Server.DEFAULT_INDEXED_TEXT_CHARSET
 * @throws TextExtractorException declared by the signature
 */
111 public InputStreamReader getReader(Content content)
throws TextExtractorException {
112 InputStream stringStream = getInputStream(content);
113 return new InputStreamReader(stringStream, Server.DEFAULT_INDEXED_TEXT_CHARSET);
/**
 * Chooses the string-extracting stream implementation for the content.
 *
 * @param content content to extract strings from
 * @return an EnglishOnlyStream when SCRIPT.LATIN_1 is the only configured
 *         script, otherwise an InternationalStream built from the script
 *         list and the UTF-8 / UTF-16 option flags
 */
116 InputStream getInputStream(Content content) {
// Special case: exactly one script and it is LATIN_1.
118 if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
119 return new EnglishOnlyStream(content);
// Same option parsing as isDisabled(); a missing entry parses to false.
121 boolean extractUTF8 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF8.toString()));
122 boolean extractUTF16 = Boolean.parseBoolean(extractOptions.get(ExtractOptions.EXTRACT_UTF16.toString()));
124 return new InternationalStream(content, extractScripts, extractUTF8, extractUTF16);
129 public boolean isContentTypeSpecific() {
134 public boolean isSupported(Content content, String detectedFormat) {
// One-character string holding char code 10 (line feed). read() appends it
// to tempString before a finished run is moved into curString.
154 private static final String
NLS = Character.toString((
char) 10);
/**
 * Fills part of the caller's buffer with string text built from the
 * content's bytes.
 *
 * Bytes are pulled from content into curReadBuf, turned into chars one at a
 * time and collected in tempString; finished runs are moved into curString,
 * and bytes for curString are finally copied into b starting at off.
 *
 * @param b   destination buffer
 * @param off index in b at which to start writing
 * @param len maximum number of bytes to write
 * @throws NullPointerException      from the first argument check
 * @throws IndexOutOfBoundsException if off or len is negative, or len
 *                                   exceeds b.length - off
 * @throws IOException               declared by the signature
 */
187 public int read(byte[] b,
int off,
int len)
throws IOException {
189 throw new NullPointerException();
190 }
// Reject a window that does not fit inside b.
else if (off < 0 || len < 0 || len > b.length - off) {
191 throw new IndexOutOfBoundsException();
192 }
else if (len == 0) {
195 long fileSize = content.getSize();
202 if (stringAtTempBoundary) {
206 stringAtTempBoundary =
false;
209 boolean singleConsecZero =
false;
211 while (newCurLen < len) {
// Refill from content once readBufOffset has moved past the last valid
// byte in curReadBuf.
213 if (readBufOffset > bytesInReadBuf - 1) {
217 bytesInReadBuf = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
218 }
catch (TskException ex) {
219 if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
229 if (bytesInReadBuf < 1) {
230 if (curStringLen > 0 || tempStringLen >= MIN_PRINTABLE_CHARS) {
// NOTE(review): if curReadBuf is a byte[], this cast sign-extends, so bytes
// >= 0x80 become chars in 0xFF80-0xFFFF rather than 0x80-0xFF - confirm the
// checks that follow account for that.
246 char c = (char) curReadBuf[readBufOffset++];
// singleConsecZero is raised only for the first zero of a run of zeros.
// presumably this lets a single NUL between characters through (e.g.
// ASCII stored as UTF-16) - TODO confirm
247 if (c == 0 && singleConsecZero ==
false) {
249 singleConsecZero =
true;
251 singleConsecZero =
false;
254 tempString.append(c);
256 if (tempStringLen >= MIN_PRINTABLE_CHARS) {
260 }
else if (!singleConsecZero) {
// A run is kept when it reached MIN_PRINTABLE_CHARS or stringAtBufBoundary
// is set; NLS is appended to it before it is moved into curString.
262 if (tempStringLen >= MIN_PRINTABLE_CHARS || stringAtBufBoundary) {
264 tempString.append(NLS);
266 curString.append(tempString);
268 stringAtBufBoundary =
false;
271 tempString =
new StringBuilder();
280 stringAtBufBoundary =
true;
285 if (tempStringLen >= MIN_PRINTABLE_CHARS) {
// The pending run would exceed len: move its first appendChars characters
// into curString, keep the remainder in a fresh tempString, and flag
// stringAtTempBoundary so the next call knows a run was split.
286 if (newCurLen > len) {
290 String toAppend = tempString.substring(0, appendChars);
291 String newTemp = tempString.substring(appendChars);
292 curString.append(toAppend);
293 curStringLen += appendChars;
294 tempString =
new StringBuilder(newTemp);
295 tempStringLen = newTemp.length();
296 stringAtTempBoundary =
true;
299 curString.append(tempString);
302 tempString =
new StringBuilder();
318 if (tempStringLen >= MIN_PRINTABLE_CHARS) {
319 curString.append(tempString);
321 tempString =
new StringBuilder();
329 final String curStringS = curString.toString();
// Copy into the caller's buffer at off, capped at len. The (int) cast is a
// no-op because len is already an int.
// NOTE(review): the copy length is curStringLen, which is advanced in
// characters above; confirm it always matches the number of bytes in
// stringBytes.
332 System.arraycopy(stringBytes, 0, b, off, Math.min(curStringLen, (
int) len));
// Start the next call with an empty accumulator.
335 curString =
new StringBuilder();
/**
 * Single-byte read implemented on top of read(byte[], int, int) using the
 * one-element buffer oneCharBuf.
 *
 * @throws IOException declared by the signature
 */
342 public int read() throws IOException {
343 final int read =
read(oneCharBuf, 0, 1);
// NOTE(review): oneCharBuf is passed as the byte[] argument above, so this
// returns a signed byte; InputStream.read() is specified to return 0-255
// (or -1 at end of stream). Values >= 0x80 would come back negative -
// consider masking with & 0xFF.
345 return oneCharBuf[0];
/**
 * Skips bytes by delegating to the superclass implementation.
 *
 * @param n number of bytes to skip
 * @return whatever super.skip(n) reports
 * @throws IOException declared by the signature
 */
358 public long skip(
long n)
throws IOException {
361 return super.skip(n);
/**
 * Creates a stream for the given content, scripts and encoding flags.
 *
 * @param content      content to read from
 * @param scripts      scripts to extract
 * @param extractUTF8  whether UTF-8 extraction is enabled
 * @param extractUTF16 whether UTF-16 extraction is enabled
 */
403 private InternationalStream(Content content, List<SCRIPT> scripts,
boolean extractUTF8,
boolean extractUTF16) {
// With both encodings disabled the stream is marked as having nothing to do.
407 this.nothingToDo = extractUTF8 ==
false && extractUTF16 ==
false;
/**
 * Single-byte read implemented on top of read(byte[], int, int) using the
 * one-element buffer oneCharBuf.
 *
 * @throws IOException declared by the signature
 */
413 public int read() throws IOException {
417 final int read =
read(oneCharBuf, 0, 1);
// NOTE(review): oneCharBuf is passed as the byte[] argument above, so this
// returns a signed byte; InputStream.read() is specified to return 0-255
// (or -1 at end of stream). Values >= 0x80 would come back negative -
// consider masking with & 0xFF.
419 return oneCharBuf[0];
/**
 * Copies converted string bytes into the caller's buffer, reading more of
 * the content into fileReadBuff whenever the conversion buffer is empty.
 *
 * @param b   destination buffer
 * @param off index in b at which to start writing
 * @param len maximum number of bytes to write
 * @return on the path where no converted data remains: the number of bytes
 *         copied so far, or -1 if none were copied
 * @throws NullPointerException      from the first argument check
 * @throws IndexOutOfBoundsException if off or len is negative, or len
 *                                   exceeds b.length - off
 * @throws IOException               declared by the signature
 */
426 public int read(byte[] b,
int off,
int len)
throws IOException {
428 throw new NullPointerException();
429 }
// Reject a window that does not fit inside b.
else if (off < 0 || len < 0 || len > b.length - off) {
430 throw new IndexOutOfBoundsException();
431 }
else if (len == 0) {
437 long fileSize = content.getSize();
444 int offsetUser = off;
// NOTE(review): offsetUser starts at off but is bounded by len here and in
// the toCopy computation below, i.e. len is treated as an end index rather
// than a count. With off > 0 fewer than len bytes can be copied, and with
// off >= len the loop body never runs - looks like off + len was intended;
// confirm.
445 while (bytesToUser < len && offsetUser < len) {
// Only touch the content when there is no converted data left to hand out
// and the file has not been exhausted.
448 if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
// Never request more than what remains of the file.
454 toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
456 int read = content.read(fileReadBuff, fileReadOffset, toRead);
// A result of -1 and a result of 0 are handled the same way.
457 if (read == -1 || read == 0) {
460 fileReadOffset +=
read;
461 if (fileReadOffset >= fileSize) {
468 }
catch (TskCoreException ex) {
// Nothing converted is available: report what was copied so far, or -1.
474 if (convertBuff == null || convertBuffRemain == 0) {
476 return bytesToUser > 0 ? bytesToUser : -1;
// Hand over as much converted data as is available and still fits.
483 final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
484 System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
486 convertBuffOffset += toCopy;
487 offsetUser += toCopy;
488 bytesToUser += toCopy;
505 bytesInConvertBuff = 0;
507 bytesInConvertBuff = convertBuff.length;
509 convertBuffOffset = 0;
static final Charset DEFAULT_INDEXED_TEXT_CHARSET
default Charset to index text as
synchronized static Logger getLogger(String name)