19 package org.sleuthkit.autopsy.keywordsearch;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.BufferedInputStream;
23 import java.io.Reader;
24 import org.apache.tika.parser.txt.CharsetDetector;
25 import org.apache.tika.parser.txt.CharsetMatch;
32 final class TextFileExtractor {
39 static final private int MIN_MATCH_CONFIDENCE = 20;
41 public Reader getReader(AbstractFile source)
throws TextFileExtractorException {
42 CharsetDetector detector =
new CharsetDetector();
44 InputStream stream =
new BufferedInputStream(
new ReadContentInputStream(source));
46 detector.setText(stream);
47 }
catch (IOException ex) {
48 throw new TextFileExtractorException(
"Unable to get string from detected text in TextFileExtractor", ex);
50 CharsetMatch match = detector.detect();
52 throw new TextFileExtractorException(
"Unable to detect any matches using TextFileExtractor");
53 }
else if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
54 throw new TextFileExtractorException(
"Text does not match any character set with a high enough confidence for TextFileExtractor");
57 return match.getReader();