19 package org.sleuthkit.autopsy.textreaders;
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.io.CharSource;
23 import com.google.common.util.concurrent.ThreadFactoryBuilder;
25 import java.io.FileInputStream;
26 import java.io.FileNotFoundException;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.PushbackReader;
30 import java.io.Reader;
31 import java.nio.file.Paths;
32 import java.util.Arrays;
33 import java.util.HashSet;
34 import java.util.List;
35 import java.util.Objects;
36 import java.util.concurrent.Callable;
37 import java.util.concurrent.ExecutorService;
38 import java.util.concurrent.Executors;
39 import java.util.concurrent.Future;
40 import java.util.concurrent.ThreadFactory;
41 import java.util.concurrent.TimeUnit;
42 import java.util.concurrent.TimeoutException;
43 import java.util.logging.Level;
44 import java.util.stream.Collectors;
45 import java.util.stream.Stream;
46 import org.apache.commons.io.FilenameUtils;
47 import org.apache.tika.Tika;
48 import org.apache.tika.metadata.Metadata;
49 import org.apache.tika.parser.AutoDetectParser;
50 import org.apache.tika.parser.ParseContext;
51 import org.apache.tika.parser.Parser;
52 import org.apache.tika.parser.ParsingReader;
53 import org.apache.tika.parser.microsoft.OfficeParserConfig;
54 import org.apache.tika.parser.ocr.TesseractOCRConfig;
55 import org.apache.tika.parser.pdf.PDFParserConfig;
56 import org.openide.util.NbBundle;
57 import org.openide.modules.InstalledFileLocator;
58 import org.openide.util.Lookup;
74 final class TikaTextExtractor
extends TextExtractor {
// MIME types for generic binary content. isSupported() tests the detected
// format against this list.
// NOTE(review): presumably a match means "not supported" — confirm in isSupported().
78 private static final List<String> BINARY_MIME_TYPES
81 "application/octet-stream",
82 "application/x-msdownload");
// MIME types for archive/compressed/container formats. isSupported() tests
// the detected format against this list.
// NOTE(review): presumably a match means "not supported" — confirm in isSupported().
88 private static final List<String> ARCHIVE_MIME_TYPES
91 "application/x-7z-compressed",
92 "application/x-ace-compressed",
93 "application/x-alz-compressed",
95 "application/vnd.ms-cab-compressed",
96 "application/x-cfs-compressed",
97 "application/x-dgc-compressed",
98 "application/x-apple-diskimage",
99 "application/x-gca-compressed",
103 "application/x-rar-compressed",
104 "application/x-stuffit",
105 "application/x-stuffitx",
106 "application/x-gtar",
107 "application/x-archive",
108 "application/x-executable",
109 "application/x-gzip",
112 "application/x-cpio",
113 "application/x-shar",
115 "application/x-bzip",
116 "application/x-bzip2",
117 "application/x-lzip",
118 "application/x-lzma",
119 "application/x-lzop",
121 "application/x-compress");
// java.util.logging logger registered under the name "Tika"; getReader() logs
// parse failures to it.
123 private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger(
"Tika");
// Threads created for parsing are named "tika-reader-<n>".
125 private final ThreadFactory tikaThreadFactory =
126 new ThreadFactoryBuilder().setNameFormat(
"tika-reader-%d").build();
// Single-thread executor, one per extractor instance; getReader() submits the
// reader-creation task to it.
127 private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
// MIME type tested explicitly in isSupported().
128 private static final String SQLITE_MIMETYPE =
"application/x-sqlite3";
// Parser handed to the reader-creation task and registered in the ParseContext.
130 private final AutoDetectParser parser =
new AutoDetectParser();
// The content this extractor reads; assigned once in the constructor.
131 private final Content content;
// OCR switch; assigned by setExtractionSettings() from an ImageConfig.
133 private boolean tesseractOCREnabled;
// Relative folder and file name used to locate the Tesseract executable.
134 private static final String TESSERACT_DIR_NAME =
"Tesseract-OCR";
135 private static final String TESSERACT_EXECUTABLE =
"tesseract.exe";
// Declaration order matters: TESSERACT_PATH is initialized before
// LANGUAGE_PACKS, and getLanguagePacks() reads TESSERACT_PATH.
// NOTE(review): getLanguagePacks() calls TESSERACT_PATH.getParent() without a
// null check, while ocrEnabled() treats a null TESSERACT_PATH as possible. If
// locateTesseractExecutable() can return null, static initialization of this
// class would fail — confirm.
136 private static final File TESSERACT_PATH = locateTesseractExecutable();
137 private static final String LANGUAGE_PACKS = getLanguagePacks();
// Optional terminator for the external Tesseract process; assigned by
// setExtractionSettings() and consulted in runOcrAndGetOutputStream().
138 private ProcessTerminator processTerminator;
// Appended to the file id to form the base name of the Tesseract output file.
139 private static final String TESSERACT_OUTPUT_FILE_NAME =
"output";
// Media types reported as supported by the parser of a default-constructed
// Tika instance, each rendered as "type/subtype". isSupported() returns whether
// the detected format is contained in this list.
141 private static final List<String> TIKA_SUPPORTED_TYPES
142 =
new Tika().getParser().getSupportedTypes(
new ParseContext())
144 .map(mt -> mt.getType() +
"/" + mt.getSubtype())
145 .collect(Collectors.toList());
/**
 * Creates an extractor bound to the given content.
 *
 * @param content the content stored in this extractor and later read by
 *                getReader()
 */
147 public TikaTextExtractor(Content content) {
148 this.content = content;
/**
 * Reports whether OCR should be attempted.
 *
 * All three conditions must hold: a Tesseract executable was located
 * (TESSERACT_PATH is non-null), the tesseractOCREnabled flag is set, and
 * PlatformUtil.isWindowsOS() returns true.
 *
 * @return true only when every condition above holds
 */
158 private boolean ocrEnabled() {
159 return TESSERACT_PATH != null && tesseractOCREnabled
// NOTE(review): the comparison with true is redundant for a boolean result.
160 && PlatformUtil.isWindowsOS() ==
true;
/**
 * Builds a Reader over the text Tika extracts from the content, followed by a
 * metadata section.
 *
 * The reader is created on the single-thread executor and awaited with a
 * timeout computed by getTimeout() from the content size.
 *
 * @return a reader over the extracted text concatenated with the metadata
 *         text produced by getMetaDataCharSource()
 * @throws ExtractionException on timeout, when Tika returns an empty reader,
 *                             or when any other exception is raised here
 */
175 public Reader getReader() throws ExtractionException {
176 InputStream stream = null;
178 ParseContext parseContext =
new ParseContext();
// Register the parser in the context under Parser.class.
179 parseContext.set(Parser.class, parser);
// OCR handling applies only to AbstractFile content when ocrEnabled() holds.
181 if (ocrEnabled() && content instanceof AbstractFile) {
182 AbstractFile file = ((AbstractFile) content);
// Image MIME types are run through Tesseract directly; the stream it yields is
// what gets passed to the Tika reader task below.
184 if (file.getMIMEType().toLowerCase().startsWith(
"image/")) {
185 stream = runOcrAndGetOutputStream(file);
// NOTE(review): presumably the configuration below belongs to the non-image
// branch of the OCR case — confirm against the full method.
189 PDFParserConfig pdfConfig =
new PDFParserConfig();
// Ask the PDF parser to extract inline images, counting each unique image once.
194 pdfConfig.setExtractInlineImages(
true);
196 pdfConfig.setExtractUniqueInlineImagesOnly(
true);
197 parseContext.set(PDFParserConfig.class, pdfConfig);
// Point Tika's Tesseract configuration at the folder containing the located
// executable and at the discovered language packs.
200 TesseractOCRConfig ocrConfig =
new TesseractOCRConfig();
201 String tesseractFolder = TESSERACT_PATH.getParent();
202 ocrConfig.setTesseractPath(tesseractFolder);
210 ocrConfig.setLanguage(LANGUAGE_PACKS);
211 parseContext.set(TesseractOCRConfig.class, ocrConfig);
213 stream =
new ReadContentInputStream(content);
// NOTE(review): presumably the fallback when OCR is not enabled — confirm.
216 stream =
new ReadContentInputStream(content);
// Filled in by Tika during parsing; rendered into the trailing metadata section.
219 Metadata metadata =
new Metadata();
// Use the SAX-based extractors for pptx and docx.
222 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
223 officeParserConfig.setUseSAXPptxExtractor(
true);
224 officeParserConfig.setUseSAXDocxExtractor(
true);
225 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Create the reader on the executor thread and wait for it, bounded by the
// size-based timeout.
228 Future<Reader> future = executorService.submit(
229 new GetTikaReader(parser, stream, metadata, parseContext));
231 final Reader tikaReader = future.get(getTimeout(content.getSize()),
// Read one character ahead and push it back, so the reader handed to the
// caller is still positioned at the first character.
234 PushbackReader pushbackReader =
new PushbackReader(tikaReader);
235 int read = pushbackReader.read();
// NOTE(review): presumably thrown when the first read signals end of stream — confirm.
237 throw new ExtractionException(
"Unable to extract text: "
238 +
"Tika returned empty reader for " + content);
240 pushbackReader.unread(read);
// Append the metadata section after the extracted text.
243 CharSource metaDataCharSource = getMetaDataCharSource(metadata);
244 return CharSource.concat(
new ReaderCharSource(pushbackReader),
245 metaDataCharSource).openStream();
246 }
// Timeout: wrap in an ExtractionException with a localized message, keeping
// the TimeoutException as the cause.
catch (TimeoutException te) {
247 final String msg = NbBundle.getMessage(this.getClass(),
248 "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
249 content.getId(), content.getName());
250 throw new ExtractionException(msg, te);
251 }
// NOTE(review): presumably rethrows the ExtractionException unchanged so the
// generic handler below does not wrap it again — confirm.
catch (ExtractionException ex) {
253 }
// Anything else: log a warning to the Tika logger, then wrap in an
// ExtractionException with a localized message and the original cause.
catch (Exception ex) {
254 tikaLogger.log(Level.WARNING,
"Exception: Unable to Tika parse the "
255 +
"content" + content.getId() +
": " + content.getName(),
257 final String msg = NbBundle.getMessage(this.getClass(),
258 "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
259 content.getId(), content.getName());
260 throw new ExtractionException(msg, ex);
/**
 * Runs the Tesseract executable on the content and returns a stream over the
 * text file it produces.
 *
 * The content is first written to a file in the current case's temp
 * directory. Tesseract is then invoked with that file, an output base path in
 * the same directory, and "-l" followed by LANGUAGE_PACKS.
 *
 * @param file supplies the id and name used to build the temp file names
 * @return a CleanUpStream over the output path with ".txt" appended
 * @throws ExtractionException when there is no current case or an
 *                             IOException occurs
 */
276 private InputStream runOcrAndGetOutputStream(AbstractFile file)
throws ExtractionException {
277 File inputFile = null;
278 File outputFile = null;
// Input temp file name is the file id immediately followed by the file name.
281 String tempFileName = file.getId() + file.getName();
282 inputFile = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
283 tempFileName).toFile();
// Writes the extractor's content field rather than the file parameter;
// getReader() passes that same object, cast to AbstractFile.
284 ContentUtils.writeToFile(content, inputFile);
// Output base name is the file id followed by TESSERACT_OUTPUT_FILE_NAME.
286 String tempOutputName = file.getId() + TESSERACT_OUTPUT_FILE_NAME;
287 String outputFilePath = Paths.get(Case.getCurrentCaseThrows().getTempDirectory(),
288 tempOutputName).toString();
289 String executeablePath = TESSERACT_PATH.toString();
292 ProcessBuilder process =
new ProcessBuilder();
// Both paths are wrapped in literal double quotes before being passed as
// separate arguments.
293 process.command(executeablePath,
294 String.format(
"\"%s\"", inputFile.getAbsolutePath()),
295 String.format(
"\"%s\"", outputFilePath),
297 "-l", LANGUAGE_PACKS);
// With a terminator configured, use the ExecUtil overload that accepts it.
// NOTE(review): presumably "1, TimeUnit.SECONDS" is the interval at which the
// terminator is consulted — confirm against ExecUtil.
301 if (processTerminator != null) {
302 ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
304 ExecUtil.execute(process);
// NOTE(review): presumably Tesseract appends ".txt" to the output base path — confirm.
307 outputFile =
new File(outputFilePath +
".txt");
309 return new CleanUpStream(outputFile);
310 }
catch (NoCurrentCaseException | IOException ex) {
// NOTE(review): presumably removes the partial output file before failing — confirm.
311 if (outputFile != null) {
314 throw new ExtractionException(
"Could not successfully run Tesseract", ex);
// NOTE(review): presumably removes the temporary input file — confirm.
316 if (inputFile != null) {
333 Metadata metadata, ParseContext parseContext) {
/**
 * Creates the Tika reader.
 *
 * @return a new ParsingReader built from the parser, stream, metadata and
 *         parse context referenced here
 * @throws Exception declared by the method signature
 */
341 public Reader
call() throws Exception {
342 return new ParsingReader(parser, stream, metadata, parseContext);
372 public void close() throws IOException {
/**
 * Looks up the Tesseract executable through the InstalledFileLocator.
 *
 * The relative path searched is TESSERACT_DIR_NAME joined with
 * TESSERACT_EXECUTABLE, under this class's package name as the code name base.
 * Initializes the static TESSERACT_PATH field.
 *
 * @return the located file; NOTE(review): presumably null when the file is
 *         not found or not executable, since ocrEnabled() null-checks
 *         TESSERACT_PATH — confirm
 */
389 private static File locateTesseractExecutable() {
394 String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
395 File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(),
false);
// Not found by the locator.
396 if (null == exeFile) {
// Found, but not executable.
400 if (!exeFile.canExecute()) {
/**
 * Renders Tika metadata as a text section.
 *
 * The text starts with a "METADATA" banner line, followed by one "key: value"
 * entry per metadata name. Entries are sorted by name and joined with newlines.
 *
 * @param metadata the metadata whose names and values are rendered
 * @return a CharSource wrapping the rendered text
 */
415 static private CharSource getMetaDataCharSource(Metadata metadata) {
416 return CharSource.wrap(
417 new StringBuilder(
"\n\n------------------------------METADATA------------------------------\n\n")
418 .append(Stream.of(metadata.names()).sorted()
419 .map(key -> key +
": " + metadata.get(key))
420 .collect(Collectors.joining(
"\n"))
/**
 * Decides whether this extractor handles the detected format.
 *
 * A first check matches a null format, the binary and archive MIME lists,
 * any "video/" type other than "video/x-flv", and the SQLite MIME type.
 * NOTE(review): presumably that check returns false — confirm. Otherwise the
 * result is whether TIKA_SUPPORTED_TYPES contains the format.
 *
 * @param content        not referenced in the visible part of this method
 * @param detectedFormat the detected MIME type; may be null
 * @return whether the format is in TIKA_SUPPORTED_TYPES, unless the first
 *         check applies
 */
433 public boolean isSupported(Content content, String detectedFormat) {
434 if (detectedFormat == null
435 || BINARY_MIME_TYPES.contains(detectedFormat)
436 || ARCHIVE_MIME_TYPES.contains(detectedFormat)
// Every video type except flash video matches this clause.
437 || (detectedFormat.startsWith(
"video/") && !detectedFormat.equals(
"video/x-flv"))
438 || detectedFormat.equals(SQLITE_MIMETYPE)
442 return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
/**
 * Collects language pack names from the "tessdata" folder beside the
 * Tesseract executable. Initializes the static LANGUAGE_PACKS field.
 *
 * Each pack name is the first three characters of a file name; duplicates
 * collapse in the set, and the names are joined with "+".
 *
 * NOTE(review): TESSERACT_PATH is dereferenced without a null check — confirm
 * it cannot be null here.
 *
 * @return the "+"-joined pack names; empty when none are added. The order is
 *         unspecified because the names are held in a HashSet.
 */
451 private static String getLanguagePacks() {
452 File languagePackRootDir =
new File(TESSERACT_PATH.getParent(),
"tessdata");
// Extensions checked when filtering the files in the folder.
456 List<String> acceptableExtensions = Arrays.asList(
"traineddata",
"params",
457 "lm",
"fold",
"bigrams",
"nn",
"word-freq",
"size",
458 "user-patterns",
"user-words");
460 HashSet<String> languagePacks =
new HashSet<>();
461 if (languagePackRootDir.exists()) {
// NOTE(review): File.listFiles() returns null on an I/O error or when the path
// is not a directory — confirm that cannot occur here.
462 for (File languagePack : languagePackRootDir.listFiles()) {
// NOTE(review): presumably directories and files with other extensions are
// skipped — confirm.
463 if (languagePack.isDirectory() || !acceptableExtensions.contains(
464 FilenameUtils.getExtension(languagePack.getName()))) {
// The first three characters of the file name are taken as the pack name.
467 String threeLetterPackageName = languagePack.getName().substring(0, 3);
469 languagePacks.add(threeLetterPackageName);
472 return String.join(
"+", languagePacks);
/**
 * Picks a parse timeout from the content size.
 *
 * The visible size thresholds are 1 MiB, 10 MiB and 100 MiB. getReader()
 * passes the result as the timeout argument of Future.get().
 * NOTE(review): the value for each tier and the time unit are not shown in
 * this method — confirm before relying on them.
 *
 * @param size content size; getReader() passes content.getSize(), presumably
 *             a byte count — confirm
 * @return the timeout for the tier that size falls into
 */
482 private static int getTimeout(
long size) {
// The long literal keeps the threshold arithmetic in long rather than int.
483 if (size < 1024 * 1024L)
486 }
else if (size < 10 * 1024 * 1024L)
489 }
else if (size < 100 * 1024 * 1024L)
/**
 * Applies optional settings found in the given Lookup.
 *
 * An ImageConfig with a non-null OCR flag sets tesseractOCREnabled. A
 * ProcessTerminator, when present, is stored in processTerminator. A null
 * context, or a missing entry, leaves the corresponding field unchanged.
 *
 * @param context lookup that may hold an ImageConfig and/or a
 *                ProcessTerminator; may be null
 */
508 public void setExtractionSettings(Lookup context) {
509 if (context != null) {
510 ImageConfig configInstance = context.lookup(ImageConfig.class);
// The flag is null-checked before it is assigned to the primitive boolean field.
511 if (configInstance != null && Objects.nonNull(configInstance.getOCREnabled())) {
512 this.tesseractOCREnabled = configInstance.getOCREnabled();
515 ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
516 if (terminatorInstance != null) {
517 this.processTerminator = terminatorInstance;