19 package org.sleuthkit.autopsy.textextractors;
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.io.CharSource;
23 import com.google.common.util.concurrent.ThreadFactoryBuilder;
25 import java.io.FileInputStream;
26 import java.io.FileNotFoundException;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.PushbackReader;
30 import java.io.Reader;
31 import java.nio.file.Paths;
32 import java.util.List;
33 import java.util.Objects;
34 import java.util.concurrent.Callable;
35 import java.util.concurrent.ExecutorService;
36 import java.util.concurrent.Executors;
37 import java.util.concurrent.Future;
38 import java.util.concurrent.ThreadFactory;
39 import java.util.concurrent.TimeUnit;
40 import java.util.concurrent.TimeoutException;
41 import java.util.logging.Level;
42 import java.util.stream.Collectors;
43 import java.util.stream.Stream;
44 import org.apache.tika.Tika;
45 import org.apache.tika.metadata.Metadata;
46 import org.apache.tika.parser.AutoDetectParser;
47 import org.apache.tika.parser.ParseContext;
48 import org.apache.tika.parser.Parser;
49 import org.apache.tika.parser.ParsingReader;
50 import org.apache.tika.parser.microsoft.OfficeParserConfig;
51 import org.apache.tika.parser.ocr.TesseractOCRConfig;
52 import org.apache.tika.parser.pdf.PDFParserConfig;
53 import org.openide.util.NbBundle;
54 import org.openide.modules.InstalledFileLocator;
55 import org.openide.util.Lookup;
72 final class TikaTextExtractor
implements TextExtractor {
/**
 * MIME types that identify opaque binary content. {@code isSupported()}
 * tests the detected type of the content against this list.
 */
// NOTE(review): the head of the initializer (the call these string arguments
// belong to) is not visible in this excerpt. The trailing ");" shows it is a
// call; presumably ImmutableList.of(...), given the import -- confirm.
76 private static final List<String> BINARY_MIME_TYPES
79 "application/octet-stream",
80 "application/x-msdownload");
/**
 * MIME types of archive and compressed containers. {@code isSupported()}
 * tests the detected type of the content against this list.
 */
// NOTE(review): the head of the initializer is not visible in this excerpt
// (presumably ImmutableList.of(...) -- confirm). The embedded line numbers
// skip values inside the list, so further entries may exist that are not
// shown here; do not treat the visible entries as the complete list.
86 private static final List<String> ARCHIVE_MIME_TYPES
89 "application/x-7z-compressed",
90 "application/x-ace-compressed",
91 "application/x-alz-compressed",
93 "application/vnd.ms-cab-compressed",
94 "application/x-cfs-compressed",
95 "application/x-dgc-compressed",
96 "application/x-apple-diskimage",
97 "application/x-gca-compressed",
101 "application/x-rar-compressed",
102 "application/x-stuffit",
103 "application/x-stuffitx",
104 "application/x-gtar",
105 "application/x-archive",
// Not an archive format by name, but listed here alongside the archives.
106 "application/x-executable",
107 "application/x-gzip",
110 "application/x-cpio",
111 "application/x-shar",
113 "application/x-bzip",
114 "application/x-bzip2",
115 "application/x-lzip",
116 "application/x-lzma",
117 "application/x-lzop",
119 "application/x-compress");
/** Logger named "Tika"; getReader() logs parse failures to it at WARNING level. */
121 private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger(
"Tika");
/** Thread factory that names the worker threads "tika-reader-N". */
123 private final ThreadFactory tikaThreadFactory
124 =
new ThreadFactoryBuilder().setNameFormat(
"tika-reader-%d").build();
/**
 * Single-thread executor owned by this instance (the field is not static).
 * getReader() submits the reader-creation task to it so that the wait can be
 * bounded by a timeout.
 */
// NOTE(review): no shutdown of this executor is visible in this excerpt --
// confirm it is shut down somewhere, otherwise each extractor instance may
// leave a worker thread behind.
125 private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
/** MIME type of SQLite databases; tested explicitly in isSupported(). */
126 private static final String SQLITE_MIMETYPE =
"application/x-sqlite3";
/**
 * Tika parser used for all parsing in this extractor. getReader() also
 * registers it in the ParseContext under Parser.class.
 */
128 private final AutoDetectParser parser =
new AutoDetectParser();
/** The content whose text is extracted; assigned once in the constructor. */
129 private final Content content;
/**
 * Whether OCR was requested through setExtractionSettings(). Not assigned at
 * declaration, so it is false until a setting is applied.
 */
131 private boolean tesseractOCREnabled;
/** Directory name used when locating the bundled Tesseract installation. */
132 private static final String TESSERACT_DIR_NAME =
"Tesseract-OCR";
/** File name of the Tesseract executable inside TESSERACT_DIR_NAME. */
133 private static final String TESSERACT_EXECUTABLE =
"tesseract.exe";
/**
 * Result of locateTesseractExecutable(), resolved once at class
 * initialization. ocrEnabled() treats a null value as "OCR unavailable".
 */
// NOTE(review): the type File is used here, but no java.io.File import is
// visible in this excerpt -- confirm it is imported in the full file.
134 private static final File TESSERACT_PATH = locateTesseractExecutable();
/**
 * OCR language packs joined by formatLanguagePacks(). Initialized from
 * PlatformUtil.getOcrLanguagePacks() and replaced by setExtractionSettings()
 * when the supplied configuration provides languages.
 */
135 private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
/** Suffix appended to the file id to build the Tesseract output base name in performOCR(). */
136 private static final String TESSERACT_OUTPUT_FILE_NAME =
"tess_output";
/**
 * Optional terminator supplied via setExtractionSettings(). When non-null,
 * performOCR() passes it to ExecUtil.execute().
 */
138 private ProcessTerminator processTerminator;
/**
 * The types reported by a default Tika parser's getSupportedTypes(), each
 * rendered as "type/subtype". isSupported() tests the detected MIME type of
 * the content against this list with contains().
 */
// NOTE(review): as shown, .map(...) is applied directly to the result of
// getSupportedTypes(...). A step between the two (presumably .stream()) looks
// to be on a line that is not visible in this excerpt -- confirm.
140 private static final List<String> TIKA_SUPPORTED_TYPES
141 =
new Tika().getParser().getSupportedTypes(
new ParseContext())
143 .map(mt -> mt.getType() +
"/" + mt.getSubtype())
144 .collect(Collectors.toList());
/**
 * Creates an extractor bound to the given content.
 *
 * @param content the content to extract text from; stored as-is, with no
 *                null check in the visible lines
 */
// NOTE(review): the end of the constructor body is not visible in this excerpt.
146 public TikaTextExtractor(Content content) {
147 this.content = content;
/**
 * Reports whether OCR can be used: the Tesseract executable was located
 * (TESSERACT_PATH is non-null), OCR was enabled through the extraction
 * settings, and the platform is 64-bit Windows.
 *
 * @return true only when all of the above conditions hold
 */
// NOTE(review): "== true" is redundant; isWindowsOS() can be used directly in
// the condition.
157 private boolean ocrEnabled() {
158 return TESSERACT_PATH != null && tesseractOCREnabled
159 && PlatformUtil.isWindowsOS() ==
true && PlatformUtil.is64BitOS();
/**
 * Builds a Reader over the text extracted from the content, followed by a
 * metadata section produced by getMetaDataCharSource().
 *
 * Visible steps: a ParseContext is prepared with the parser and an
 * OfficeParserConfig using the SAX-based PPTX/DOCX extractors. When
 * ocrEnabled() holds and the content is an AbstractFile, image files go to
 * performOCR() and PDF/Tesseract configurations are registered in the context.
 * Creation of the Tika reader is submitted to the executor and awaited with a
 * size-dependent timeout. One character is read and pushed back to check the
 * reader.
 *
 * NOTE(review): several structural lines (branch keywords, the try opener, the
 * condition guarding the "empty reader" throw, closing braces) are not visible
 * in this excerpt. Comments below describe only what the visible statements do.
 *
 * @return a reader over the extracted text concatenated with the metadata text
 * @throws InitReaderException when waiting for the reader times out, when the
 *                             "Tika returned empty reader" check fires, or when
 *                             any other exception occurs during setup
 */
174 public Reader getReader() throws InitReaderException {
175 InputStream stream = null;
177 ParseContext parseContext =
new ParseContext();
// Make the same parser available to Tika for embedded documents.
178 parseContext.set(Parser.class, parser);
180 if (ocrEnabled() && content instanceof AbstractFile) {
181 AbstractFile file = ((AbstractFile) content);
// NOTE(review): getMIMEType() is dereferenced without a null check here,
// while isSupported() explicitly allows for a null MIME type. Also,
// toLowerCase() uses the default locale -- consider Locale.ROOT.
183 if (file.getMIMEType().toLowerCase().startsWith(
"image/")) {
// Images are handed to Tesseract directly.
184 stream = performOCR(file);
// NOTE(review): the branch separator between the OCR call above and the PDF
// configuration below is not visible; presumably this is the else branch.
188 PDFParserConfig pdfConfig =
new PDFParserConfig();
193 pdfConfig.setExtractInlineImages(
true);
195 pdfConfig.setExtractUniqueInlineImagesOnly(
true);
196 parseContext.set(PDFParserConfig.class, pdfConfig);
// Point Tika's Tesseract integration at the folder holding the located
// executable and at the configured language packs.
199 TesseractOCRConfig ocrConfig =
new TesseractOCRConfig();
200 String tesseractFolder = TESSERACT_PATH.getParent();
201 ocrConfig.setTesseractPath(tesseractFolder);
203 ocrConfig.setLanguage(languagePacks);
204 ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
205 parseContext.set(TesseractOCRConfig.class, ocrConfig);
207 stream =
new ReadContentInputStream(content);
// NOTE(review): presumably the non-OCR branch (its "else" is not visible).
210 stream =
new ReadContentInputStream(content);
213 Metadata metadata =
new Metadata();
216 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
217 officeParserConfig.setUseSAXPptxExtractor(
true);
218 officeParserConfig.setUseSAXDocxExtractor(
true);
219 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Reader creation runs on the executor thread so the wait below can time out.
222 Future<Reader> future = executorService.submit(
223 new GetTikaReader(parser, stream, metadata, parseContext));
// Timeout is in seconds and is derived from the content size by getTimeout().
// NOTE(review): no cancellation of the future and no closing of the stream on
// failure is visible in this excerpt -- confirm both are handled.
225 final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
// Read one character ahead, then push it back so the caller still sees it.
227 PushbackReader pushbackReader =
new PushbackReader(tikaReader);
228 int read = pushbackReader.read();
// NOTE(review): the condition guarding this throw is not visible; presumably
// it checks for end-of-stream (read == -1).
230 throw new InitReaderException(
"Unable to extract text: "
231 +
"Tika returned empty reader for " + content);
233 pushbackReader.unread(read);
// Append the metadata text after the parsed text in a single reader.
235 CharSource metaDataCharSource = getMetaDataCharSource(metadata);
236 return CharSource.concat(
new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
237 }
catch (TimeoutException te) {
238 final String msg = NbBundle.getMessage(this.getClass(),
239 "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
240 content.getId(), content.getName());
241 throw new InitReaderException(msg, te);
242 }
// NOTE(review): the body of this catch is not visible; presumably it rethrows
// so that the InitReaderException raised above is not wrapped again by the
// generic handler below.
catch (InitReaderException ex) {
244 }
catch (Exception ex) {
// NOTE(review): "content" is concatenated directly with the id, so the logged
// text has no separator between the word and the number. The remaining
// argument(s) and the end of this log call are on a line that is not visible.
245 tikaLogger.log(Level.WARNING,
"Exception: Unable to Tika parse the "
246 +
"content" + content.getId() +
": " + content.getName(),
248 final String msg = NbBundle.getMessage(this.getClass(),
249 "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
250 content.getId(), content.getName());
251 throw new InitReaderException(msg, ex);
/**
 * Runs the Tesseract executable on the given file and returns a stream over
 * the text file it produces.
 *
 * Visible steps: the content is written to a temp file in the current case's
 * temp directory, Tesseract is started with that file, an output base path,
 * the tessdata directory and the language list, and the resulting ".txt" file
 * is wrapped in a CleanUpStream.
 *
 * NOTE(review): the try opener, the bodies of the two null checks near the end
 * and the closing braces are not visible in this excerpt.
 *
 * @param file the file to run OCR on; its id and name build the temp file names
 * @return a stream over the Tesseract output file
 * @throws InitReaderException when there is no current case or an I/O error
 *                             occurs while preparing or running Tesseract
 */
267 private InputStream performOCR(AbstractFile file)
throws InitReaderException {
268 File inputFile = null;
269 File outputFile = null;
271 String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
// The input copy is named from the escaped "<id><name>" of the file.
274 String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
275 inputFile = Paths.get(tempDirectory, tempFileName).toFile();
// NOTE(review): this writes the field "content", not the parameter "file".
// The visible caller passes content cast to AbstractFile, so both refer to the
// same object at that call site.
276 ContentUtils.writeToFile(content, inputFile);
// Output base name; the file actually read below is this path plus ".txt".
278 String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
279 String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
280 String executeablePath = TESSERACT_PATH.toString();
// NOTE(review): the two path arguments are wrapped in literal double quotes.
// ProcessBuilder already passes each element as a separate argument --
// confirm the extra quoting is required on the target platform.
283 ProcessBuilder process =
new ProcessBuilder();
284 process.command(executeablePath,
285 String.format(
"\"%s\"", inputFile.getAbsolutePath()),
286 String.format(
"\"%s\"", outputFilePath),
287 "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
289 "-l", languagePacks);
// With a terminator configured, the process is run under its control;
// otherwise it is run without one. NOTE(review): the meaning of the
// "1, TimeUnit.SECONDS" arguments is defined by ExecUtil (presumably a polling
// interval -- confirm). No check of the execution result is visible here.
293 if (processTerminator != null) {
294 ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
296 ExecUtil.execute(process);
299 outputFile =
new File(outputFilePath +
".txt");
// NOTE(review): CleanUpStream is declared outside the visible lines;
// presumably it removes the output file when closed -- confirm.
301 return new CleanUpStream(outputFile);
302 }
catch (NoCurrentCaseException | IOException ex) {
// NOTE(review): the body of this check is not visible; presumably it deletes
// the partially produced output file before the exception is rethrown.
303 if (outputFile != null) {
306 throw new InitReaderException(
"Could not successfully run Tesseract", ex);
// NOTE(review): this check follows an unconditional throw, so it presumably
// sits in a finally block whose opener and body (temp input cleanup?) are not
// visible -- confirm.
308 if (inputFile != null) {
326 Metadata metadata, ParseContext parseContext) {
/**
 * Creates the Tika ParsingReader for the stored parser, stream, metadata and
 * parse context. getReader() submits an instance of the enclosing task to the
 * executor, so this runs on the executor's worker thread.
 *
 * @return a new ParsingReader over the stream
 * @throws Exception declared by the signature; the visible body only
 *                   constructs the reader
 */
// NOTE(review): the enclosing class header and the end of this method are not
// visible in this excerpt.
334 public Reader
call() throws Exception {
335 return new ParsingReader(parser, stream, metadata, parseContext);
366 public void close() throws IOException {
/**
 * Looks up the bundled Tesseract executable ("Tesseract-OCR" directory,
 * "tesseract.exe" file) through the NetBeans InstalledFileLocator, using this
 * class's package name as the module identifier and a non-localized lookup.
 *
 * NOTE(review): the bodies of the two checks below and the final return are
 * not visible in this excerpt. ocrEnabled() treats a null TESSERACT_PATH as
 * "unavailable", so both checks presumably return null -- confirm.
 *
 * @return the located executable; presumably null when it is missing or not
 *         executable
 */
383 private static File locateTesseractExecutable() {
// NOTE(review): Paths.get(...).toString() uses the platform separator, whereas
// InstalledFileLocator documents relative paths with forward slashes --
// confirm the lookup behaves as intended on Windows.
388 String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
389 File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(),
false);
390 if (null == exeFile) {
394 if (!exeFile.canExecute()) {
/**
 * Renders the Tika metadata as text: a "METADATA" banner line followed by one
 * "name: value" line per metadata name, sorted by name and separated by
 * newlines.
 *
 * NOTE(review): metadata.get(key) is used for each name; presumably only the
 * first value of a multi-valued entry is included -- confirm against Tika's
 * Metadata.get. The end of the expression and of the method is not visible in
 * this excerpt.
 *
 * @param metadata the metadata collected while parsing
 * @return a CharSource wrapping the rendered text
 */
409 static private CharSource getMetaDataCharSource(Metadata metadata) {
410 return CharSource.wrap(
411 new StringBuilder(
"\n\n------------------------------METADATA------------------------------\n\n")
412 .append(Stream.of(metadata.names()).sorted()
413 .map(key -> key +
": " + metadata.get(key))
414 .collect(Collectors.joining(
"\n"))
/**
 * Decides whether this extractor handles the content, based on the MIME type
 * already detected for it.
 *
 * Visible checks: the content must be an AbstractFile. Its MIME type is then
 * tested for null, for membership in the binary and archive lists, for any
 * "video/" type other than "video/x-flv", and for the SQLite type. The final
 * visible statement returns whether the type is in TIKA_SUPPORTED_TYPES.
 *
 * NOTE(review): the statements executed when the first two conditions match
 * are not visible in this excerpt (presumably "return false"), and the end of
 * the second condition is cut off, so further clauses may exist.
 *
 * @return true when the detected type is one Tika reports as supported and
 *         none of the visible exclusion checks applies
 */
424 public boolean isSupported() {
425 if(!(content instanceof AbstractFile)) {
429 String detectedType = ((AbstractFile)content).getMIMEType();
430 if (detectedType == null
431 || BINARY_MIME_TYPES.contains(detectedType)
432 || ARCHIVE_MIME_TYPES.contains(detectedType)
// Flash video is the one video type that is not excluded by this clause.
433 || (detectedType.startsWith(
"video/") && !detectedType.equals(
"video/x-flv"))
434 || detectedType.equals(SQLITE_MIMETYPE)
439 return TIKA_SUPPORTED_TYPES.contains(detectedType);
/**
 * Joins language pack names with "+" into a single string. The result is
 * stored in the languagePacks field, which is passed to Tesseract's "-l"
 * option in performOCR() and to TesseractOCRConfig.setLanguage() in
 * getReader().
 *
 * @param languagePacks the language pack names to join; not null-checked here
 * @return the names separated by "+"; an empty string for an empty list
 */
// NOTE(review): the end of the method body is not visible in this excerpt.
447 private static String formatLanguagePacks(List<String> languagePacks) {
448 return String.join(
"+", languagePacks);
/**
 * Chooses the reader-creation timeout from the content size. getReader()
 * passes the result to Future.get() with TimeUnit.SECONDS, so the value is in
 * seconds.
 *
 * The visible thresholds split sizes into bands: below 1 MiB, below 10 MiB,
 * below 100 MiB, and (presumably) everything larger.
 *
 * NOTE(review): the return statement of every band, the braces opening each
 * branch and the final branch are not visible in this excerpt, so the actual
 * timeout values cannot be stated from here.
 *
 * @param size the content size in bytes
 * @return the timeout in seconds for that size band
 */
458 private static int getTimeout(
long size) {
// Under 1 MiB.
459 if (size < 1024 * 1024L)
462 }
// Under 10 MiB.
else if (size < 10 * 1024 * 1024L)
465 }
// Under 100 MiB.
else if (size < 100 * 1024 * 1024L)
/**
 * Applies optional extraction settings found in the given lookup.
 *
 * Visible behavior: when an ImageConfig is present, its OCR-enabled flag and
 * its OCR language list are applied, each only when its getter returns a
 * non-null value. When a ProcessTerminator is present, it is stored for use by
 * performOCR(). A null context leaves the current settings unchanged.
 *
 * NOTE(review): closing braces are not visible in this excerpt, so the exact
 * nesting of the ProcessTerminator lookup relative to the ImageConfig check
 * cannot be confirmed from here.
 *
 * @param context lookup that may hold an ImageConfig and/or a
 *                ProcessTerminator; may be null
 */
484 public void setExtractionSettings(Lookup context) {
485 if (context != null) {
486 ImageConfig configInstance = context.lookup(ImageConfig.class);
487 if (configInstance != null) {
// Each getter is called twice: once for the null check, once for the value.
488 if(Objects.nonNull(configInstance.getOCREnabled())) {
489 this.tesseractOCREnabled = configInstance.getOCREnabled();
// Languages are stored already joined with "+".
492 if(Objects.nonNull(configInstance.getOCRLanguages())) {
493 this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
497 ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
498 if (terminatorInstance != null) {
499 this.processTerminator = terminatorInstance;