19 package org.sleuthkit.autopsy.textextractors;
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.io.CharSource;
23 import com.google.common.util.concurrent.ThreadFactoryBuilder;
25 import java.io.FileInputStream;
26 import java.io.FileNotFoundException;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.PushbackReader;
30 import java.io.Reader;
31 import java.nio.file.Paths;
32 import java.util.HashMap;
33 import java.util.List;
34 import java.util.Objects;
36 import java.util.concurrent.Callable;
37 import java.util.concurrent.ExecutorService;
38 import java.util.concurrent.Executors;
39 import java.util.concurrent.Future;
40 import java.util.concurrent.ThreadFactory;
41 import java.util.concurrent.TimeUnit;
42 import java.util.concurrent.TimeoutException;
43 import java.util.logging.Level;
44 import java.util.stream.Collectors;
45 import org.apache.tika.Tika;
46 import org.apache.tika.exception.TikaException;
47 import org.apache.tika.metadata.Metadata;
48 import org.apache.tika.parser.AutoDetectParser;
49 import org.apache.tika.parser.ParseContext;
50 import org.apache.tika.parser.Parser;
51 import org.apache.tika.parser.ParsingReader;
52 import org.apache.tika.parser.microsoft.OfficeParserConfig;
53 import org.apache.tika.parser.ocr.TesseractOCRConfig;
54 import org.apache.tika.parser.pdf.PDFParserConfig;
55 import org.openide.util.NbBundle;
56 import org.openide.modules.InstalledFileLocator;
57 import org.openide.util.Lookup;
70 import org.xml.sax.ContentHandler;
71 import org.xml.sax.SAXException;
72 import org.xml.sax.helpers.DefaultHandler;
73 import com.google.common.collect.ImmutableMap;
74 import java.io.InputStreamReader;
75 import java.nio.charset.Charset;
76 import java.util.ArrayList;
77 import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;
85 final class TikaTextExtractor
implements TextExtractor {
// MIME types of generic binary content. isSupported() tests the detected
// MIME type of the content against this list.
89 private static final List<String> BINARY_MIME_TYPES
92 "application/octet-stream",
93 "application/x-msdownload");
// MIME types of archive / compressed containers. isSupported() tests the
// detected MIME type of the content against this list.
// NOTE(review): presumably these are excluded because archives are unpacked
// elsewhere - confirm against the callers.
99 private static final List<String> ARCHIVE_MIME_TYPES
102 "application/x-7z-compressed",
103 "application/x-ace-compressed",
104 "application/x-alz-compressed",
106 "application/vnd.ms-cab-compressed",
107 "application/x-cfs-compressed",
108 "application/x-dgc-compressed",
109 "application/x-apple-diskimage",
110 "application/x-gca-compressed",
114 "application/x-rar-compressed",
115 "application/x-stuffit",
116 "application/x-stuffitx",
117 "application/x-gtar",
118 "application/x-archive",
119 "application/x-executable",
120 "application/x-gzip",
123 "application/x-cpio",
124 "application/x-shar",
126 "application/x-bzip",
127 "application/x-bzip2",
128 "application/x-lzip",
129 "application/x-lzma",
130 "application/x-lzop",
132 "application/x-compress");
// java.util.logging logger named "Tika"; getReader() and getMetadata() write
// their detailed failure messages here.
135 private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger(
"Tika");
// Class logger; receives the short warnings that refer the reader to the
// Tika log.
136 private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
// 100 * 1024 bytes: the file-size threshold compared in useOcrOnFile().
137 private static final int LIMITED_OCR_SIZE_MIN = 100 * 1024;
// Thread factory that names its threads "tika-reader-%d".
138 private final ThreadFactory tikaThreadFactory
139 =
new ThreadFactoryBuilder().setNameFormat(
"tika-reader-%d").build();
// Single-thread executor to which getReader() submits the GetTikaReader task.
140 private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
// MIME type compared against the detected type in isSupported().
141 private static final String SQLITE_MIMETYPE =
"application/x-sqlite3";
// Tika parser used by getReader() for text and by getMetadata() for metadata.
143 private final AutoDetectParser parser =
new AutoDetectParser();
// The content this extractor was constructed for.
144 private final Content content;
// OCR switches; both are assigned from an ImageConfig in
// setExtractionSettings().
146 private boolean tesseractOCREnabled;
147 private boolean limitedOCREnabled;
// Directory and file name combined by locateTesseractExecutable() into the
// relative path that is looked up.
148 private static final String TESSERACT_DIR_NAME =
"Tesseract-OCR";
149 private static final String TESSERACT_EXECUTABLE =
"tesseract.exe";
// Result of locateTesseractExecutable(); ocrEnabled() requires it to be
// non-null.
150 private static final File TESSERACT_PATH = locateTesseractExecutable();
// Language packs joined with "+" (see formatLanguagePacks). Initialised from
// PlatformUtil and possibly replaced in setExtractionSettings().
151 private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
// Appended to the file id to build the temporary OCR output name in
// performOCR().
152 private static final String TESSERACT_OUTPUT_FILE_NAME =
"tess_output";
// Metadata name/value pairs; filled by getReader() or getMetadata() and
// reused by getMetadata() once non-null.
153 private Map<String, String> metadataMap;
// Optional terminator passed to ExecUtil.execute() in performOCR(); assigned
// in setExtractionSettings().
155 private ProcessTerminator processTerminator;
// "type/subtype" strings built from the media types that a default Tika
// parser reports as supported; consulted at the end of isSupported().
157 private static final List<String> TIKA_SUPPORTED_TYPES
158 =
new Tika().getParser().getSupportedTypes(
new ParseContext())
160 .map(mt -> mt.getType() +
"/" + mt.getSubtype())
161 .collect(Collectors.toList());
/**
 * Creates an extractor bound to a single piece of content.
 *
 * @param content The content that the other methods of this extractor read
 *                from. It is stored as given, without validation.
 */
TikaTextExtractor(Content content) {
    this.content = content;
}
174 private boolean ocrEnabled() {
175 return TESSERACT_PATH != null && tesseractOCREnabled
176 && PlatformUtil.isWindowsOS() ==
true && PlatformUtil.is64BitOS();
/**
 * Returns a reader over the text extracted from the content.
 *
 * Image files are sent directly to Tesseract through performOCR() when OCR
 * is enabled and useOcrOnFile() allows it. All other content is parsed by
 * Tika on the executor thread, and the wait for the reader is bounded by
 * getTimeout(content.getSize()) seconds.
 *
 * As a side effect, metadataMap is filled from the Tika metadata if it has
 * not been set yet.
 *
 * @return A reader of the extracted text.
 *
 * @throws InitReaderException If the content is not supported, Tika returns
 *                             an empty reader, the wait times out, or any
 *                             other exception occurs while creating the
 *                             reader.
 */
191 public Reader getReader() throws InitReaderException {
// Reject unsupported content before doing any work.
192 if (!this.isSupported()) {
193 throw new InitReaderException(
"Content is not supported");
// The cast relies on the isSupported() check above, which tests
// content instanceof AbstractFile.
197 final AbstractFile file = ((AbstractFile) content);
199 final String mimeType = file.getMIMEType();
// Images bypass Tika: run Tesseract directly and decode its output as UTF-8.
203 if (ocrEnabled() && mimeType.toLowerCase().startsWith(
"image/") && useOcrOnFile(file)) {
204 InputStream imageOcrStream = performOCR(file);
205 return new InputStreamReader(imageOcrStream, Charset.forName(
"UTF-8"));
209 final InputStream stream =
new ReadContentInputStream(content);
210 final ParseContext parseContext =
new ParseContext();
// Register the parser itself in the context.
// NOTE(review): presumably so Tika can parse embedded documents with the
// same parser - confirm against the Tika documentation.
215 parseContext.set(Parser.class, parser);
// Turn on the SAX-based extractors for PPTX and DOCX.
219 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
220 officeParserConfig.setUseSAXPptxExtractor(
true);
221 officeParserConfig.setUseSAXDocxExtractor(
true);
222 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Configure Tika's own Tesseract integration for non-image content.
224 if (ocrEnabled() && useOcrOnFile(file)) {
227 TesseractOCRConfig ocrConfig =
new TesseractOCRConfig();
// Tika is given the folder containing the executable, not the executable.
228 String tesseractFolder = TESSERACT_PATH.getParent();
229 ocrConfig.setTesseractPath(tesseractFolder);
230 ocrConfig.setLanguage(languagePacks);
231 ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
232 parseContext.set(TesseractOCRConfig.class, ocrConfig);
// PDF parsing uses the AUTO OCR strategy.
235 PDFParserConfig pdfConfig =
new PDFParserConfig();
243 pdfConfig.setOcrStrategy(OCR_STRATEGY.AUTO);
244 parseContext.set(PDFParserConfig.class, pdfConfig);
247 Metadata metadata =
new Metadata();
// Create the Tika reader on the executor so the wait below can time out.
249 Future<Reader> future = executorService.submit(
250 new GetTikaReader(parser, stream, metadata, parseContext));
252 final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
// Read one character to probe the reader; it is pushed back further down so
// the caller still sees the full text.
254 PushbackReader pushbackReader =
new PushbackReader(tikaReader);
255 int read = pushbackReader.read();
257 throw new InitReaderException(
"Unable to extract text: "
258 +
"Tika returned empty reader for " + content);
260 pushbackReader.unread(read);
// Save the metadata gathered during parsing, unless it is already set.
263 if (metadataMap == null) {
264 metadataMap =
new HashMap<>();
265 for (String mtdtKey : metadata.names()) {
266 metadataMap.put(mtdtKey, metadata.get(mtdtKey));
270 return new ReaderCharSource(pushbackReader).openStream();
271 }
catch (TimeoutException te) {
// Timeouts are reported with a dedicated localized message.
272 final String msg = NbBundle.getMessage(this.getClass(),
273 "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
274 content.getId(), content.getName());
275 throw new InitReaderException(msg, te);
276 }
catch (InitReaderException ex) {
278 }
catch (Exception ex) {
// Any other failure: short warning in the Autopsy log, details in the Tika
// log, then wrap the cause in an InitReaderException.
279 AUTOPSY_LOGGER.log(Level.WARNING, String.format(
"Error with file [id=%d] %s, see Tika log for details...",
280 content.getId(), content.getName()));
281 TIKA_LOGGER.log(Level.WARNING,
"Exception: Unable to Tika parse the "
282 +
"content" + content.getId() +
": " + content.getName(),
284 final String msg = NbBundle.getMessage(this.getClass(),
285 "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
286 content.getId(), content.getName());
287 throw new InitReaderException(msg, ex);
/**
 * Runs the Tesseract executable on the given file and returns a stream over
 * the text file it produced.
 *
 * The content is first written to a temporary file in the current case's
 * temp directory. Tesseract is then invoked with that file, an output base
 * path, the language-pack directory and the configured languages. The result
 * is read from the output base path with ".txt" appended.
 *
 * @param file The file to run OCR on; its id and name are used to build the
 *             temporary file names.
 *
 * @return A CleanUpStream over the Tesseract output file.
 *
 * @throws InitReaderException If there is no current case or an I/O error
 *                             occurs while preparing or running Tesseract.
 */
303 private InputStream performOCR(AbstractFile file)
throws InitReaderException {
304 File inputFile = null;
305 File outputFile = null;
307 String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
// Write the content to disk so the external process can read it.
310 String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
311 inputFile = Paths.get(tempDirectory, tempFileName).toFile();
312 ContentUtils.writeToFile(content, inputFile);
314 String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
315 String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
316 String executeablePath = TESSERACT_PATH.toString();
// Build the Tesseract command line; both paths are wrapped in double quotes.
319 ProcessBuilder process =
new ProcessBuilder();
320 process.command(executeablePath,
321 String.format(
"\"%s\"", inputFile.getAbsolutePath()),
322 String.format(
"\"%s\"", outputFilePath),
323 "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
325 "-l", languagePacks);
// Use the terminator-aware overload only when a terminator was configured.
// NOTE(review): the "1, TimeUnit.SECONDS" arguments are presumably the
// interval at which the terminator is consulted - confirm against ExecUtil.
329 if (processTerminator != null) {
330 ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
332 ExecUtil.execute(process);
// The output is expected at the output base path plus ".txt".
335 outputFile =
new File(outputFilePath +
".txt");
337 return new CleanUpStream(outputFile);
338 }
catch (NoCurrentCaseException | IOException ex) {
339 if (outputFile != null) {
342 throw new InitReaderException(
"Could not successfully run Tesseract", ex);
344 if (inputFile != null) {
362 private boolean useOcrOnFile(AbstractFile file) {
363 return !limitedOCREnabled || file.getSize() > LIMITED_OCR_SIZE_MIN || file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.DERIVED;
378 Metadata metadata, ParseContext parseContext) {
386 public Reader
call() throws Exception {
387 return new ParsingReader(parser, stream, metadata, parseContext);
418 public void close() throws IOException {
/**
 * Looks up the Tesseract executable among the installed module files.
 *
 * The relative path TESSERACT_DIR_NAME/TESSERACT_EXECUTABLE is resolved with
 * InstalledFileLocator, using this class's package name as the owning
 * module. The located file is then checked for null and for being
 * executable.
 *
 * NOTE(review): the bodies of the two checks are not visible here;
 * presumably the method yields null when the file is missing or not
 * executable, because ocrEnabled() tests TESSERACT_PATH against null.
 * Confirm before relying on it.
 *
 * @return The located executable.
 */
435 private static File locateTesseractExecutable() {
440 String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
441 File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(),
false);
442 if (null == exeFile) {
446 if (!exeFile.canExecute()) {
/**
 * Returns the metadata Tika reports for the content.
 *
 * If metadataMap is already set (by an earlier call or by getReader()), an
 * immutable copy of it is returned. Otherwise the content is parsed with a
 * handler that discards all text, and every metadata name is stored in
 * metadataMap with the value from Metadata.get(name).
 *
 * Parse failures (IOException, SAXException, TikaException) are logged as
 * warnings and are not rethrown by the visible code.
 *
 * @return The metadata as name/value pairs.
 */
459 public Map<String, String> getMetadata() {
// Reuse previously collected metadata; hand out a copy so callers cannot
// modify the stored map.
460 if (metadataMap != null) {
461 return ImmutableMap.copyOf(metadataMap);
465 metadataMap =
new HashMap<>();
// NOTE(review): this stream is not closed anywhere in the visible code -
// consider try-with-resources.
466 InputStream stream =
new ReadContentInputStream(content);
// DefaultHandler ignores all content events; only the metadata is wanted.
467 ContentHandler doNothingContentHandler =
new DefaultHandler();
468 Metadata mtdt =
new Metadata();
469 parser.parse(stream, doNothingContentHandler, mtdt);
470 for (String mtdtKey : mtdt.names()) {
471 metadataMap.put(mtdtKey, mtdt.get(mtdtKey));
473 }
catch (IOException | SAXException | TikaException ex) {
// Short warning in the Autopsy log, full exception in the Tika log.
474 AUTOPSY_LOGGER.log(Level.WARNING, String.format(
"Error getting metadata for file [id=%d] %s, see Tika log for details...",
475 content.getId(), content.getName()));
476 TIKA_LOGGER.log(Level.WARNING,
"Exception: Unable to get metadata for "
477 +
"content" + content.getId() +
": " + content.getName(), ex);
/**
 * Determines whether this extractor can handle the content.
 *
 * The content is first tested for being an AbstractFile. Its detected MIME
 * type is then tested for being null, a binary type, an archive type, a
 * video type other than "video/x-flv", or the SQLite type. The final answer
 * is whether the type is among those Tika reports as supported.
 *
 * NOTE(review): the bodies of the two if-statements are not visible here;
 * presumably each returns false - confirm.
 *
 * @return true if the detected MIME type is in TIKA_SUPPORTED_TYPES and
 *         none of the earlier checks rejected the content.
 */
489 public boolean isSupported() {
490 if (!(content instanceof AbstractFile)) {
494 String detectedType = ((AbstractFile) content).getMIMEType();
// The null test comes first so the later calls on detectedType are safe.
495 if (detectedType == null
496 || BINARY_MIME_TYPES.contains(detectedType)
497 || ARCHIVE_MIME_TYPES.contains(detectedType)
498 || (detectedType.startsWith(
"video/") && !detectedType.equals(
"video/x-flv"))
499 || detectedType.equals(SQLITE_MIMETYPE)
504 return TIKA_SUPPORTED_TYPES.contains(detectedType);
512 private static String formatLanguagePacks(List<String> languagePacks) {
513 return String.join(
"+", languagePacks);
/**
 * Chooses a timeout from the size of the content.
 *
 * The size is compared against three ascending thresholds: 1 MiB, 10 MiB
 * and 100 MiB. getReader() passes the result to Future.get() with
 * TimeUnit.SECONDS, so the value is a number of seconds.
 *
 * NOTE(review): the value returned for each size bucket is not visible here.
 *
 * @param size The content size in bytes.
 *
 * @return The timeout in seconds.
 */
523 private static int getTimeout(
long size) {
524 if (size < 1024 * 1024L)
527 }
else if (size < 10 * 1024 * 1024L)
530 }
else if (size < 100 * 1024 * 1024L)
/**
 * Applies extraction settings found in the given lookup.
 *
 * A null context is ignored. If the lookup holds an ImageConfig, the OCR
 * flags are copied from it, the language packs are replaced when the config
 * supplies a non-null list, and the config's OCR timeout terminator is
 * collected. If the lookup holds a ProcessTerminator, that is collected too.
 * When at least one terminator was collected, they are combined into a
 * HybridTerminator and stored for use by the external OCR process.
 *
 * @param context The lookup holding the settings objects; may be null.
 */
549 public void setExtractionSettings(Lookup context) {
550 if (context != null) {
551 List<ProcessTerminator> terminators =
new ArrayList<>();
552 ImageConfig configInstance = context.lookup(ImageConfig.class);
553 if (configInstance != null) {
554 this.tesseractOCREnabled = configInstance.getOCREnabled();
555 this.limitedOCREnabled = configInstance.getLimitedOCREnabled();
// Keep the current language packs when the config does not specify any.
557 if (Objects.nonNull(configInstance.getOCRLanguages())) {
558 this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
// NOTE(review): the returned terminator is added without a null check -
// confirm getOCRTimeoutTerminator() never returns null.
561 terminators.add(configInstance.getOCRTimeoutTerminator());
564 ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
565 if (terminatorInstance != null) {
566 terminators.add(terminatorInstance);
// processTerminator is only assigned when there is something to combine.
569 if (!terminators.isEmpty()) {
570 this.processTerminator =
new HybridTerminator(terminators);