19 package org.sleuthkit.autopsy.textextractors;
21 import com.google.common.io.CharSource;
22 import com.google.common.util.concurrent.ThreadFactoryBuilder;
24 import java.io.FileInputStream;
25 import java.io.FileNotFoundException;
26 import java.io.IOException;
27 import java.io.InputStream;
28 import java.io.PushbackReader;
29 import java.io.Reader;
30 import java.nio.file.Paths;
31 import java.util.HashMap;
32 import java.util.List;
33 import java.util.Objects;
35 import java.util.concurrent.Callable;
36 import java.util.concurrent.ExecutorService;
37 import java.util.concurrent.Executors;
38 import java.util.concurrent.Future;
39 import java.util.concurrent.ThreadFactory;
40 import java.util.concurrent.TimeUnit;
41 import java.util.concurrent.TimeoutException;
42 import java.util.logging.Level;
43 import java.util.stream.Collectors;
44 import org.apache.tika.Tika;
45 import org.apache.tika.exception.TikaException;
46 import org.apache.tika.metadata.Metadata;
47 import org.apache.tika.parser.AutoDetectParser;
48 import org.apache.tika.parser.ParseContext;
49 import org.apache.tika.parser.Parser;
50 import org.apache.tika.parser.ParsingReader;
51 import org.apache.tika.parser.microsoft.OfficeParserConfig;
52 import org.apache.tika.parser.ocr.TesseractOCRConfig;
53 import org.apache.tika.parser.pdf.PDFParserConfig;
54 import org.openide.util.NbBundle;
55 import org.openide.modules.InstalledFileLocator;
56 import org.openide.util.Lookup;
69 import org.xml.sax.ContentHandler;
70 import org.xml.sax.SAXException;
71 import org.xml.sax.helpers.DefaultHandler;
72 import com.google.common.collect.ImmutableMap;
73 import com.google.common.collect.ImmutableSet;
74 import java.io.InputStreamReader;
75 import java.nio.charset.Charset;
76 import java.util.ArrayList;
78 import org.apache.tika.mime.MimeTypes;
79 import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;
87 final class TikaTextExtractor
implements TextExtractor {
// MIME types of unstructured binary content. isSupported() consults this set
// when deciding whether Tika extraction applies to the content.
91 private static final Set<String> BINARY_MIME_TYPES
94 "application/octet-stream",
95 "application/x-msdownload");
// MIME types of archive/compressed (and similar container) content.
// isSupported() consults this set alongside BINARY_MIME_TYPES.
101 private static final Set<String> ARCHIVE_MIME_TYPES
104 "application/x-7z-compressed",
105 "application/x-ace-compressed",
106 "application/x-alz-compressed",
108 "application/vnd.ms-cab-compressed",
109 "application/x-cfs-compressed",
110 "application/x-dgc-compressed",
111 "application/x-apple-diskimage",
112 "application/x-gca-compressed",
116 "application/x-rar-compressed",
117 "application/x-stuffit",
118 "application/x-stuffitx",
119 "application/x-gtar",
120 "application/x-archive",
121 "application/x-executable",
122 "application/x-gzip",
125 "application/x-cpio",
126 "application/x-shar",
128 "application/x-bzip",
129 "application/x-bzip2",
130 "application/x-lzip",
131 "application/x-lzma",
132 "application/x-lzop",
134 "application/x-compress");
// java.util.logging logger named "Tika"; receives the detailed exception
// records that the AUTOPSY_LOGGER warnings refer to as the "Tika log".
137 private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger(
"Tika");
// Logger keyed by this class name; used for the short per-file warnings.
138 private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
// Names the worker threads "tika-reader-<n>".
140 private final ThreadFactory tikaThreadFactory
141 =
new ThreadFactoryBuilder().setNameFormat(
"tika-reader-%d").build();
// Single-threaded executor on which getReader() submits its GetTikaReader task.
// NOTE(review): no shutdown of this per-instance executor is visible here — confirm it is released elsewhere.
142 private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
// MIME type compared against the detected type in isSupported().
143 private static final String SQLITE_MIMETYPE =
"application/x-sqlite3";
// Parser shared by getReader() and getMetadata().
145 private final AutoDetectParser parser =
new AutoDetectParser();
// May be null: the constructor leaves it null when the detector cannot be instantiated.
146 private final FileTypeDetector fileTypeDetector;
// The content this extractor was created for.
147 private final Content content;
// Set from ImageConfig.getOCREnabled() in setExtractionSettings(); false until then.
149 private boolean tesseractOCREnabled;
// Directory and file name joined by locateTesseractExecutable() to find the executable.
150 private static final String TESSERACT_DIR_NAME =
"Tesseract-OCR";
151 private static final String TESSERACT_EXECUTABLE =
"tesseract.exe";
// Result of locateTesseractExecutable(); isOcrSupported() requires it to be non-null.
152 private static final File TESSERACT_PATH = locateTesseractExecutable();
// "+"-joined language pack names; replaced in setExtractionSettings() when the
// ImageConfig supplies OCR languages.
153 private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
// Suffix used by performOCR() to build the base name of the Tesseract output file.
154 private static final String TESSERACT_OUTPUT_FILE_NAME =
"tess_output";
// Document MIME types for which willUseOCR() reports that OCR will be used.
157 private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
159 "application/msword",
160 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
161 "application/vnd.ms-powerpoint",
162 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
163 "application/vnd.ms-excel",
164 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
// Prefix identifying image MIME types; matched against the lower-cased type from getMimeType().
167 private static final String IMAGE_MIME_TYPE_PREFIX =
"image/";
// Metadata cache; filled by getReader() or getMetadata(), whichever runs first.
169 private Map<String, String> metadataMap;
// Optional terminator passed to ExecUtil by performOCR(); assigned in setExtractionSettings().
171 private ProcessTerminator processTerminator;
// "type/subtype" strings for the media types reported by the default Tika
// parser's getSupportedTypes(); isSupported() checks the detected MIME type
// against this list.
173 private static final List<String> TIKA_SUPPORTED_TYPES
174 =
new Tika().getParser().getSupportedTypes(
new ParseContext())
176 .map(mt -> mt.getType() +
"/" + mt.getSubtype())
177 .collect(Collectors.toList());
/**
 * Creates an extractor for the given content.
 *
 * A FileTypeDetector is created eagerly; if that fails the failure is logged
 * to the Tika logger and the extractor continues with a null detector.
 *
 * @param content the content text will be extracted from
 */
TikaTextExtractor(Content content) {
    FileTypeDetector typeDetector;
    try {
        typeDetector = new FileTypeDetector();
    } catch (FileTypeDetector.FileTypeDetectorInitException initFailure) {
        TIKA_LOGGER.log(Level.SEVERE, "Unable to instantiate a file type detector", initFailure);
        // Fall back to "no detector"; getMimeType() handles the null case.
        typeDetector = null;
    }

    this.content = content;
    this.fileTypeDetector = typeDetector;
}
201 private String getMimeType(AbstractFile file) {
202 String mimeType = MimeTypes.OCTET_STREAM;
203 if (fileTypeDetector != null) {
204 mimeType = fileTypeDetector.getMIMEType(file);
205 }
else if (file.getMIMEType() != null) {
206 mimeType = file.getMIMEType();
209 return mimeType.trim().toLowerCase();
213 public boolean willUseOCR() {
214 if (!isOcrSupported() || (!(content instanceof AbstractFile))) {
218 String mimeType = getMimeType((AbstractFile) content);
220 return mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX) || OCR_DOCUMENTS.contains(mimeType);
/**
 * Tells whether OCR through Tesseract can be used by this extractor.
 *
 * The conditions shown require that the Tesseract executable was located,
 * that OCR was enabled through setExtractionSettings(), and that the platform
 * is 64-bit Windows.
 *
 * NOTE(review): the boolean expression is not terminated within the lines
 * shown here — confirm whether further conditions follow.
 *
 * @return true if every condition holds
 */
228 private boolean isOcrSupported() {
232 return TESSERACT_PATH != null
233 && tesseractOCREnabled
234 && PlatformUtil.isWindowsOS()
235 && PlatformUtil.is64BitOS()
/**
 * Returns a reader over the text extracted from the content.
 *
 * When OCR is supported and the content is an image, the text comes from
 * performOCR(). Otherwise Tika parses the content on the executor thread and
 * the wait for its reader is bounded by getTimeout().
 *
 * @return a reader over the extracted text
 *
 * @throws InitReaderException if the content is not supported, the Tika
 *                             reader is reported empty, Tika times out, or
 *                             any other error occurs while parsing
 */
251 public Reader getReader() throws InitReaderException {
252 if (!this.isSupported()) {
253 throw new InitReaderException(
"Content is not supported");
257 final AbstractFile file = ((AbstractFile) content);
259 String mimeType = getMimeType(file);
// Images bypass Tika: Tesseract is run directly and its output is read back as UTF-8.
263 if (isOcrSupported() && mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
264 InputStream imageOcrStream = performOCR(file);
265 return new InputStreamReader(imageOcrStream, Charset.forName(
"UTF-8"));
269 final InputStream stream =
new ReadContentInputStream(content);
271 final ParseContext parseContext =
new ParseContext();
// Register the parser in the context (per Tika's ParseContext documentation
// this is what lets embedded documents be parsed with the same parser).
275 parseContext.set(Parser.class, parser);
// Switch the pptx and docx handling to the SAX-based extractors.
278 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
279 officeParserConfig.setUseSAXPptxExtractor(
true);
280 officeParserConfig.setUseSAXDocxExtractor(
true);
281 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Point Tika's Tesseract integration at the folder containing the located
// executable and at the configured language packs.
282 if (isOcrSupported()) {
285 TesseractOCRConfig ocrConfig =
new TesseractOCRConfig();
286 String tesseractFolder = TESSERACT_PATH.getParent();
287 ocrConfig.setTesseractPath(tesseractFolder);
288 ocrConfig.setLanguage(languagePacks);
289 ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
290 parseContext.set(TesseractOCRConfig.class, ocrConfig);
// PDF OCR behaviour is delegated to Tika's AUTO strategy (see PDFParserConfig.OCR_STRATEGY).
293 PDFParserConfig pdfConfig =
new PDFParserConfig();
301 pdfConfig.setOcrStrategy(OCR_STRATEGY.AUTO);
302 parseContext.set(PDFParserConfig.class, pdfConfig);
305 Metadata metadata =
new Metadata();
// Build the reader on the dedicated Tika thread so the wait can be bounded
// by a size-dependent timeout.
// NOTE(review): no cancellation of this future is visible on timeout; with a
// single-threaded executor a stuck task would presumably delay later
// submissions — confirm.
307 Future<Reader> future = executorService.submit(
308 new GetTikaReader(parser, stream, metadata, parseContext));
310 final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
// Read one character to probe the reader, then push it back so the caller
// still sees the complete text.
312 PushbackReader pushbackReader =
new PushbackReader(tikaReader);
313 int read = pushbackReader.read();
315 throw new InitReaderException(
"Unable to extract text: "
316 +
"Tika returned empty reader for " + content);
318 pushbackReader.unread(read);
// Cache the metadata Tika collected during parsing unless a cache already exists.
321 if (metadataMap == null) {
322 metadataMap =
new HashMap<>();
323 for (String mtdtKey : metadata.names()) {
324 metadataMap.put(mtdtKey, metadata.get(mtdtKey));
328 return new ReaderCharSource(pushbackReader).openStream();
329 }
catch (TimeoutException te) {
330 final String msg = NbBundle.getMessage(this.getClass(),
331 "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
332 content.getId(), content.getName());
333 throw new InitReaderException(msg, te);
334 }
catch (InitReaderException ex) {
336 }
catch (Exception ex) {
// Short warning in the Autopsy log; the full details go to the Tika log.
337 AUTOPSY_LOGGER.log(Level.WARNING, String.format(
"Error with file [id=%d] %s, see Tika log for details...",
338 content.getId(), content.getName()));
339 TIKA_LOGGER.log(Level.WARNING,
"Exception: Unable to Tika parse the "
340 +
"content" + content.getId() +
": " + content.getName(),
342 final String msg = NbBundle.getMessage(this.getClass(),
343 "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
344 content.getId(), content.getName());
345 throw new InitReaderException(msg, ex);
/**
 * Runs the Tesseract executable on the content and returns a stream over the
 * text file it produces.
 *
 * The content is written to a file in the current case's temp directory,
 * Tesseract is invoked with that file, an output base path, the language pack
 * directory and the configured languages, and the resulting ".txt" file is
 * wrapped in a CleanUpStream.
 *
 * @param file the file being processed; supplies the id and name used for
 *             the temp file names
 *
 * @return a stream over the Tesseract output file
 *
 * @throws InitReaderException if there is no current case or an I/O error
 *                             occurs
 */
361 private InputStream performOCR(AbstractFile file)
throws InitReaderException {
362 File inputFile = null;
363 File outputFile = null;
365 String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
// Temp names start with the file id, presumably to keep them distinct per file.
368 String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
369 inputFile = Paths.get(tempDirectory, tempFileName).toFile();
// Writes the extractor's content field; getReader() passes that same object as `file`.
370 ContentUtils.writeToFile(content, inputFile);
372 String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
373 String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
374 String executeablePath = TESSERACT_PATH.toString();
// NOTE(review): both paths are wrapped in literal double quotes before being
// handed to ProcessBuilder — presumably for Windows argument handling; confirm.
377 ProcessBuilder process =
new ProcessBuilder();
378 process.command(executeablePath,
379 String.format(
"\"%s\"", inputFile.getAbsolutePath()),
380 String.format(
"\"%s\"", outputFilePath),
381 "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
383 "-l", languagePacks);
// A configured terminator is passed to ExecUtil so the process can be stopped
// (the 1-second argument is presumably the polling interval — confirm).
387 if (processTerminator != null) {
388 ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
390 ExecUtil.execute(process);
// The output is looked up at the base path given to Tesseract plus ".txt".
393 outputFile =
new File(outputFilePath +
".txt");
395 return new CleanUpStream(outputFile);
396 }
catch (NoCurrentCaseException | IOException ex) {
397 if (outputFile != null) {
400 throw new InitReaderException(
"Could not successfully run Tesseract", ex);
// NOTE(review): presumably temp-file cleanup; the statements guarded by these
// null checks are not shown here — confirm.
402 if (inputFile != null) {
420 Metadata metadata, ParseContext parseContext) {
428 public Reader
call() throws Exception {
429 return new ParsingReader(parser, stream, metadata, parseContext);
460 public void close() throws IOException {
/**
 * Looks up the Tesseract executable through the NetBeans InstalledFileLocator,
 * using the relative path TESSERACT_DIR_NAME/TESSERACT_EXECUTABLE and this
 * class's package name as the code name base.
 *
 * NOTE(review): the outcome of the null and canExecute() checks is not shown
 * here; callers treat a null TESSERACT_PATH as "OCR unavailable", so both
 * presumably return null — confirm.
 *
 * @return the located executable
 */
477 private static File locateTesseractExecutable() {
482 String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
483 File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(),
false);
484 if (null == exeFile) {
488 if (!exeFile.canExecute()) {
501 public Map<String, String> getMetadata() {
502 if (metadataMap != null) {
503 return ImmutableMap.copyOf(metadataMap);
507 metadataMap =
new HashMap<>();
508 InputStream stream =
new ReadContentInputStream(content);
509 ContentHandler doNothingContentHandler =
new DefaultHandler();
510 Metadata mtdt =
new Metadata();
511 parser.parse(stream, doNothingContentHandler, mtdt);
512 for (String mtdtKey : mtdt.names()) {
513 metadataMap.put(mtdtKey, mtdt.get(mtdtKey));
515 }
catch (IOException | SAXException | TikaException ex) {
516 AUTOPSY_LOGGER.log(Level.WARNING, String.format(
"Error getting metadata for file [id=%d] %s, see Tika log for details...",
517 content.getId(), content.getName()));
518 TIKA_LOGGER.log(Level.WARNING,
"Exception: Unable to get metadata for "
519 +
"content" + content.getId() +
": " + content.getName(), ex);
/**
 * Determines whether Tika extraction applies to the content.
 *
 * The MIME type stored on the file is checked for being absent, for
 * membership in BINARY_MIME_TYPES or ARCHIVE_MIME_TYPES, for being a video
 * type other than "video/x-flv", and for being the SQLite type. Past that
 * check the result is whether TIKA_SUPPORTED_TYPES contains the type.
 *
 * NOTE(review): the bodies of both guards and the end of the compound
 * condition are not shown here; they presumably return false — confirm.
 *
 * @return true if the content can be handled by this extractor
 */
531 public boolean isSupported() {
532 if (!(content instanceof AbstractFile)) {
// Uses the MIME type already recorded on the file, not the normalized getMimeType() value.
536 String detectedType = ((AbstractFile) content).getMIMEType();
537 if (detectedType == null
538 || BINARY_MIME_TYPES.contains(detectedType)
539 || ARCHIVE_MIME_TYPES.contains(detectedType)
540 || (detectedType.startsWith(
"video/") && !detectedType.equals(
"video/x-flv"))
541 || detectedType.equals(SQLITE_MIMETYPE)
546 return TIKA_SUPPORTED_TYPES.contains(detectedType);
554 private static String formatLanguagePacks(List<String> languagePacks) {
555 return String.join(
"+", languagePacks);
/**
 * Picks a timeout based on the content size, using thresholds of 1 MB,
 * 10 MB and 100 MB. getReader() interprets the result as seconds.
 *
 * NOTE(review): the value returned by each branch is not shown here —
 * confirm the actual timeouts against the full source.
 *
 * @param size the content size in bytes
 *
 * @return the timeout for content of that size
 */
565 private static int getTimeout(
long size) {
566 if (size < 1024 * 1024L)
569 }
else if (size < 10 * 1024 * 1024L)
572 }
else if (size < 100 * 1024 * 1024L)
591 public void setExtractionSettings(Lookup context) {
592 if (context != null) {
593 List<ProcessTerminator> terminators =
new ArrayList<>();
594 ImageConfig configInstance = context.lookup(ImageConfig.class);
595 if (configInstance != null) {
596 this.tesseractOCREnabled = configInstance.getOCREnabled();
598 if (Objects.nonNull(configInstance.getOCRLanguages())) {
599 this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
602 terminators.add(configInstance.getOCRTimeoutTerminator());
605 ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
606 if (terminatorInstance != null) {
607 terminators.add(terminatorInstance);
610 if (!terminators.isEmpty()) {
611 this.processTerminator =
new HybridTerminator(terminators);