Autopsy  4.10.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.textextractors;
20 
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.io.CharSource;
23 import com.google.common.util.concurrent.ThreadFactoryBuilder;
24 import java.io.File;
25 import java.io.FileInputStream;
26 import java.io.FileNotFoundException;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.PushbackReader;
30 import java.io.Reader;
31 import java.nio.file.Paths;
32 import java.util.List;
33 import java.util.Objects;
34 import java.util.concurrent.Callable;
35 import java.util.concurrent.ExecutorService;
36 import java.util.concurrent.Executors;
37 import java.util.concurrent.Future;
38 import java.util.concurrent.ThreadFactory;
39 import java.util.concurrent.TimeUnit;
40 import java.util.concurrent.TimeoutException;
41 import java.util.logging.Level;
42 import java.util.stream.Collectors;
43 import java.util.stream.Stream;
44 import org.apache.tika.Tika;
45 import org.apache.tika.metadata.Metadata;
46 import org.apache.tika.parser.AutoDetectParser;
47 import org.apache.tika.parser.ParseContext;
48 import org.apache.tika.parser.Parser;
49 import org.apache.tika.parser.ParsingReader;
50 import org.apache.tika.parser.microsoft.OfficeParserConfig;
51 import org.apache.tika.parser.ocr.TesseractOCRConfig;
52 import org.apache.tika.parser.pdf.PDFParserConfig;
53 import org.openide.util.NbBundle;
54 import org.openide.modules.InstalledFileLocator;
55 import org.openide.util.Lookup;
64 import org.sleuthkit.datamodel.AbstractFile;
65 import org.sleuthkit.datamodel.Content;
66 import org.sleuthkit.datamodel.ReadContentInputStream;
67 
72 final class TikaTextExtractor implements TextExtractor {
73 
74  //Mimetype groups to assist extractor implementations in ignoring binary and
75  //archive files.
//isSupported() reports a file as unsupported when its MIME type is in this list.
76  private static final List<String> BINARY_MIME_TYPES
77  = ImmutableList.of(
78  //ignore binary blob data, for which string extraction will be used
79  "application/octet-stream", //NON-NLS
80  "application/x-msdownload"); //NON-NLS
81 
//MIME types of archive, compressed and executable formats.
//isSupported() reports a file as unsupported when its MIME type is in this list.
86  private static final List<String> ARCHIVE_MIME_TYPES
87  = ImmutableList.of(
88  //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
89  "application/x-7z-compressed", //NON-NLS
90  "application/x-ace-compressed", //NON-NLS
91  "application/x-alz-compressed", //NON-NLS
92  "application/x-arj", //NON-NLS
93  "application/vnd.ms-cab-compressed", //NON-NLS
94  "application/x-cfs-compressed", //NON-NLS
95  "application/x-dgc-compressed", //NON-NLS
96  "application/x-apple-diskimage", //NON-NLS
97  "application/x-gca-compressed", //NON-NLS
98  "application/x-dar", //NON-NLS
99  "application/x-lzx", //NON-NLS
100  "application/x-lzh", //NON-NLS
101  "application/x-rar-compressed", //NON-NLS
102  "application/x-stuffit", //NON-NLS
103  "application/x-stuffitx", //NON-NLS
104  "application/x-gtar", //NON-NLS
105  "application/x-archive", //NON-NLS
106  "application/x-executable", //NON-NLS
107  "application/x-gzip", //NON-NLS
108  "application/zip", //NON-NLS
109  "application/x-zoo", //NON-NLS
110  "application/x-cpio", //NON-NLS
111  "application/x-shar", //NON-NLS
112  "application/x-tar", //NON-NLS
113  "application/x-bzip", //NON-NLS
114  "application/x-bzip2", //NON-NLS
115  "application/x-lzip", //NON-NLS
116  "application/x-lzma", //NON-NLS
117  "application/x-lzop", //NON-NLS
118  "application/x-z", //NON-NLS
119  "application/x-compress"); //NON-NLS
120 
//Logger registered under the name "Tika"; getReader() writes Tika parse failures to it.
121  private static final java.util.logging.Logger tikaLogger = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
122 
//Names the threads of the executor below "tika-reader-<n>".
123  private final ThreadFactory tikaThreadFactory
124  = new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
//Single-threaded executor on which getReader() runs GetTikaReader so the wait can be bounded by a timeout.
//NOTE(review): one executor is created per extractor instance and nothing visible in this class shuts it
//down - confirm the intended thread lifetime.
125  private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
//MIME type that isSupported() always rejects.
126  private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
127 
//Parser handed to Tika's ParsingReader and registered in the ParseContext by getReader().
128  private final AutoDetectParser parser = new AutoDetectParser();
//The content this extractor reads; assigned once in the constructor.
129  private final Content content;
130 
//OCR switch; stays false until setExtractionSettings() supplies an ImageConfig value.
131  private boolean tesseractOCREnabled;
//Folder and file name used by locateTesseractExecutable() to find the bundled Tesseract binary.
132  private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
133  private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
//Result of locateTesseractExecutable(), evaluated once in this static initializer; null when no usable executable was found.
134  private static final File TESSERACT_PATH = locateTesseractExecutable();
//'+'-joined language pack names passed to Tesseract; may be replaced by setExtractionSettings().
135  private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
//Suffix of the temp output name given to Tesseract in performOCR(); performOCR() then opens that path plus ".txt".
136  private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
137 
//Optional terminator for the Tesseract process; null unless supplied through setExtractionSettings().
138  private ProcessTerminator processTerminator;
139 
//"type/subtype" strings for every media type reported by the default Tika parser; computed once in this static initializer.
//isSupported() accepts a file only if its MIME type appears here.
140  private static final List<String> TIKA_SUPPORTED_TYPES
141  = new Tika().getParser().getSupportedTypes(new ParseContext())
142  .stream()
143  .map(mt -> mt.getType() + "/" + mt.getSubtype())
144  .collect(Collectors.toList());
145 
/**
 * Creates an extractor bound to the given content. No validation is done
 * here; isSupported() later decides whether the content can be handled.
 *
 * @param content The content whose text getReader() will extract.
 */
146  public TikaTextExtractor(Content content) {
147  this.content = content;
148  }
149 
157  private boolean ocrEnabled() {
158  return TESSERACT_PATH != null && tesseractOCREnabled
159  && PlatformUtil.isWindowsOS() == true && PlatformUtil.is64BitOS();
160  }
161 
173  @Override
174  public Reader getReader() throws InitReaderException {
175  InputStream stream = null;
176 
177  ParseContext parseContext = new ParseContext();
178  parseContext.set(Parser.class, parser);
179 
180  if (ocrEnabled() && content instanceof AbstractFile) {
181  AbstractFile file = ((AbstractFile) content);
182  //Run OCR on images with Tesseract directly.
183  if (file.getMIMEType().toLowerCase().startsWith("image/")) {
184  stream = performOCR(file);
185  } else {
186  //Otherwise, go through Tika for PDFs so that it can
187  //extract images and run Tesseract on them.
188  PDFParserConfig pdfConfig = new PDFParserConfig();
189 
190  // Extracting the inline images and letting Tesseract run on each inline image.
191  // https://wiki.apache.org/tika/PDFParser%20%28Apache%20PDFBox%29
192  // https://tika.apache.org/1.7/api/org/apache/tika/parser/pdf/PDFParserConfig.html
193  pdfConfig.setExtractInlineImages(true);
194  // Multiple pages within a PDF file might refer to the same underlying image.
195  pdfConfig.setExtractUniqueInlineImagesOnly(true);
196  parseContext.set(PDFParserConfig.class, pdfConfig);
197 
198  // Configure Tesseract parser to perform OCR
199  TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
200  String tesseractFolder = TESSERACT_PATH.getParent();
201  ocrConfig.setTesseractPath(tesseractFolder);
202 
203  ocrConfig.setLanguage(languagePacks);
204  ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
205  parseContext.set(TesseractOCRConfig.class, ocrConfig);
206 
207  stream = new ReadContentInputStream(content);
208  }
209  } else {
210  stream = new ReadContentInputStream(content);
211  }
212 
213  Metadata metadata = new Metadata();
214  // Use the more memory efficient Tika SAX parsers for DOCX and
215  // PPTX files (it already uses SAX for XLSX).
216  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
217  officeParserConfig.setUseSAXPptxExtractor(true);
218  officeParserConfig.setUseSAXDocxExtractor(true);
219  parseContext.set(OfficeParserConfig.class, officeParserConfig);
220 
221  //Make the creation of a TikaReader a cancellable future in case it takes too long
222  Future<Reader> future = executorService.submit(
223  new GetTikaReader(parser, stream, metadata, parseContext));
224  try {
225  final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
226  //check if the reader is empty
227  PushbackReader pushbackReader = new PushbackReader(tikaReader);
228  int read = pushbackReader.read();
229  if (read == -1) {
230  throw new InitReaderException("Unable to extract text: "
231  + "Tika returned empty reader for " + content);
232  }
233  pushbackReader.unread(read);
234  //concatenate parsed content and meta data into a single reader.
235  CharSource metaDataCharSource = getMetaDataCharSource(metadata);
236  return CharSource.concat(new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
237  } catch (TimeoutException te) {
238  final String msg = NbBundle.getMessage(this.getClass(),
239  "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
240  content.getId(), content.getName());
241  throw new InitReaderException(msg, te);
242  } catch (InitReaderException ex) {
243  throw ex;
244  } catch (Exception ex) {
245  tikaLogger.log(Level.WARNING, "Exception: Unable to Tika parse the "
246  + "content" + content.getId() + ": " + content.getName(),
247  ex.getCause()); //NON-NLS
248  final String msg = NbBundle.getMessage(this.getClass(),
249  "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
250  content.getId(), content.getName());
251  throw new InitReaderException(msg, ex);
252  } finally {
253  future.cancel(true);
254  }
255  }
256 
267  private InputStream performOCR(AbstractFile file) throws InitReaderException {
268  File inputFile = null;
269  File outputFile = null;
270  try {
271  String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
272 
273  //Appending file id makes the name unique
274  String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
275  inputFile = Paths.get(tempDirectory, tempFileName).toFile();
276  ContentUtils.writeToFile(content, inputFile);
277 
278  String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
279  String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
280  String executeablePath = TESSERACT_PATH.toString();
281 
282  //Build tesseract commands
283  ProcessBuilder process = new ProcessBuilder();
284  process.command(executeablePath,
285  String.format("\"%s\"", inputFile.getAbsolutePath()),
286  String.format("\"%s\"", outputFilePath),
287  "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
288  //language pack command flag
289  "-l", languagePacks);
290 
291  //If the ProcessTerminator was supplied during
292  //configuration apply it here.
293  if (processTerminator != null) {
294  ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
295  } else {
296  ExecUtil.execute(process);
297  }
298 
299  outputFile = new File(outputFilePath + ".txt");
300  //Open a stream of the Tesseract text file and send this to Tika
301  return new CleanUpStream(outputFile);
302  } catch (NoCurrentCaseException | IOException ex) {
303  if (outputFile != null) {
304  outputFile.delete();
305  }
306  throw new InitReaderException("Could not successfully run Tesseract", ex);
307  } finally {
308  if (inputFile != null) {
309  inputFile.delete();
310  }
311  }
312  }
313 
318  private class GetTikaReader implements Callable<Reader> {
319 
320  private final AutoDetectParser parser;
321  private final InputStream stream;
322  private final Metadata metadata;
323  private final ParseContext parseContext;
324 
325  public GetTikaReader(AutoDetectParser parser, InputStream stream,
326  Metadata metadata, ParseContext parseContext) {
327  this.parser = parser;
328  this.stream = stream;
329  this.metadata = metadata;
330  this.parseContext = parseContext;
331  }
332 
333  @Override
334  public Reader call() throws Exception {
335  return new ParsingReader(parser, stream, metadata, parseContext);
336  }
337  }
338 
344  private class CleanUpStream extends FileInputStream {
345 
346  private File file;
347 
355  public CleanUpStream(File file) throws FileNotFoundException {
356  super(file);
357  this.file = file;
358  }
359 
365  @Override
366  public void close() throws IOException {
367  try {
368  super.close();
369  } finally {
370  if (file != null) {
371  file.delete();
372  file = null;
373  }
374  }
375  }
376  }
377 
383  private static File locateTesseractExecutable() {
384  if (!PlatformUtil.isWindowsOS()) {
385  return null;
386  }
387 
388  String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
389  File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(), false);
390  if (null == exeFile) {
391  return null;
392  }
393 
394  if (!exeFile.canExecute()) {
395  return null;
396  }
397 
398  return exeFile;
399  }
400 
409  static private CharSource getMetaDataCharSource(Metadata metadata) {
410  return CharSource.wrap(
411  new StringBuilder("\n\n------------------------------METADATA------------------------------\n\n")
412  .append(Stream.of(metadata.names()).sorted()
413  .map(key -> key + ": " + metadata.get(key))
414  .collect(Collectors.joining("\n"))
415  ));
416  }
417 
423  @Override
424  public boolean isSupported() {
425  if(!(content instanceof AbstractFile)) {
426  return false;
427  }
428 
429  String detectedType = ((AbstractFile)content).getMIMEType();
430  if (detectedType == null
431  || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
432  || ARCHIVE_MIME_TYPES.contains(detectedType)
433  || (detectedType.startsWith("video/") && !detectedType.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
434  || detectedType.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
435  ) {
436  return false;
437  }
438 
439  return TIKA_SUPPORTED_TYPES.contains(detectedType);
440  }
441 
447  private static String formatLanguagePacks(List<String> languagePacks) {
448  return String.join("+", languagePacks);
449  }
450 
458  private static int getTimeout(long size) {
459  if (size < 1024 * 1024L) //1MB
460  {
461  return 60;
462  } else if (size < 10 * 1024 * 1024L) //10MB
463  {
464  return 1200;
465  } else if (size < 100 * 1024 * 1024L) //100MB
466  {
467  return 3600;
468  } else {
469  return 3 * 3600;
470  }
471 
472  }
473 
483  @Override
484  public void setExtractionSettings(Lookup context) {
485  if (context != null) {
486  ImageConfig configInstance = context.lookup(ImageConfig.class);
487  if (configInstance != null) {
488  if(Objects.nonNull(configInstance.getOCREnabled())) {
489  this.tesseractOCREnabled = configInstance.getOCREnabled();
490  }
491 
492  if(Objects.nonNull(configInstance.getOCRLanguages())) {
493  this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
494  }
495  }
496 
497  ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
498  if (terminatorInstance != null) {
499  this.processTerminator = terminatorInstance;
500  }
501  }
502  }
503 
508  private static class ReaderCharSource extends CharSource {
509 
510  private final Reader reader;
511 
512  ReaderCharSource(Reader reader) {
513  this.reader = reader;
514  }
515 
516  @Override
517  public Reader openStream() throws IOException {
518  return reader;
519  }
520  }
521 }
GetTikaReader(AutoDetectParser parser, InputStream stream, Metadata metadata, ParseContext parseContext)

Copyright © 2012-2018 Basis Technology. Generated on: Fri Mar 22 2019
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.