Autopsy  4.18.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2020 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.textextractors;
20 
21 import com.google.common.collect.ImmutableList;
22 import com.google.common.io.CharSource;
23 import com.google.common.util.concurrent.ThreadFactoryBuilder;
24 import java.io.File;
25 import java.io.FileInputStream;
26 import java.io.FileNotFoundException;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.PushbackReader;
30 import java.io.Reader;
31 import java.nio.file.Paths;
32 import java.util.HashMap;
33 import java.util.List;
34 import java.util.Objects;
35 import java.util.Map;
36 import java.util.concurrent.Callable;
37 import java.util.concurrent.ExecutorService;
38 import java.util.concurrent.Executors;
39 import java.util.concurrent.Future;
40 import java.util.concurrent.ThreadFactory;
41 import java.util.concurrent.TimeUnit;
42 import java.util.concurrent.TimeoutException;
43 import java.util.logging.Level;
44 import java.util.stream.Collectors;
45 import org.apache.tika.Tika;
46 import org.apache.tika.exception.TikaException;
47 import org.apache.tika.metadata.Metadata;
48 import org.apache.tika.parser.AutoDetectParser;
49 import org.apache.tika.parser.ParseContext;
50 import org.apache.tika.parser.Parser;
51 import org.apache.tika.parser.ParsingReader;
52 import org.apache.tika.parser.microsoft.OfficeParserConfig;
53 import org.apache.tika.parser.ocr.TesseractOCRConfig;
54 import org.apache.tika.parser.pdf.PDFParserConfig;
55 import org.openide.util.NbBundle;
56 import org.openide.modules.InstalledFileLocator;
57 import org.openide.util.Lookup;
67 import org.sleuthkit.datamodel.AbstractFile;
68 import org.sleuthkit.datamodel.Content;
69 import org.sleuthkit.datamodel.ReadContentInputStream;
70 import org.xml.sax.ContentHandler;
71 import org.xml.sax.SAXException;
72 import org.xml.sax.helpers.DefaultHandler;
73 import com.google.common.collect.ImmutableMap;
74 import java.io.InputStreamReader;
75 import java.nio.charset.Charset;
76 import java.util.ArrayList;
77 import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;
79 import org.sleuthkit.datamodel.TskData;
80 
/**
 * Extracts text from Content objects using Apache Tika, and can also run
 * the bundled Tesseract executable directly to OCR image files when OCR is
 * enabled (see ocrEnabled() and performOCR()).
 */
final class TikaTextExtractor implements TextExtractor {

    //Mimetype groups to assist extractor implementations in ignoring binary and
    //archive files.
    private static final List<String> BINARY_MIME_TYPES
            = ImmutableList.of(
                    //ignore binary blob data, for which string extraction will be used
                    "application/octet-stream", //NON-NLS
                    "application/x-msdownload"); //NON-NLS

    // Archive/compressed mime types that are better served by string
    // extraction or a dedicated unpacker than by Tika.
    private static final List<String> ARCHIVE_MIME_TYPES
            = ImmutableList.of(
                    //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
                    "application/x-7z-compressed", //NON-NLS
                    "application/x-ace-compressed", //NON-NLS
                    "application/x-alz-compressed", //NON-NLS
                    "application/x-arj", //NON-NLS
                    "application/vnd.ms-cab-compressed", //NON-NLS
                    "application/x-cfs-compressed", //NON-NLS
                    "application/x-dgc-compressed", //NON-NLS
                    "application/x-apple-diskimage", //NON-NLS
                    "application/x-gca-compressed", //NON-NLS
                    "application/x-dar", //NON-NLS
                    "application/x-lzx", //NON-NLS
                    "application/x-lzh", //NON-NLS
                    "application/x-rar-compressed", //NON-NLS
                    "application/x-stuffit", //NON-NLS
                    "application/x-stuffitx", //NON-NLS
                    "application/x-gtar", //NON-NLS
                    "application/x-archive", //NON-NLS
                    "application/x-executable", //NON-NLS
                    "application/x-gzip", //NON-NLS
                    "application/zip", //NON-NLS
                    "application/x-zoo", //NON-NLS
                    "application/x-cpio", //NON-NLS
                    "application/x-shar", //NON-NLS
                    "application/x-tar", //NON-NLS
                    "application/x-bzip", //NON-NLS
                    "application/x-bzip2", //NON-NLS
                    "application/x-lzip", //NON-NLS
                    "application/x-lzma", //NON-NLS
                    "application/x-lzop", //NON-NLS
                    "application/x-z", //NON-NLS
                    "application/x-compress"); //NON-NLS

    // Used to log to the tika file that is why it uses the java.util.logging.logger class instead of the Autopsy one
    private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
    private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
    // Size threshold (bytes) for "limited OCR" mode: smaller files are
    // skipped unless they are derived files. See useOcrOnFile().
    private static final int LIMITED_OCR_SIZE_MIN = 100 * 1024;
    // Tika readers are created on this dedicated single thread so that the
    // creation can be timed out and cancelled. See getReader().
    private final ThreadFactory tikaThreadFactory
            = new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
    private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
    private static final String SQLITE_MIMETYPE = "application/x-sqlite3";

    private final AutoDetectParser parser = new AutoDetectParser();
    private final Content content;

    // OCR settings; populated from the ImageConfig passed to
    // setExtractionSettings().
    private boolean tesseractOCREnabled;
    private boolean limitedOCREnabled;
    private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
    private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
    // Null when the Tesseract executable cannot be located (e.g. on
    // non-Windows platforms). See locateTesseractExecutable().
    private static final File TESSERACT_PATH = locateTesseractExecutable();
    // "+"-separated list of Tesseract language packs, e.g. "eng+spa".
    private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
    private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
    // Lazily populated cache of Tika metadata for this content; filled by
    // getReader() or getMetadata(), whichever runs first.
    private Map<String, String> metadataMap;

    private ProcessTerminator processTerminator;

    // Every mime type the configured Tika parser supports, as "type/subtype".
    private static final List<String> TIKA_SUPPORTED_TYPES
            = new Tika().getParser().getSupportedTypes(new ParseContext())
                    .stream()
                    .map(mt -> mt.getType() + "/" + mt.getSubtype())
                    .collect(Collectors.toList());
    /**
     * Constructs an extractor for the given content.
     *
     * @param content The content to extract text from; expected to be an
     *                AbstractFile (see isSupported()).
     */
    TikaTextExtractor(Content content) {
        this.content = content;
    }
166 
174  private boolean ocrEnabled() {
175  return TESSERACT_PATH != null && tesseractOCREnabled
176  && PlatformUtil.isWindowsOS() == true && PlatformUtil.is64BitOS();
177  }
178 
    /**
     * Returns a Reader over the text extracted from the content. Images
     * (when OCR is enabled and applicable) are run through the Tesseract
     * executable directly so the OCR can be cancelled; all other supported
     * types go through a Tika ParsingReader created on a worker thread
     * with a size-based timeout.
     *
     * @return A non-empty Reader of extracted text.
     *
     * @throws InitReaderException If the content is unsupported, the parse
     *                             times out, the extraction yields no text,
     *                             or Tika/Tesseract fails.
     */
    @Override
    public Reader getReader() throws InitReaderException {
        if (!this.isSupported()) {
            throw new InitReaderException("Content is not supported");
        }

        // Only abstract files are supported, see isSupported()
        final AbstractFile file = ((AbstractFile) content);
        // This mime type must be non-null, see isSupported()
        final String mimeType = file.getMIMEType();

        // Handle images separately so the OCR task can be cancelled.
        // See JIRA-4519 for the need to have cancellation in the UI and ingest.
        if (ocrEnabled() && mimeType.toLowerCase().startsWith("image/") && useOcrOnFile(file)) {
            InputStream imageOcrStream = performOCR(file);
            return new InputStreamReader(imageOcrStream, Charset.forName("UTF-8"));
        }

        // Set up Tika
        final InputStream stream = new ReadContentInputStream(content);
        final ParseContext parseContext = new ParseContext();

        // Documents can contain other documents. By adding
        // the parser back into the context, Tika will recursively
        // parse embedded documents.
        parseContext.set(Parser.class, parser);

        // Use the more memory efficient Tika SAX parsers for DOCX and
        // PPTX files (it already uses SAX for XLSX).
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setUseSAXPptxExtractor(true);
        officeParserConfig.setUseSAXDocxExtractor(true);
        parseContext.set(OfficeParserConfig.class, officeParserConfig);

        if (ocrEnabled() && useOcrOnFile(file)) {
            // Configure OCR for Tika if it chooses to run OCR
            // during extraction
            TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
            String tesseractFolder = TESSERACT_PATH.getParent();
            ocrConfig.setTesseractPath(tesseractFolder);
            ocrConfig.setLanguage(languagePacks);
            ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
            parseContext.set(TesseractOCRConfig.class, ocrConfig);

            // Configure how Tika handles OCRing PDFs
            PDFParserConfig pdfConfig = new PDFParserConfig();

            // This strategy tries to pick between OCRing a page in the
            // PDF and doing text extraction. It makes this choice by
            // first running text extraction and then counting characters.
            // If there are too few characters or too many unmapped
            // unicode characters, it'll run the entire page through OCR
            // and take that output instead. See JIRA-6938
            pdfConfig.setOcrStrategy(OCR_STRATEGY.AUTO);
            parseContext.set(PDFParserConfig.class, pdfConfig);
        }

        Metadata metadata = new Metadata();
        //Make the creation of a TikaReader a cancellable future in case it takes too long
        Future<Reader> future = executorService.submit(
                new GetTikaReader(parser, stream, metadata, parseContext));
        try {
            final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);

            //check if the reader is empty by probing one character and
            //pushing it back if text was found
            PushbackReader pushbackReader = new PushbackReader(tikaReader);
            int read = pushbackReader.read();
            if (read == -1) {
                throw new InitReaderException("Unable to extract text: "
                        + "Tika returned empty reader for " + content);
            }
            pushbackReader.unread(read);

            //Save the metadata if it has not been fetched already.
            if (metadataMap == null) {
                metadataMap = new HashMap<>();
                for (String mtdtKey : metadata.names()) {
                    metadataMap.put(mtdtKey, metadata.get(mtdtKey));
                }
            }

            return new ReaderCharSource(pushbackReader).openStream();
        } catch (TimeoutException te) {
            final String msg = NbBundle.getMessage(this.getClass(),
                    "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
                    content.getId(), content.getName());
            throw new InitReaderException(msg, te);
        } catch (InitReaderException ex) {
            // Re-throw our own exception (e.g. the empty-reader case above)
            // without wrapping it a second time.
            throw ex;
        } catch (Exception ex) {
            AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error with file [id=%d] %s, see Tika log for details...",
                    content.getId(), content.getName()));
            // NOTE(review): ex.getCause() may be null for exceptions other
            // than ExecutionException, in which case no throwable is logged.
            TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to Tika parse the "
                    + "content" + content.getId() + ": " + content.getName(),
                    ex.getCause()); //NON-NLS
            final String msg = NbBundle.getMessage(this.getClass(),
                    "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
                    content.getId(), content.getName());
            throw new InitReaderException(msg, ex);
        } finally {
            // Cancel the Tika task if it is still running (e.g. on timeout).
            future.cancel(true);
        }
    }
292 
    /**
     * Runs the Tesseract executable directly on the given image file and
     * returns a stream over the resulting text file. The content is first
     * copied into the case temp directory; that copy is always deleted
     * before returning, while the OCR output file is deleted when the
     * returned stream is closed (see CleanUpStream).
     *
     * @param file The image file to OCR.
     *
     * @return A stream over the OCR'd text.
     *
     * @throws InitReaderException If there is no current case or an I/O
     *                             error occurs while running Tesseract.
     */
    private InputStream performOCR(AbstractFile file) throws InitReaderException {
        File inputFile = null;
        File outputFile = null;
        try {
            String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();

            //Appending file id makes the name unique
            String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
            inputFile = Paths.get(tempDirectory, tempFileName).toFile();
            ContentUtils.writeToFile(content, inputFile);

            String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
            String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
            String executeablePath = TESSERACT_PATH.toString();

            //Build tesseract commands
            ProcessBuilder process = new ProcessBuilder();
            process.command(executeablePath,
                    String.format("\"%s\"", inputFile.getAbsolutePath()),
                    String.format("\"%s\"", outputFilePath),
                    "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
                    //language pack command flag
                    "-l", languagePacks);

            //If the ProcessTerminator was supplied during
            //configuration apply it here.
            if (processTerminator != null) {
                // The terminator is polled every second to decide whether
                // to kill the Tesseract process early.
                ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
            } else {
                ExecUtil.execute(process);
            }

            // Tesseract appends ".txt" to the output base name it was given.
            outputFile = new File(outputFilePath + ".txt");
            //Open a stream of the Tesseract text file and send this to Tika
            return new CleanUpStream(outputFile);
        } catch (NoCurrentCaseException | IOException ex) {
            // On failure the output file (if created) is removed here, since
            // no CleanUpStream will ever be closed for it.
            if (outputFile != null) {
                outputFile.delete();
            }
            throw new InitReaderException("Could not successfully run Tesseract", ex);
        } finally {
            // The temp copy of the input is no longer needed either way.
            if (inputFile != null) {
                inputFile.delete();
            }
        }
    }
349 
362  private boolean useOcrOnFile(AbstractFile file) {
363  return !limitedOCREnabled || file.getSize() > LIMITED_OCR_SIZE_MIN || file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.DERIVED;
364  }
365 
370  private class GetTikaReader implements Callable<Reader> {
371 
372  private final AutoDetectParser parser;
373  private final InputStream stream;
374  private final Metadata metadata;
375  private final ParseContext parseContext;
376 
377  GetTikaReader(AutoDetectParser parser, InputStream stream,
378  Metadata metadata, ParseContext parseContext) {
379  this.parser = parser;
380  this.stream = stream;
381  this.metadata = metadata;
382  this.parseContext = parseContext;
383  }
384 
385  @Override
386  public Reader call() throws Exception {
387  return new ParsingReader(parser, stream, metadata, parseContext);
388  }
389  }
390 
396  private class CleanUpStream extends FileInputStream {
397 
398  private File file;
399 
407  CleanUpStream(File file) throws FileNotFoundException {
408  super(file);
409  this.file = file;
410  }
411 
417  @Override
418  public void close() throws IOException {
419  try {
420  super.close();
421  } finally {
422  if (file != null) {
423  file.delete();
424  file = null;
425  }
426  }
427  }
428  }
429 
435  private static File locateTesseractExecutable() {
436  if (!PlatformUtil.isWindowsOS()) {
437  return null;
438  }
439 
440  String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
441  File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(), false);
442  if (null == exeFile) {
443  return null;
444  }
445 
446  if (!exeFile.canExecute()) {
447  return null;
448  }
449 
450  return exeFile;
451  }
452 
458  @Override
459  public Map<String, String> getMetadata() {
460  if (metadataMap != null) {
461  return ImmutableMap.copyOf(metadataMap);
462  }
463 
464  try {
465  metadataMap = new HashMap<>();
466  InputStream stream = new ReadContentInputStream(content);
467  ContentHandler doNothingContentHandler = new DefaultHandler();
468  Metadata mtdt = new Metadata();
469  parser.parse(stream, doNothingContentHandler, mtdt);
470  for (String mtdtKey : mtdt.names()) {
471  metadataMap.put(mtdtKey, mtdt.get(mtdtKey));
472  }
473  } catch (IOException | SAXException | TikaException ex) {
474  AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error getting metadata for file [id=%d] %s, see Tika log for details...", //NON-NLS
475  content.getId(), content.getName()));
476  TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to get metadata for " //NON-NLS
477  + "content" + content.getId() + ": " + content.getName(), ex); //NON-NLS
478  }
479 
480  return metadataMap;
481  }
482 
488  @Override
489  public boolean isSupported() {
490  if (!(content instanceof AbstractFile)) {
491  return false;
492  }
493 
494  String detectedType = ((AbstractFile) content).getMIMEType();
495  if (detectedType == null
496  || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
497  || ARCHIVE_MIME_TYPES.contains(detectedType)
498  || (detectedType.startsWith("video/") && !detectedType.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
499  || detectedType.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
500  ) {
501  return false;
502  }
503 
504  return TIKA_SUPPORTED_TYPES.contains(detectedType);
505  }
506 
512  private static String formatLanguagePacks(List<String> languagePacks) {
513  return String.join("+", languagePacks);
514  }
515 
523  private static int getTimeout(long size) {
524  if (size < 1024 * 1024L) //1MB
525  {
526  return 60;
527  } else if (size < 10 * 1024 * 1024L) //10MB
528  {
529  return 1200;
530  } else if (size < 100 * 1024 * 1024L) //100MB
531  {
532  return 3600;
533  } else {
534  return 3 * 3600;
535  }
536 
537  }
538 
548  @Override
549  public void setExtractionSettings(Lookup context) {
550  if (context != null) {
551  List<ProcessTerminator> terminators = new ArrayList<>();
552  ImageConfig configInstance = context.lookup(ImageConfig.class);
553  if (configInstance != null) {
554  this.tesseractOCREnabled = configInstance.getOCREnabled();
555  this.limitedOCREnabled = configInstance.getLimitedOCREnabled();
556 
557  if (Objects.nonNull(configInstance.getOCRLanguages())) {
558  this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
559  }
560 
561  terminators.add(configInstance.getOCRTimeoutTerminator());
562  }
563 
564  ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
565  if (terminatorInstance != null) {
566  terminators.add(terminatorInstance);
567  }
568 
569  if (!terminators.isEmpty()) {
570  this.processTerminator = new HybridTerminator(terminators);
571  }
572  }
573  }
574 
    /**
     * Adapts an already-open Reader to Guava's CharSource API. Note that
     * openStream() hands back the SAME reader on every call, so this
     * source can only be consumed once.
     */
    private static class ReaderCharSource extends CharSource {

        private final Reader reader;

        ReaderCharSource(Reader reader) {
            this.reader = reader;
        }

        @Override
        public Reader openStream() throws IOException {
            return reader;
        }
    }
592 }

Copyright © 2012-2021 Basis Technology. Generated on: Thu Jul 8 2021
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.