Autopsy  4.21.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2018-2021 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.textextractors;
20 
21 import com.google.common.io.CharSource;
22 import com.google.common.util.concurrent.ThreadFactoryBuilder;
23 import java.io.File;
24 import java.io.FileInputStream;
25 import java.io.FileNotFoundException;
26 import java.io.IOException;
27 import java.io.InputStream;
28 import java.io.PushbackReader;
29 import java.io.Reader;
30 import java.nio.file.Paths;
31 import java.util.HashMap;
32 import java.util.List;
33 import java.util.Objects;
34 import java.util.Map;
35 import java.util.concurrent.Callable;
36 import java.util.concurrent.ExecutorService;
37 import java.util.concurrent.Executors;
38 import java.util.concurrent.Future;
39 import java.util.concurrent.ThreadFactory;
40 import java.util.concurrent.TimeUnit;
41 import java.util.concurrent.TimeoutException;
42 import java.util.logging.Level;
43 import java.util.stream.Collectors;
44 import org.apache.tika.Tika;
45 import org.apache.tika.exception.TikaException;
46 import org.apache.tika.metadata.Metadata;
47 import org.apache.tika.parser.AutoDetectParser;
48 import org.apache.tika.parser.ParseContext;
49 import org.apache.tika.parser.Parser;
50 import org.apache.tika.parser.ParsingReader;
51 import org.apache.tika.parser.microsoft.OfficeParserConfig;
52 import org.apache.tika.parser.pdf.PDFParserConfig;
53 import org.openide.util.NbBundle;
54 import org.openide.modules.InstalledFileLocator;
55 import org.openide.util.Lookup;
68 import org.xml.sax.ContentHandler;
69 import org.xml.sax.SAXException;
70 import org.xml.sax.helpers.DefaultHandler;
71 import com.google.common.collect.ImmutableMap;
72 import com.google.common.collect.ImmutableSet;
73 import java.io.InputStreamReader;
74 import java.nio.charset.Charset;
75 import java.util.ArrayList;
76 import java.util.Set;
77 import org.apache.tika.config.TikaConfig;
78 import org.apache.tika.mime.MimeTypes;
79 import org.apache.tika.parser.ocr.TesseractOCRConfig;
80 import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;
83 
88 final class TikaTextExtractor implements TextExtractor {
89 
90  //Mimetype groups to assist extractor implementations in ignoring binary and
91  //archive files.
92  private static final Set<String> BINARY_MIME_TYPES
93  = ImmutableSet.of(
94  //ignore binary blob data, for which string extraction will be used
95  "application/octet-stream", //NON-NLS
96  "application/x-msdownload"); //NON-NLS
97 
102  private static final Set<String> ARCHIVE_MIME_TYPES
103  = ImmutableSet.of(
104  //ignore unstructured binary and compressed data, for which string extraction or unzipper works better
105  "application/x-7z-compressed", //NON-NLS
106  "application/x-ace-compressed", //NON-NLS
107  "application/x-alz-compressed", //NON-NLS
108  "application/x-arj", //NON-NLS
109  "application/vnd.ms-cab-compressed", //NON-NLS
110  "application/x-cfs-compressed", //NON-NLS
111  "application/x-dgc-compressed", //NON-NLS
112  "application/x-apple-diskimage", //NON-NLS
113  "application/x-gca-compressed", //NON-NLS
114  "application/x-dar", //NON-NLS
115  "application/x-lzx", //NON-NLS
116  "application/x-lzh", //NON-NLS
117  "application/x-rar-compressed", //NON-NLS
118  "application/x-stuffit", //NON-NLS
119  "application/x-stuffitx", //NON-NLS
120  "application/x-gtar", //NON-NLS
121  "application/x-archive", //NON-NLS
122  "application/x-executable", //NON-NLS
123  "application/x-gzip", //NON-NLS
124  "application/zip", //NON-NLS
125  "application/x-zoo", //NON-NLS
126  "application/x-cpio", //NON-NLS
127  "application/x-shar", //NON-NLS
128  "application/x-tar", //NON-NLS
129  "application/x-bzip", //NON-NLS
130  "application/x-bzip2", //NON-NLS
131  "application/x-lzip", //NON-NLS
132  "application/x-lzma", //NON-NLS
133  "application/x-lzop", //NON-NLS
134  "application/x-z", //NON-NLS
135  "application/x-compress"); //NON-NLS
136 
137  // Used to log to the Tika log file, which is why it uses the java.util.logging.Logger class instead of the Autopsy one
138  private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS
139  private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());
140 
141  private final ThreadFactory tikaThreadFactory
142  = new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();
143  private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);
144  private static final String SQLITE_MIMETYPE = "application/x-sqlite3";
145 
146  private final AutoDetectParser parser = new AutoDetectParser();
147  private final FileTypeDetector fileTypeDetector;
148  private final Content content;
149 
150  private boolean tesseractOCREnabled;
151  private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS
152  private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS
153  private static final File TESSERACT_PATH = locateTesseractExecutable();
154  private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());
155  private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS
156 
157  // documents where OCR is performed
158  private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(
159  "application/pdf",
160  "application/msword",
161  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
162  "application/vnd.ms-powerpoint",
163  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
164  "application/vnd.ms-excel",
165  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
166  );
167 
168  private static final String IMAGE_MIME_TYPE_PREFIX = "image/";
169 
170  private Map<String, String> metadataMap;
171 
172  private ProcessTerminator processTerminator;
173 
174  private static final List<String> TIKA_SUPPORTED_TYPES
175  = new Tika().getParser().getSupportedTypes(new ParseContext())
176  .stream()
177  .map(mt -> mt.getType() + "/" + mt.getSubtype())
178  .collect(Collectors.toList());
179 
/**
 * Creates an extractor for the given content.
 *
 * A file type detector is created alongside; if it cannot be initialized
 * the failure is logged to the Tika log and the extractor continues
 * without one (the detector field is then null).
 *
 * @param content The content text will be extracted from.
 */
TikaTextExtractor(Content content) {
    this.content = content;
    this.fileTypeDetector = newFileTypeDetectorOrNull();
}

/**
 * Builds a file type detector, or returns null (after logging) when the
 * detector cannot be initialized.
 *
 * @return A new detector, or null on initialization failure.
 */
private static FileTypeDetector newFileTypeDetectorOrNull() {
    try {
        return new FileTypeDetector();
    } catch (FileTypeDetector.FileTypeDetectorInitException initFailure) {
        TIKA_LOGGER.log(Level.SEVERE, "Unable to instantiate a file type detector", initFailure);
        return null;
    }
}
191 
202  private String getMimeType(AbstractFile file) {
203  String mimeType = MimeTypes.OCTET_STREAM;
204  if (fileTypeDetector != null) {
205  mimeType = fileTypeDetector.getMIMEType(file);
206  } else if (file.getMIMEType() != null) {
207  mimeType = file.getMIMEType();
208  }
209 
210  return mimeType.trim().toLowerCase();
211  }
212 
213  @Override
214  public boolean willUseOCR() {
215  if (!isOcrSupported() || (!(content instanceof AbstractFile))) {
216  return false;
217  }
218 
219  String mimeType = getMimeType((AbstractFile) content);
220  // in order to ocr, it needs to either be an image or a document with embedded content
221  return mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX) || OCR_DOCUMENTS.contains(mimeType);
222  }
223 
229  private boolean isOcrSupported() {
230  // If Tesseract has been installed and is set to be used through
231  // configuration, then ocr is enabled. OCR can only currently be run on 64
232  // bit Windows OS.
233  return TESSERACT_PATH != null
234  && tesseractOCREnabled
235  && PlatformUtil.isWindowsOS()
236  && PlatformUtil.is64BitOS()
237  && isSupported();
238  }
239 
251  @Override
252  public Reader getReader() throws InitReaderException {
253  if (!this.isSupported()) {
254  throw new InitReaderException("Content is not supported");
255  }
256 
257  // Only abstract files are supported, see isSupported()
258  final AbstractFile file = ((AbstractFile) content);
259 
260  String mimeType = getMimeType(file);
261 
262  // Handle images seperately so the OCR task can be cancelled.
263  // See JIRA-4519 for the need to have cancellation in the UI and ingest.
264  if (isOcrSupported() && mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
265  InputStream imageOcrStream = performOCR(file);
266  return new InputStreamReader(imageOcrStream, Charset.forName("UTF-8"));
267  }
268 
269  // Set up Tika
270  final InputStream stream = new ReadContentInputStream(content);
271 
272  final ParseContext parseContext = new ParseContext();
273  // Documents can contain other documents. By adding
274  // the parser back into the context, Tika will recursively
275  // parse embedded documents.
276  parseContext.set(Parser.class, parser);
277  // Use the more memory efficient Tika SAX parsers for DOCX and
278  // PPTX files (it already uses SAX for XLSX).
279  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
280  officeParserConfig.setUseSAXPptxExtractor(true);
281  officeParserConfig.setUseSAXDocxExtractor(true);
282  parseContext.set(OfficeParserConfig.class, officeParserConfig);
283  if (isOcrSupported()) {
284  // Configure OCR for Tika if it chooses to run OCR
285  // during extraction
286  TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
287  String tesseractFolder = TESSERACT_PATH.getParent();
288  // coming from https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=109454096#TikaOCR-OverridingDefaultConfiguration
289  ocrConfig.getOtherTesseractConfig().put("tessdataPath", PlatformUtil.getOcrLanguagePacksPath());
290  ocrConfig.getOtherTesseractConfig().put("tesseractPath", tesseractFolder);
291  ocrConfig.setLanguage(languagePacks);
292  parseContext.set(TesseractOCRConfig.class, ocrConfig);
293 
294  // Configure how Tika handles OCRing PDFs
295  PDFParserConfig pdfConfig = new PDFParserConfig();
296 
297  // This stategy tries to pick between OCRing a page in the
298  // PDF and doing text extraction. It makes this choice by
299  // first running text extraction and then counting characters.
300  // If there are too few characters or too many unmapped
301  // unicode characters, it'll run the entire page through OCR
302  // and take that output instead. See JIRA-6938
303  pdfConfig.setOcrStrategy(OCR_STRATEGY.AUTO);
304  parseContext.set(PDFParserConfig.class, pdfConfig);
305  }
306 
307  Metadata metadata = new Metadata();
308  //Make the creation of a TikaReader a cancellable future in case it takes too long
309  Future<Reader> future = executorService.submit(
310  new GetTikaReader(parser, stream, metadata, parseContext));
311  try {
312  final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
313  //check if the reader is empty
314  PushbackReader pushbackReader = new PushbackReader(tikaReader);
315  int read = pushbackReader.read();
316  if (read == -1) {
317  throw new InitReaderException("Unable to extract text: "
318  + "Tika returned empty reader for " + content);
319  }
320  pushbackReader.unread(read);
321 
322  //Save the metadata if it has not been fetched already.
323  if (metadataMap == null) {
324  metadataMap = new HashMap<>();
325  for (String mtdtKey : metadata.names()) {
326  metadataMap.put(mtdtKey, metadata.get(mtdtKey));
327  }
328  }
329 
330  return new ReaderCharSource(pushbackReader).openStream();
331  } catch (TimeoutException te) {
332  final String msg = NbBundle.getMessage(this.getClass(),
333  "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
334  content.getId(), content.getName());
335  throw new InitReaderException(msg, te);
336  } catch (InitReaderException ex) {
337  throw ex;
338  } catch (Exception ex) {
339  AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error with file [id=%d] %s, see Tika log for details...",
340  content.getId(), content.getName()));
341  TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to Tika parse the "
342  + "content" + content.getId() + ": " + content.getName(),
343  ex.getCause()); //NON-NLS
344  final String msg = NbBundle.getMessage(this.getClass(),
345  "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
346  content.getId(), content.getName());
347  throw new InitReaderException(msg, ex);
348  } finally {
349  future.cancel(true);
350  }
351  }
352 
363  private InputStream performOCR(AbstractFile file) throws InitReaderException {
364  File inputFile = null;
365  File outputFile = null;
366  try {
367  String tempDirectory = Case.getCurrentCaseThrows().getTempDirectory();
368 
369  //Appending file id makes the name unique
370  String tempFileName = FileUtil.escapeFileName(file.getId() + file.getName());
371  inputFile = Paths.get(tempDirectory, tempFileName).toFile();
372  ContentUtils.writeToFile(content, inputFile);
373 
374  String tempOutputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
375  String outputFilePath = Paths.get(tempDirectory, tempOutputName).toString();
376  String executeablePath = TESSERACT_PATH.toString();
377 
378  //Build tesseract commands
379  ProcessBuilder process = new ProcessBuilder();
380  process.command(executeablePath,
381  String.format("\"%s\"", inputFile.getAbsolutePath()),
382  String.format("\"%s\"", outputFilePath),
383  "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
384  //language pack command flag
385  "-l", languagePacks);
386 
387  //If the ProcessTerminator was supplied during
388  //configuration apply it here.
389  if (processTerminator != null) {
390  ExecUtil.execute(process, 1, TimeUnit.SECONDS, processTerminator);
391  } else {
392  ExecUtil.execute(process);
393  }
394 
395  outputFile = new File(outputFilePath + ".txt");
396  //Open a stream of the Tesseract text file and send this to Tika
397  return new CleanUpStream(outputFile);
398  } catch (NoCurrentCaseException | IOException ex) {
399  if (outputFile != null) {
400  outputFile.delete();
401  }
402  throw new InitReaderException("Could not successfully run Tesseract", ex);
403  } finally {
404  if (inputFile != null) {
405  inputFile.delete();
406  }
407  }
408  }
409 
414  private class GetTikaReader implements Callable<Reader> {
415 
416  private final AutoDetectParser parser;
417  private final InputStream stream;
418  private final Metadata metadata;
419  private final ParseContext parseContext;
420 
421  GetTikaReader(AutoDetectParser parser, InputStream stream,
422  Metadata metadata, ParseContext parseContext) {
423  this.parser = parser;
424  this.stream = stream;
425  this.metadata = metadata;
426  this.parseContext = parseContext;
427  }
428 
429  @Override
430  public Reader call() throws Exception {
431  return new ParsingReader(parser, stream, metadata, parseContext);
432  }
433  }
434 
440  private class CleanUpStream extends FileInputStream {
441 
442  private File file;
443 
451  CleanUpStream(File file) throws FileNotFoundException {
452  super(file);
453  this.file = file;
454  }
455 
461  @Override
462  public void close() throws IOException {
463  try {
464  super.close();
465  } finally {
466  if (file != null) {
467  file.delete();
468  file = null;
469  }
470  }
471  }
472  }
473 
479  private static File locateTesseractExecutable() {
480  if (!PlatformUtil.isWindowsOS()) {
481  return null;
482  }
483 
484  String executableToFindName = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
485  File exeFile = InstalledFileLocator.getDefault().locate(executableToFindName, TikaTextExtractor.class.getPackage().getName(), false);
486  if (null == exeFile) {
487  return null;
488  }
489 
490  if (!exeFile.canExecute()) {
491  return null;
492  }
493 
494  return exeFile;
495  }
496 
502  @Override
503  public Map<String, String> getMetadata() {
504  if (metadataMap != null) {
505  return ImmutableMap.copyOf(metadataMap);
506  }
507 
508  try {
509  metadataMap = new HashMap<>();
510  InputStream stream = new ReadContentInputStream(content);
511  ContentHandler doNothingContentHandler = new DefaultHandler();
512  Metadata mtdt = new Metadata();
513  parser.parse(stream, doNothingContentHandler, mtdt);
514  for (String mtdtKey : mtdt.names()) {
515  metadataMap.put(mtdtKey, mtdt.get(mtdtKey));
516  }
517  } catch (IOException | SAXException | TikaException ex) {
518  AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error getting metadata for file [id=%d] %s, see Tika log for details...", //NON-NLS
519  content.getId(), content.getName()));
520  TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to get metadata for " //NON-NLS
521  + "content" + content.getId() + ": " + content.getName(), ex); //NON-NLS
522  }
523 
524  return metadataMap;
525  }
526 
532  @Override
533  public boolean isSupported() {
534  if (!(content instanceof AbstractFile)) {
535  return false;
536  }
537 
538  String detectedType = ((AbstractFile) content).getMIMEType();
539  if (detectedType == null
540  || BINARY_MIME_TYPES.contains(detectedType) //any binary unstructured blobs (string extraction will be used)
541  || ARCHIVE_MIME_TYPES.contains(detectedType)
542  || (detectedType.startsWith("video/") && !detectedType.equals("video/x-flv")) //skip video other than flv (tika supports flv only) //NON-NLS
543  || detectedType.equals(SQLITE_MIMETYPE) //Skip sqlite files, Tika cannot handle virtual tables and will fail with an exception. //NON-NLS
544  ) {
545  return false;
546  }
547 
548  return TIKA_SUPPORTED_TYPES.contains(detectedType);
549  }
550 
556  private static String formatLanguagePacks(List<String> languagePacks) {
557  return String.join("+", languagePacks);
558  }
559 
567  private static int getTimeout(long size) {
568  if (size < 1024 * 1024L) //1MB
569  {
570  return 60;
571  } else if (size < 10 * 1024 * 1024L) //10MB
572  {
573  return 1200;
574  } else if (size < 100 * 1024 * 1024L) //100MB
575  {
576  return 3600;
577  } else {
578  return 3 * 3600;
579  }
580 
581  }
582 
592  @Override
593  public void setExtractionSettings(Lookup context) {
594  if (context != null) {
595  List<ProcessTerminator> terminators = new ArrayList<>();
596  ImageConfig configInstance = context.lookup(ImageConfig.class);
597  if (configInstance != null) {
598  this.tesseractOCREnabled = configInstance.getOCREnabled();
599 
600  if (Objects.nonNull(configInstance.getOCRLanguages())) {
601  this.languagePacks = formatLanguagePacks(configInstance.getOCRLanguages());
602  }
603 
604  terminators.add(configInstance.getOCRTimeoutTerminator());
605  }
606 
607  ProcessTerminator terminatorInstance = context.lookup(ProcessTerminator.class);
608  if (terminatorInstance != null) {
609  terminators.add(terminatorInstance);
610  }
611 
612  if (!terminators.isEmpty()) {
613  this.processTerminator = new HybridTerminator(terminators);
614  }
615  }
616  }
617 
622  private static class ReaderCharSource extends CharSource {
623 
624  private final Reader reader;
625 
626  ReaderCharSource(Reader reader) {
627  this.reader = reader;
628  }
629 
630  @Override
631  public Reader openStream() throws IOException {
632  return reader;
633  }
634  }
635 }

Copyright © 2012-2024 Sleuth Kit Labs. Generated on: Mon Feb 17 2025
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.