api-docs/4.19.1/_tika_text_extractor_8java_source.html

 /*

  * Autopsy Forensic Browser

  *

  * Copyright 2018-2021 Basis Technology Corp.

  * Contact: carrier <at> sleuthkit <dot> org

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 package org.sleuthkit.autopsy.textextractors;


 import com.google.common.io.CharSource;

 import com.google.common.util.concurrent.ThreadFactoryBuilder;

 import java.io.File;

 import java.io.FileInputStream;

 import java.io.FileNotFoundException;

 import java.io.IOException;

 import java.io.InputStream;

 import java.io.PushbackReader;

 import java.io.Reader;

 import java.nio.file.Paths;

 import java.util.HashMap;

 import java.util.List;

 import java.util.Objects;

 import java.util.Map;

 import java.util.concurrent.Callable;

 import java.util.concurrent.ExecutorService;

 import java.util.concurrent.Executors;

 import java.util.concurrent.Future;

 import java.util.concurrent.ThreadFactory;

 import java.util.concurrent.TimeUnit;

 import java.util.concurrent.TimeoutException;

 import java.util.logging.Level;

 import java.util.stream.Collectors;

 import org.apache.tika.Tika;

 import org.apache.tika.exception.TikaException;

 import org.apache.tika.metadata.Metadata;

 import org.apache.tika.parser.AutoDetectParser;

 import org.apache.tika.parser.ParseContext;

 import org.apache.tika.parser.Parser;

 import org.apache.tika.parser.ParsingReader;

 import org.apache.tika.parser.microsoft.OfficeParserConfig;

 import org.apache.tika.parser.ocr.TesseractOCRConfig;

 import org.apache.tika.parser.pdf.PDFParserConfig;

 import org.openide.util.NbBundle;

 import org.openide.modules.InstalledFileLocator;

 import org.openide.util.Lookup;

 import org.sleuthkit.autopsy.casemodule.Case;

 import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;

 import org.sleuthkit.autopsy.coreutils.ExecUtil;

 import org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator;

 import org.sleuthkit.autopsy.coreutils.FileUtil;

 import org.sleuthkit.autopsy.coreutils.Logger;

 import org.sleuthkit.autopsy.coreutils.PlatformUtil;

 import org.sleuthkit.autopsy.textextractors.configs.ImageConfig;

 import org.sleuthkit.autopsy.datamodel.ContentUtils;

 import org.sleuthkit.datamodel.AbstractFile;

 import org.sleuthkit.datamodel.Content;

 import org.sleuthkit.datamodel.ReadContentInputStream;

 import org.xml.sax.ContentHandler;

 import org.xml.sax.SAXException;

 import org.xml.sax.helpers.DefaultHandler;

 import com.google.common.collect.ImmutableMap;

 import com.google.common.collect.ImmutableSet;

 import java.io.InputStreamReader;

 import java.nio.charset.Charset;

 import java.util.ArrayList;

 import java.util.Set;

 import org.apache.tika.mime.MimeTypes;

 import org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY;

 import org.sleuthkit.autopsy.coreutils.ExecUtil.HybridTerminator;

 import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;


 final class TikaTextExtractor implements TextExtractor {


     //Mimetype groups to assist extractor implementations in ignoring binary and

     //archive files.

     private static final Set<String> BINARY_MIME_TYPES

             = ImmutableSet.of(

                     //ignore binary blob data, for which string extraction will be used

                     "application/octet-stream", //NON-NLS

                     "application/x-msdownload"); //NON-NLS


     private static final Set<String> ARCHIVE_MIME_TYPES

             = ImmutableSet.of(

                     //ignore unstructured binary and compressed data, for which string extraction or unzipper works better

                     "application/x-7z-compressed", //NON-NLS

                     "application/x-ace-compressed", //NON-NLS

                     "application/x-alz-compressed", //NON-NLS

                     "application/x-arj", //NON-NLS

                     "application/vnd.ms-cab-compressed", //NON-NLS

                     "application/x-cfs-compressed", //NON-NLS

                     "application/x-dgc-compressed", //NON-NLS

                     "application/x-apple-diskimage", //NON-NLS

                     "application/x-gca-compressed", //NON-NLS

                     "application/x-dar", //NON-NLS

                     "application/x-lzx", //NON-NLS

                     "application/x-lzh", //NON-NLS

                     "application/x-rar-compressed", //NON-NLS

                     "application/x-stuffit", //NON-NLS

                     "application/x-stuffitx", //NON-NLS

                     "application/x-gtar", //NON-NLS

                     "application/x-archive", //NON-NLS

                     "application/x-executable", //NON-NLS

                     "application/x-gzip", //NON-NLS

                     "application/zip", //NON-NLS

                     "application/x-zoo", //NON-NLS

                     "application/x-cpio", //NON-NLS

                     "application/x-shar", //NON-NLS

                     "application/x-tar", //NON-NLS

                     "application/x-bzip", //NON-NLS

                     "application/x-bzip2", //NON-NLS

                     "application/x-lzip", //NON-NLS

                     "application/x-lzma", //NON-NLS

                     "application/x-lzop", //NON-NLS

                     "application/x-z", //NON-NLS

                     "application/x-compress"); //NON-NLS


     // Logs to the Tika log file, which is why it uses java.util.logging.Logger instead of the Autopsy logger

     private static final java.util.logging.Logger TIKA_LOGGER = java.util.logging.Logger.getLogger("Tika"); //NON-NLS

     private static final Logger AUTOPSY_LOGGER = Logger.getLogger(TikaTextExtractor.class.getName());


     // Thread factory that names its threads "tika-reader-N".
     private final ThreadFactory tikaThreadFactory

             = new ThreadFactoryBuilder().setNameFormat("tika-reader-%d").build();

     // Single-threaded executor that getReader() submits the Tika reader
     // creation task to, so the wait can be bounded by a timeout.
     // NOTE(review): this executor is created per extractor instance and no
     // shutdown call is visible in this class - confirm its worker thread is
     // not leaked when many extractors are created.
     private final ExecutorService executorService = Executors.newSingleThreadExecutor(tikaThreadFactory);

     private static final String SQLITE_MIMETYPE = "application/x-sqlite3";


     private final AutoDetectParser parser = new AutoDetectParser();

     private final FileTypeDetector fileTypeDetector;

     private final Content content;


     private boolean tesseractOCREnabled;

     private static final String TESSERACT_DIR_NAME = "Tesseract-OCR"; //NON-NLS

     private static final String TESSERACT_EXECUTABLE = "tesseract.exe"; //NON-NLS

     private static final File TESSERACT_PATH = locateTesseractExecutable();

     private String languagePacks = formatLanguagePacks(PlatformUtil.getOcrLanguagePacks());

     private static final String TESSERACT_OUTPUT_FILE_NAME = "tess_output"; //NON-NLS


     // documents where OCR is performed

     private static final ImmutableSet<String> OCR_DOCUMENTS = ImmutableSet.of(

             "application/pdf",

             "application/msword",

             "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

             "application/vnd.ms-powerpoint",

             "application/vnd.openxmlformats-officedocument.presentationml.presentation",

             "application/vnd.ms-excel",

             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

     );


     private static final String IMAGE_MIME_TYPE_PREFIX = "image/";


     private Map<String, String> metadataMap;


     private ProcessTerminator processTerminator;


     private static final List<String> TIKA_SUPPORTED_TYPES

             = new Tika().getParser().getSupportedTypes(new ParseContext())

                     .stream()

                     .map(mt -> mt.getType() + "/" + mt.getSubtype())

                     .collect(Collectors.toList());


     /**
      * Creates an extractor for the given content.
      *
      * A file type detector is instantiated up front. If that fails, the
      * failure is logged to the Tika log and the detector field is left null.
      *
      * @param content The content text will be extracted from.
      */
     TikaTextExtractor(Content content) {
         FileTypeDetector typeDetector;
         try {
             typeDetector = new FileTypeDetector();
         } catch (FileTypeDetector.FileTypeDetectorInitException ex) {
             TIKA_LOGGER.log(Level.SEVERE, "Unable to instantiate a file type detector", ex);
             typeDetector = null;
         }

         this.fileTypeDetector = typeDetector;
         this.content = content;
     }


     /**
      * Determines the MIME type of the given file, normalized to trimmed lower
      * case.
      *
      * The file type detector is used when one is available; otherwise the
      * MIME type already stored on the file is used, falling back to
      * application/octet-stream when the file has none.
      *
      * @param file The file to get the MIME type of.
      *
      * @return The trimmed, lower-cased MIME type.
      */
     private String getMimeType(AbstractFile file) {
         String mimeType = MimeTypes.OCTET_STREAM;
         if (fileTypeDetector != null) {
             mimeType = fileTypeDetector.getMIMEType(file);
         } else if (file.getMIMEType() != null) {
             mimeType = file.getMIMEType();
         }

         // MIME types are compared against ASCII constants, so the case
         // mapping must not depend on the default locale (e.g. under a Turkish
         // locale "I" would lower-case to a dotless "ı").
         return mimeType.trim().toLowerCase(java.util.Locale.ROOT);
     }


     /**
      * Reports whether OCR would be applied to the content.
      *
      * @return True if OCR is supported and the content is a file that is
      *         either an image or one of the document types OCR is run on.
      */
     @Override
     public boolean willUseOCR() {
         if (isOcrSupported() && content instanceof AbstractFile) {
             final String detectedType = getMimeType((AbstractFile) content);
             // OCR applies to images and to documents that can embed content.
             if (detectedType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
                 return true;
             }
             return OCR_DOCUMENTS.contains(detectedType);
         }

         return false;
     }


     /**
      * Checks whether OCR can be performed for the content.
      *
      * @return True if the Tesseract executable was located, OCR is enabled in
      *         the extraction settings, the platform is 64 bit Windows and the
      *         content itself is supported by this extractor.
      */
     private boolean isOcrSupported() {
         // Tesseract must be installed and switched on through configuration.
         if (TESSERACT_PATH == null || !tesseractOCREnabled) {
             return false;
         }

         // OCR can only currently be run on 64 bit Windows OS.
         if (!PlatformUtil.isWindowsOS() || !PlatformUtil.is64BitOS()) {
             return false;
         }

         return isSupported();
     }


     /**
      * Creates a reader over the text extracted from the content.
      *
      * When OCR is available and the content is an image, Tesseract is run
      * directly and its output is returned. Otherwise the content is handed to
      * Tika on the executor thread, with a timeout derived from the content
      * size, and the metadata Tika reports is cached if none has been stored
      * yet.
      *
      * @return A reader over the extracted text.
      *
      * @throws InitReaderException If the content is not supported, Tika
      *                             yields no text, the timeout elapses, or
      *                             creating the reader fails for any other
      *                             reason (including interruption).
      */
     @Override
     public Reader getReader() throws InitReaderException {
         if (!this.isSupported()) {
             throw new InitReaderException("Content is not supported");
         }

         // Only abstract files are supported, see isSupported()
         final AbstractFile file = ((AbstractFile) content);

         String mimeType = getMimeType(file);

         // Handle images separately so the OCR task can be cancelled.
         // See JIRA-4519 for the need to have cancellation in the UI and ingest.
         if (isOcrSupported() && mimeType.startsWith(IMAGE_MIME_TYPE_PREFIX)) {
             InputStream imageOcrStream = performOCR(file);
             return new InputStreamReader(imageOcrStream, Charset.forName("UTF-8"));
         }

         // Set up Tika
         final InputStream stream = new ReadContentInputStream(content);

         final ParseContext parseContext = new ParseContext();
         // Documents can contain other documents. By adding
         // the parser back into the context, Tika will recursively
         // parse embedded documents.
         parseContext.set(Parser.class, parser);
         // Use the more memory efficient Tika SAX parsers for DOCX and
         // PPTX files (it already uses SAX for XLSX).
         OfficeParserConfig officeParserConfig = new OfficeParserConfig();
         officeParserConfig.setUseSAXPptxExtractor(true);
         officeParserConfig.setUseSAXDocxExtractor(true);
         parseContext.set(OfficeParserConfig.class, officeParserConfig);
         if (isOcrSupported()) {
             // Configure OCR for Tika if it chooses to run OCR
             // during extraction
             TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
             String tesseractFolder = TESSERACT_PATH.getParent();
             ocrConfig.setTesseractPath(tesseractFolder);
             ocrConfig.setLanguage(languagePacks);
             ocrConfig.setTessdataPath(PlatformUtil.getOcrLanguagePacksPath());
             parseContext.set(TesseractOCRConfig.class, ocrConfig);

             // Configure how Tika handles OCRing PDFs
             PDFParserConfig pdfConfig = new PDFParserConfig();

             // This strategy tries to pick between OCRing a page in the
             // PDF and doing text extraction. It makes this choice by
             // first running text extraction and then counting characters.
             // If there are too few characters or too many unmapped
             // unicode characters, it'll run the entire page through OCR
             // and take that output instead. See JIRA-6938
             pdfConfig.setOcrStrategy(OCR_STRATEGY.AUTO);
             parseContext.set(PDFParserConfig.class, pdfConfig);
         }

         Metadata metadata = new Metadata();
         //Make the creation of a TikaReader a cancellable future in case it takes too long
         Future<Reader> future = executorService.submit(
                 new GetTikaReader(parser, stream, metadata, parseContext));
         try {
             final Reader tikaReader = future.get(getTimeout(content.getSize()), TimeUnit.SECONDS);
             //check if the reader is empty
             PushbackReader pushbackReader = new PushbackReader(tikaReader);
             int read = pushbackReader.read();
             if (read == -1) {
                 throw new InitReaderException("Unable to extract text: "
                         + "Tika returned empty reader for " + content);
             }
             // Put the probed character back so the caller sees the full text.
             pushbackReader.unread(read);

             //Save the metadata if it has not been fetched already.
             if (metadataMap == null) {
                 metadataMap = new HashMap<>();
                 for (String mtdtKey : metadata.names()) {
                     metadataMap.put(mtdtKey, metadata.get(mtdtKey));
                 }
             }

             return new ReaderCharSource(pushbackReader).openStream();
         } catch (TimeoutException te) {
             final String msg = NbBundle.getMessage(this.getClass(),
                     "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
                     content.getId(), content.getName());
             throw new InitReaderException(msg, te);
         } catch (InitReaderException ex) {
             throw ex;
         } catch (Exception ex) {
             if (ex instanceof InterruptedException) {
                 // Restore the interrupt status so code further up the stack
                 // can still observe that this thread was interrupted.
                 Thread.currentThread().interrupt();
             }
             AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error with file [id=%d] %s, see Tika log for details...",
                     content.getId(), content.getName()));
             // An ExecutionException wraps the real failure as its cause, but
             // exceptions thrown directly (e.g. an IOException from probing
             // the reader) have none; fall back to the exception itself so
             // the Tika log never loses the stack trace.
             final Throwable loggedFailure = (ex.getCause() != null) ? ex.getCause() : ex;
             TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to Tika parse the "
                     + "content" + content.getId() + ": " + content.getName(),
                     loggedFailure); //NON-NLS
             final String msg = NbBundle.getMessage(this.getClass(),
                     "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
                     content.getId(), content.getName());
             throw new InitReaderException(msg, ex);
         } finally {
             // No-op once the task has completed; otherwise interrupts it.
             future.cancel(true);
         }
     }


     /**
      * Runs the Tesseract executable on the given file and returns a stream
      * over the text file it writes.
      *
      * The content is first written to a temporary input file in the current
      * case's temp directory; that input file is always deleted before this
      * method returns. The returned stream deletes the output file when it is
      * closed.
      *
      * @param file The file to run OCR on.
      *
      * @return A stream over the Tesseract output text file.
      *
      * @throws InitReaderException If there is no current case or an I/O
      *                             error occurs while running Tesseract or
      *                             opening its output.
      */
     private InputStream performOCR(AbstractFile file) throws InitReaderException {
         File ocrInput = null;
         File ocrOutput = null;
         try {
             final String caseTempDir = Case.getCurrentCaseThrows().getTempDirectory();

             // Prefixing the file id keeps the temp file name unique.
             final String inputName = FileUtil.escapeFileName(file.getId() + file.getName());
             ocrInput = Paths.get(caseTempDir, inputName).toFile();
             ContentUtils.writeToFile(content, ocrInput);

             final String outputName = FileUtil.escapeFileName(file.getId() + TESSERACT_OUTPUT_FILE_NAME);
             final String outputBasePath = Paths.get(caseTempDir, outputName).toString();
             final String tesseractExe = TESSERACT_PATH.toString();

             // Assemble the Tesseract command line.
             final ProcessBuilder tesseractCommand = new ProcessBuilder(
                     tesseractExe,
                     String.format("\"%s\"", ocrInput.getAbsolutePath()),
                     String.format("\"%s\"", outputBasePath),
                     "--tessdata-dir", PlatformUtil.getOcrLanguagePacksPath(),
                     // language pack command flag
                     "-l", languagePacks);

             // Use the terminator supplied through the extraction settings,
             // if there was one.
             if (processTerminator == null) {
                 ExecUtil.execute(tesseractCommand);
             } else {
                 ExecUtil.execute(tesseractCommand, 1, TimeUnit.SECONDS, processTerminator);
             }

             // The text is read from the output base path plus ".txt".
             ocrOutput = new File(outputBasePath + ".txt");
             return new CleanUpStream(ocrOutput);
         } catch (NoCurrentCaseException | IOException ex) {
             if (ocrOutput != null) {
                 ocrOutput.delete();
             }
             throw new InitReaderException("Could not successfully run Tesseract", ex);
         } finally {
             if (ocrInput != null) {
                 ocrInput.delete();
             }
         }
     }


     private class GetTikaReader implements Callable<Reader> {


         private final AutoDetectParser parser;

         private final InputStream stream;

         private final Metadata metadata;

         private final ParseContext parseContext;


         GetTikaReader(AutoDetectParser parser, InputStream stream,

                 Metadata metadata, ParseContext parseContext) {

             this.parser = parser;

             this.stream = stream;

             this.metadata = metadata;

             this.parseContext = parseContext;

         }


         @Override

         public Reader call() throws Exception {

             return new ParsingReader(parser, stream, metadata, parseContext);

         }

     }


     private class CleanUpStream extends FileInputStream {


         private File file;


         CleanUpStream(File file) throws FileNotFoundException {

             super(file);

             this.file = file;

         }


         @Override

         public void close() throws IOException {

             try {

                 super.close();

             } finally {

                 if (file != null) {

                     file.delete();

                     file = null;

                 }

             }

         }

     }


     /**
      * Looks up the Tesseract executable among the installed files.
      *
      * @return The executable file, or null if the platform is not Windows,
      *         the file could not be located, or it is not executable.
      */
     private static File locateTesseractExecutable() {
         if (PlatformUtil.isWindowsOS()) {
             final String relativeExePath = Paths.get(TESSERACT_DIR_NAME, TESSERACT_EXECUTABLE).toString();
             final String codeNameBase = TikaTextExtractor.class.getPackage().getName();
             final File candidate = InstalledFileLocator.getDefault().locate(relativeExePath, codeNameBase, false);
             if (candidate != null && candidate.canExecute()) {
                 return candidate;
             }
         }

         return null;
     }


     /**
      * Returns the metadata Tika reports for the content.
      *
      * Metadata already cached (by an earlier call to this method or to
      * getReader()) is reused. Otherwise the content is parsed with a handler
      * that discards the text, and the resulting metadata is cached. If
      * parsing fails the error is logged and an empty map is cached and
      * returned.
      *
      * @return An immutable copy of the metadata name/value pairs; empty if
      *         none could be obtained.
      */
     @Override
     public Map<String, String> getMetadata() {
         if (metadataMap != null) {
             return ImmutableMap.copyOf(metadataMap);
         }

         metadataMap = new HashMap<>();
         // The parser consumes the stream but leaves closing it to the caller,
         // so close it here once parsing is done or has failed.
         try (InputStream stream = new ReadContentInputStream(content)) {
             ContentHandler doNothingContentHandler = new DefaultHandler();
             Metadata mtdt = new Metadata();
             parser.parse(stream, doNothingContentHandler, mtdt);
             for (String mtdtKey : mtdt.names()) {
                 metadataMap.put(mtdtKey, mtdt.get(mtdtKey));
             }
         } catch (IOException | SAXException | TikaException ex) {
             AUTOPSY_LOGGER.log(Level.WARNING, String.format("Error getting metadata for file [id=%d] %s, see Tika log for details...", //NON-NLS
                     content.getId(), content.getName()));
             TIKA_LOGGER.log(Level.WARNING, "Exception: Unable to get metadata for " //NON-NLS
                     + "content" + content.getId() + ": " + content.getName(), ex); //NON-NLS
         }

         // Hand out a copy on this path too, matching the cached path above,
         // so callers never get a reference to the internal mutable map.
         return ImmutableMap.copyOf(metadataMap);
     }


     /**
      * Determines whether this extractor can handle the content.
      *
      * @return True if the content is a file whose stored MIME type is not one
      *         of the excluded binary, archive, video or SQLite types and is a
      *         type Tika reports as supported.
      */
     @Override
     public boolean isSupported() {
         if (content instanceof AbstractFile) {
             final String mimeType = ((AbstractFile) content).getMIMEType();
             if (mimeType == null) {
                 return false;
             }

             // Binary unstructured blobs are left to string extraction, and
             // archives to string extraction or the unzipper.
             if (BINARY_MIME_TYPES.contains(mimeType) || ARCHIVE_MIME_TYPES.contains(mimeType)) {
                 return false;
             }

             // Skip video other than flv (tika supports flv only).
             final boolean isNonFlvVideo = mimeType.startsWith("video/") && !mimeType.equals("video/x-flv"); //NON-NLS
             if (isNonFlvVideo) {
                 return false;
             }

             // Skip sqlite files, Tika cannot handle virtual tables and will
             // fail with an exception.
             if (mimeType.equals(SQLITE_MIMETYPE)) {
                 return false;
             }

             return TIKA_SUPPORTED_TYPES.contains(mimeType);
         }

         return false;
     }


     /**
      * Joins language pack names into a single string separated by "+".
      *
      * @param languagePacks The language pack names to combine.
      *
      * @return The names joined with "+"; an empty string for an empty list.
      */
     private static String formatLanguagePacks(List<String> languagePacks) {
         return languagePacks.stream().collect(Collectors.joining("+"));
     }


     /**
      * Picks the timeout, in seconds, for creating the Tika reader based on
      * the size of the content.
      *
      * @param size The content size in bytes.
      *
      * @return 60 for sizes under 1MB, 1200 under 10MB, 3600 under 100MB and
      *         10800 for anything larger.
      */
     private static int getTimeout(long size) {
         final long megabyte = 1024L * 1024L;

         final int timeoutSeconds;
         if (size >= 100 * megabyte) {
             timeoutSeconds = 3 * 3600;
         } else if (size >= 10 * megabyte) {
             timeoutSeconds = 3600;
         } else if (size >= megabyte) {
             timeoutSeconds = 1200;
         } else {
             timeoutSeconds = 60;
         }

         return timeoutSeconds;
     }


     /**
      * Applies extraction settings found in the given lookup.
      *
      * An ImageConfig, when present, sets whether OCR is enabled, optionally
      * replaces the OCR language packs, and contributes its OCR timeout
      * terminator. A ProcessTerminator, when present, is added as well. If any
      * terminators were collected they are combined into a single hybrid
      * terminator.
      *
      * @param context The lookup holding the settings; ignored when null.
      */
     @Override
     public void setExtractionSettings(Lookup context) {
         if (context == null) {
             return;
         }

         final List<ProcessTerminator> collectedTerminators = new ArrayList<>();

         final ImageConfig imageConfig = context.lookup(ImageConfig.class);
         if (imageConfig != null) {
             this.tesseractOCREnabled = imageConfig.getOCREnabled();

             // Keep the current language packs unless the config supplies some.
             if (imageConfig.getOCRLanguages() != null) {
                 this.languagePacks = formatLanguagePacks(imageConfig.getOCRLanguages());
             }

             collectedTerminators.add(imageConfig.getOCRTimeoutTerminator());
         }

         final ProcessTerminator externalTerminator = context.lookup(ProcessTerminator.class);
         if (externalTerminator != null) {
             collectedTerminators.add(externalTerminator);
         }

         if (collectedTerminators.isEmpty()) {
             return;
         }
         this.processTerminator = new HybridTerminator(collectedTerminators);
     }


     /**
      * CharSource view of an already-open reader. Every call to openStream()
      * hands back that same reader instance rather than opening a new one.
      */
     private static class ReaderCharSource extends CharSource {

         private final Reader wrappedReader;

         /**
          * @param reader The reader this source hands out.
          */
         ReaderCharSource(Reader reader) {
             wrappedReader = reader;
         }

         /**
          * @return The reader supplied at construction.
          *
          * @throws IOException Declared by the overridden method; not thrown
          *                     by this implementation.
          */
         @Override
         public Reader openStream() throws IOException {
             return this.wrappedReader;
         }
     }

 }

org.sleuthkit

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.GetTikaReader.call
Reader call()
Definition: TikaTextExtractor.java:428

org.sleuthkit.autopsy.casemodule.Case
Definition: Case.java:170

org.sleuthkit.autopsy.coreutils.FileUtil
Definition: FileUtil.java:31

org.sleuthkit.autopsy.datamodel.ContentUtils
Definition: ContentUtils.java:52

org

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.CleanUpStream.close
void close()
Definition: TikaTextExtractor.java:460

org.sleuthkit.autopsy.casemodule
Definition: AddImageAction.java:19

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.CleanUpStream
Definition: TikaTextExtractor.java:438

org.sleuthkit.autopsy.datamodel
Definition: AbstractAbstractFileNode.java:19

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.GetTikaReader.parser
final AutoDetectParser parser
Definition: TikaTextExtractor.java:414

org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector
Definition: FileTypeDetector.java:46

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.ReaderCharSource.reader
final Reader reader
Definition: TikaTextExtractor.java:622

org.sleuthkit.autopsy.coreutils
Definition: AppSQLiteDB.java:19

org.sleuthkit.autopsy.coreutils.Logger
Definition: Logger.java:36

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.GetTikaReader.metadata
final Metadata metadata
Definition: TikaTextExtractor.java:416

org.sleuthkit.autopsy.coreutils.PlatformUtil
Definition: PlatformUtil.java:52

org.sleuthkit.autopsy.coreutils.ExecUtil
Definition: ExecUtil.java:37

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.ReaderCharSource
Definition: TikaTextExtractor.java:620

org.sleuthkit.autopsy.coreutils.ExecUtil.ProcessTerminator
Definition: ExecUtil.java:54

org.sleuthkit.autopsy.coreutils.ExecUtil.HybridTerminator
Definition: ExecUtil.java:127

org.sleuthkit.autopsy.textextractors.configs.ImageConfig
Definition: ImageConfig.java:33

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.CleanUpStream.file
File file
Definition: TikaTextExtractor.java:440

org.sleuthkit.autopsy.modules

org.sleuthkit.autopsy.coreutils.PlatformUtil.isWindowsOS
static boolean isWindowsOS()
Definition: PlatformUtil.java:325

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.GetTikaReader
Definition: TikaTextExtractor.java:412

org.sleuthkit.autopsy

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.GetTikaReader.parseContext
final ParseContext parseContext
Definition: TikaTextExtractor.java:417

org.sleuthkit.autopsy.casemodule.NoCurrentCaseException
Definition: NoCurrentCaseException.java:26

org.sleuthkit.autopsy.textextractors
Definition: ArtifactTextExtractor.java:19

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.ReaderCharSource.openStream
Reader openStream()
Definition: TikaTextExtractor.java:629

org.sleuthkit.autopsy.textextractors.configs
Definition: ImageConfig.java:19

org.sleuthkit.autopsy.textextractors.TikaTextExtractor.GetTikaReader.stream
final InputStream stream
Definition: TikaTextExtractor.java:415

org.sleuthkit.autopsy.modules.filetypeid
Definition: AddFileTypeDialog.java:19