19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import com.google.common.io.CharSource;
 
   22 import java.io.IOException;
 
   23 import java.io.PushbackReader;
 
   24 import java.io.Reader;
 
   25 import java.util.List;
 
   26 import java.util.concurrent.ExecutorService;
 
   27 import java.util.concurrent.Executors;
 
   28 import java.util.concurrent.Future;
 
   29 import java.util.concurrent.TimeUnit;
 
   30 import java.util.concurrent.TimeoutException;
 
   31 import java.util.logging.Level;
 
   32 import java.util.stream.Collectors;
 
   33 import java.util.stream.Stream;
 
   34 import org.apache.tika.Tika;
 
   35 import org.apache.tika.metadata.Metadata;
 
   36 import org.apache.tika.parser.ParseContext;
 
   37 import org.openide.util.NbBundle;
 
   46 class TikaTextExtractor 
extends FileTextExtractor {
 
   // Logger for this class; logWarning writes to it in addition to the Tika logger.
   48     static final private Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
 
   // Single-thread executor: getReader submits each Tika parse to it so the
   // result can be awaited with a timeout.
   49     private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
 
   // MIME types, as "type/subtype" strings, taken from the supported types of the
   // parser of a default-constructed Tika; isSupported checks membership in this list.
   // NOTE(review): as shown, .map(...) is invoked directly on the result of
   // getSupportedTypes(...). The listing's embedded numbering skips from 52 to 54,
   // so a call (presumably .stream()) may be missing here - confirm against the original file.
   51     private static final List<String> TIKA_SUPPORTED_TYPES
 
   52             = 
new Tika().getParser().getSupportedTypes(
new ParseContext())
 
   54                     .map(mt -> mt.getType() + 
"/" + mt.getSubtype())
 
   55                     .collect(Collectors.toList());
 
   /**
    * Logs a warning and its associated exception at WARNING level to both the
    * logger returned by KeywordSearch.getTikaLogger() and this class's own logger.
    *
    * @param msg the warning message to log
    * @param ex  the exception logged together with the message
    */
   58     public void logWarning(
final String msg, Exception ex) {
 
   59         KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
 
   60         logger.log(Level.WARNING, msg, ex); 
 
   /**
    * Parses the content of the given file with Tika and returns a Reader over
    * the parsed text followed by a textual listing of the file's metadata.
    *
    * The parse is submitted to tikaParseExecutor and awaited for at most
    * getTimeout(sourceFile.getSize()) seconds.
    *
    * @param sourceFile the file whose text is to be extracted
    * @return a Reader over the parsed content concatenated with the metadata text
    * @throws TextExtractorException if waiting for the parse times out, on the
    *         "empty reader" failure raised below, or if any other exception
    *         occurs while obtaining or reading the parse result
    */
   64     public Reader getReader(AbstractFile sourceFile) 
throws TextExtractorException {
 
              // Stream over the file's content; handed to Tika in the task below.
   65         ReadContentInputStream stream = 
new ReadContentInputStream(sourceFile);
 
              // Populated by Tika during the parse and rendered into the returned text.
   67         Metadata metadata = 
new Metadata();
 
              // Run the parse as a task on the executor so it can be awaited with a timeout.
   69         final Future<Reader> future = tikaParseExecutor.submit(() -> 
new Tika().parse(stream, metadata));
 
                  // NOTE(review): the "try {" that opens the guarded region for the catch
                  // clauses below is not visible in this listing (embedded numbering skips 70).
                  // Wait for the parse result, for at most a number of seconds derived from the file size.
   71             final Reader tikaReader = future.get(getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
 
                  // Wrap in a PushbackReader so one character can be read as a probe
                  // and then pushed back (see unread below) without losing it.
   74             PushbackReader pushbackReader = 
new PushbackReader(tikaReader);
 
   75             int read = pushbackReader.read();
 
                      // NOTE(review): the condition guarding this throw is not visible here;
                      // judging by the message it is presumably an end-of-stream check on
                      // 'read' - confirm against the original file.
   77                 throw new TextExtractorException(
"Unable to extract text: Tika returned empty reader for " + sourceFile);
 
                  // Restore the probed character so the returned reader starts at the beginning.
   79             pushbackReader.unread(read);
 
                  // Concatenate the parsed content and the metadata text into a single reader.
   82             CharSource metaDataCharSource = getMetaDataCharSource(metadata);
 
   83             return CharSource.concat(
new ReaderCharSource(pushbackReader), metaDataCharSource).openStream();
 
   84         } 
catch (TimeoutException te) {
 
                  // The parse did not finish in time: build the message from the NbBundle
                  // key (with file id and name as arguments) and rethrow, keeping the cause.
   85             final String msg = NbBundle.getMessage(this.getClass(), 
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text", sourceFile.getId(), sourceFile.getName());
 
   87             throw new TextExtractorException(msg, te);
 
   88         } 
catch (TextExtractorException ex) {
 
                  // NOTE(review): no statement is visible in this handler; as shown it would
                  // swallow the exception thrown above. Presumably it rethrows (embedded
                  // numbering skips 89) - confirm against the original file.
   90         } 
catch (Exception ex) {
 
                  // Any other failure: log to the Tika logger, then wrap in a TextExtractorException.
                  // NOTE(review): the log message appends the file id directly after "content"
                  // with no separator, and the logged throwable is ex.getCause() (which can be
                  // null when an exception has no cause) rather than ex itself.
   91             KeywordSearch.getTikaLogger().log(Level.WARNING, 
"Exception: Unable to Tika parse the content" + sourceFile.getId() + 
": " + sourceFile.getName(), ex.getCause()); 
 
   92             final String msg = NbBundle.getMessage(this.getClass(), 
"AbstractFileTikaTextExtract.index.exception.tikaParse.msg", sourceFile.getId(), sourceFile.getName());
 
   94             throw new TextExtractorException(msg, ex);
 
  /**
   * Builds a CharSource holding a textual rendering of the given metadata: a
   * "METADATA" banner line followed by one "name: value" entry per metadata
   * name, with the names in sorted order and the entries joined by newlines.
   *
   * @param metadata the Tika metadata to render
   * @return a CharSource wrapping the rendered text
   */
  108     static private CharSource getMetaDataCharSource(Metadata metadata) {
 
  109         return CharSource.wrap(
 
                  // Start with the banner that separates the metadata from whatever text precedes it.
  110                 new StringBuilder(
"\n\n------------------------------METADATA------------------------------\n\n")
 
                          // Sort the names so the output order is deterministic.
  111                         .append(Stream.of(metadata.names()).sorted()
 
  112                                 .map(key -> key + 
": " + metadata.get(key))
 
  113                                 .collect(Collectors.joining(
"\n"))
 
  // NOTE(review): only the signature of this method is visible in this listing;
  // the value it returns cannot be determined from here - confirm against the original file.
  118     public boolean isContentTypeSpecific() {
 
  /**
   * Decides whether this extractor handles content of the given detected MIME type.
   *
   * @param file           the file in question (not referenced in the visible lines)
   * @param detectedFormat the detected MIME type; null is explicitly checked for below
   * @return for formats that get past the exclusion check, whether
   *         TIKA_SUPPORTED_TYPES contains detectedFormat
   */
  123     public boolean isSupported(AbstractFile file, String detectedFormat) {
 
              // Exclusion check: a null format, the blob and archive MIME types declared on
              // FileTextExtractor, and every "video/" type except "video/x-flv".
  124         if (detectedFormat == null
 
  125                 || FileTextExtractor.BLOB_MIME_TYPES.contains(detectedFormat) 
 
  126                 || FileTextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)
 
  127                 || (detectedFormat.startsWith(
"video/") && !detectedFormat.equals(
"video/x-flv")) 
 
              // NOTE(review): the rest of this condition and the branch body are not visible in
              // this listing (embedded numbering skips 128-130); presumably the excluded formats
              // are reported as unsupported - confirm against the original file.
  131         return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
 
  // NOTE(review): only the signature of this method is visible in this listing;
  // the value it returns cannot be determined from here - confirm against the original file.
  135     public boolean isDisabled() {
 
  146     private static int getTimeout(
long size) {
 
  147         if (size < 1024 * 1024L) 
 
  150         } 
else if (size < 10 * 1024 * 1024L) 
 
  153         } 
else if (size < 100 * 1024 * 1024L)