19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
 
   21 import java.io.FileOutputStream;
 
   22 import java.io.IOException;
 
   23 import java.io.InputStream;
 
   24 import java.nio.file.Files;
 
   25 import java.nio.file.Path;
 
   26 import java.util.HashMap;
 
   28 import java.util.logging.Level;
 
   29 import org.apache.commons.io.FilenameUtils;
 
   30 import org.apache.commons.io.IOUtils;
 
   31 import org.apache.tika.exception.TikaException;
 
   32 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 
   33 import org.apache.tika.metadata.Metadata;
 
   34 import org.apache.tika.parser.AutoDetectParser;
 
   35 import org.apache.tika.parser.ParseContext;
 
   36 import org.apache.tika.parser.Parser;
 
   37 import org.apache.tika.parser.pdf.PDFParserConfig;
 
   38 import org.apache.tika.sax.BodyContentHandler;
 
   39 import org.xml.sax.ContentHandler;
 
   40 import org.xml.sax.SAXException;
 
   51 final class PDFAttachmentExtractor {
 
   53     private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
 
   54     private final AutoDetectParser parser;
 
   56     public PDFAttachmentExtractor() {
 
   57         parser = 
new AutoDetectParser();
 
   60     public PDFAttachmentExtractor(AutoDetectParser parser) {
 
   76     public Map<String, NewResourceData> extract(InputStream input, 
long parentID, Path outputDir) 
throws IOException, SAXException, TikaException {
 
   77         ExtractionPreconditions.checkArgument(Files.exists(outputDir), 
 
   78                 String.format(
"Output directory: %s, does not exist.", outputDir.toString())); 
 
   80         ParseContext parseContext = 
new ParseContext();
 
   81         parseContext.set(Parser.class, parser);
 
   83         PDFParserConfig pdfConfig = 
new PDFParserConfig();
 
   84         pdfConfig.setExtractInlineImages(
true);
 
   85         pdfConfig.setExtractUniqueInlineImagesOnly(
true);
 
   87         parseContext.set(PDFParserConfig.class, pdfConfig);
 
   90         NewResourceWatcher watcher = 
new NewResourceWatcher();
 
   91         parseContext.set(EmbeddedDocumentExtractor.class, 
new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
 
   94         parser.parse(input, 
new BodyContentHandler(-1), 
new Metadata(), parseContext);
 
   96         return watcher.getSnapshot();
 
  103     static class EmbeddedAttachmentHandler 
implements EmbeddedDocumentExtractor {
 
  105         private final Path outputDirectory;
 
  106         private final NewResourceWatcher watcher;
 
  107         private final Long parentID;
 
  108         private Integer attachmentCount;
 
  110         public EmbeddedAttachmentHandler(Path outputDirectory, 
long parentID, NewResourceWatcher watcher) {
 
  111             this.outputDirectory = outputDirectory;
 
  112             this.watcher = watcher;
 
  113             this.parentID = parentID;
 
  118         public boolean shouldParseEmbedded(Metadata mtdt) {
 
  124         public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, 
boolean bln) 
throws SAXException, IOException {
 
  126             String uniqueExtractedName = 
"extract_" + attachmentCount++; 
 
  128             String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
 
  129             String ext = FilenameUtils.getExtension(name);
 
  133                 name = uniqueExtractedName;
 
  134             } 
else if(!ext.isEmpty()) {
 
  135                 uniqueExtractedName += 
"." + ext;
 
  138             Path outputFile = outputDirectory.resolve(uniqueExtractedName);
 
  140             try (EncodedFileOutputStream outputStream = 
new EncodedFileOutputStream(
 
  141                     new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
 
  142                 int bytesCopied = IOUtils.copy(in, outputStream);
 
  143                 watcher.notify(name, outputFile, bytesCopied);
 
  144             } 
catch (IOException ex) {
 
  145                 logger.log(Level.WARNING, String.format(
"Could not extract attachment %s into directory %s", 
 
  146                         uniqueExtractedName, outputFile), ex);
 
  156     static class NewResourceData {
 
  157         private final Path path;
 
  158         private final int length;
 
  160         NewResourceData(Path path, 
int length) {
 
  162             this.length = length;
 
  181     static class NewResourceWatcher {
 
  183         private final Map<String, NewResourceData> newResourcePaths;
 
  185         public NewResourceWatcher() {
 
  186             newResourcePaths = 
new HashMap<>();
 
  189         public void notify(String name, Path localPath, 
int length) {
 
  190             newResourcePaths.put(name, 
new NewResourceData(localPath, length));
 
  193         public Map<String, NewResourceData> getSnapshot() {
 
  194             return newResourcePaths;
 
  202     static class ExtractionPreconditions {
 
  204         public static void checkArgument(
boolean expression, String msg) 
throws IOException {
 
  206                 throw new IOException(msg);
 
  210         private ExtractionPreconditions(){