19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
 
   21 import java.io.FileOutputStream;
 
   22 import java.io.IOException;
 
   23 import java.io.InputStream;
 
   24 import java.nio.file.Files;
 
   25 import java.nio.file.Path;
 
   26 import java.util.HashMap;
 
   28 import java.util.logging.Level;
 
   29 import org.apache.commons.io.FilenameUtils;
 
   30 import org.apache.commons.io.IOUtils;
 
   31 import org.apache.tika.exception.TikaException;
 
   32 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 
   33 import org.apache.tika.metadata.Metadata;
 
   34 import org.apache.tika.parser.AutoDetectParser;
 
   35 import org.apache.tika.parser.ParseContext;
 
   36 import org.apache.tika.parser.Parser;
 
   37 import org.apache.tika.sax.BodyContentHandler;
 
   38 import org.xml.sax.ContentHandler;
 
   39 import org.xml.sax.SAXException;
 
   50 final class PDFAttachmentExtractor {
 
   52     private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
 
   53     private final AutoDetectParser parser;
 
   /**
    * Creates an extractor whose parser is a newly constructed
    * {@link AutoDetectParser}.
    */
   public PDFAttachmentExtractor() {
       this.parser = new AutoDetectParser();
   }
 
   // Constructor that accepts a caller-supplied AutoDetectParser.
   // NOTE(review): the body is not visible in this listing; presumably it
   // stores the argument in the parser field - confirm against the full file.
   59     public PDFAttachmentExtractor(AutoDetectParser parser) {
 
   75     public Map<String, Path> extract(InputStream input, 
long parentID, Path outputDir) 
throws IOException, SAXException, TikaException {
 
   76         ExtractionPreconditions.checkArgument(Files.exists(outputDir), 
 
   77                 String.format(
"Output directory: %s, does not exist.", outputDir.toString())); 
 
   79         ParseContext parseContext = 
new ParseContext();
 
   80         parseContext.set(Parser.class, parser);
 
   83         NewResourceWatcher watcher = 
new NewResourceWatcher();
 
   84         parseContext.set(EmbeddedDocumentExtractor.class, 
new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
 
   87         parser.parse(input, 
new BodyContentHandler(-1), 
new Metadata(), parseContext);
 
   89         return watcher.getSnapshot();
 
   /**
    * EmbeddedDocumentExtractor implementation that copies each embedded stream
    * it is handed into a file resolved against the output directory (written
    * through an EncodedFileOutputStream with TskData.EncodingType.XOR1) and
    * then reports the name/path pair to a NewResourceWatcher.
    */
   96     static class EmbeddedAttachmentHandler 
implements EmbeddedDocumentExtractor {
 
              // Directory that generated file names are resolved against.
   98         private final Path outputDirectory;
 
              // Told about each (name, output path) pair after a completed copy.
   99         private final NewResourceWatcher watcher;
 
              // First component of every generated file name.
  100         private final Long parentID;
 
              // Post-incremented once per parseEmbedded call to number the files.
              // NOTE(review): boxed Integer with no initializer visible in this
              // listing; if it is never assigned, the ++ in parseEmbedded would
              // throw NullPointerException - confirm it is set in the constructor.
  101         private Integer attachmentCount;
 
              // Stores the output directory, watcher and parent id.
              // NOTE(review): the listing skips lines after the parentID
              // assignment, so any further initialisation is not visible here.
  103         public EmbeddedAttachmentHandler(Path outputDirectory, 
long parentID, NewResourceWatcher watcher) {
 
  104             this.outputDirectory = outputDirectory;
 
  105             this.watcher = watcher;
 
  106             this.parentID = parentID;
 
              // NOTE(review): body not visible in this listing.
  111         public boolean shouldParseEmbedded(Metadata mtdt) {
 
              // Writes one embedded stream to a generated file name under the
              // output directory and reports it to the watcher. The ch and bln
              // parameters are not referenced in the visible lines. An
              // IOException raised while writing is caught; the visible handler
              // only logs it at WARNING level.
  117         public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, 
boolean bln) 
throws SAXException, IOException {
 
                  // Generated on-disk name: parent id, the literal "_attch_",
                  // then the running count (which is post-incremented here).
  119             String uniqueExtractedName = parentID + 
"_attch_" + attachmentCount++; 
 
                  // Name the embedded document reports in its metadata.
  121             String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
 
                  // Extension taken from that reported name.
  122             String ext = FilenameUtils.getExtension(name);
 
                      // NOTE(review): the condition opening this branch is not
                      // visible in the listing; inside it the reported name is
                      // replaced by the generated one.
  126                 name = uniqueExtractedName;
 
  127             } 
else if(!ext.isEmpty()) {
 
                      // Carry a non-empty extension over to the generated name.
  128                 uniqueExtractedName += 
"." + ext;
 
  131             Path outputFile = outputDirectory.resolve(uniqueExtractedName);
 
                  // try-with-resources closes the output stream on every path.
  133             try (EncodedFileOutputStream outputStream = 
new EncodedFileOutputStream(
 
  134                     new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
 
  135                 IOUtils.copy(in, outputStream);
 
                      // Only reached when the copy did not throw.
                      // NOTE(review): the watcher stores entries in a map keyed
                      // by this name, so attachments that share a resource name
                      // replace one another there - confirm that is intended.
  136                 watcher.notify(name, outputFile);
 
  137             } 
catch (IOException ex) {
 
                      // NOTE(review): the second %s is labelled "directory" but
                      // receives the full output file path.
  138                 logger.log(Level.WARNING, String.format(
"Could not extract attachment %s into directory %s", 
 
  139                         uniqueExtractedName, outputFile), ex);
 
  151     static class NewResourceWatcher {
 
  153         private final Map<String, Path> newResourcePaths;
 
  155         public NewResourceWatcher() {
 
  156             newResourcePaths = 
new HashMap<>();
 
  159         public void notify(String name, Path newResource) {
 
  160             newResourcePaths.put(name, newResource);
 
  163         public Map<String, Path> getSnapshot() {
 
  164             return newResourcePaths;
 
  172     static class ExtractionPreconditions {
 
  174         public static void checkArgument(
boolean expression, String msg) 
throws IOException {
 
  176                 throw new IOException(msg);
 
  180         private ExtractionPreconditions(){