19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
21 import java.io.FileOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.file.Files;
25 import java.nio.file.Path;
26 import java.util.HashMap;
28 import java.util.logging.Level;
29 import org.apache.commons.io.FilenameUtils;
30 import org.apache.commons.io.IOUtils;
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
33 import org.apache.tika.metadata.Metadata;
34 import org.apache.tika.parser.AutoDetectParser;
35 import org.apache.tika.parser.ParseContext;
36 import org.apache.tika.parser.Parser;
37 import org.apache.tika.sax.BodyContentHandler;
38 import org.xml.sax.ContentHandler;
39 import org.xml.sax.SAXException;
50 final class PDFAttachmentExtractor {
52 private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
53 private final AutoDetectParser parser;
55 public PDFAttachmentExtractor() {
56 parser =
new AutoDetectParser();
59 public PDFAttachmentExtractor(AutoDetectParser parser) {
75 public Map<String, Path> extract(InputStream input,
long parentID, Path outputDir)
throws IOException, SAXException, TikaException {
76 ExtractionPreconditions.checkArgument(Files.exists(outputDir),
77 String.format(
"Output directory: %s, does not exist.", outputDir.toString()));
79 ParseContext parseContext =
new ParseContext();
80 parseContext.set(Parser.class, parser);
83 NewResourceWatcher watcher =
new NewResourceWatcher();
84 parseContext.set(EmbeddedDocumentExtractor.class,
new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
87 parser.parse(input,
new BodyContentHandler(-1),
new Metadata(), parseContext);
89 return watcher.getSnapshot();
96 static class EmbeddedAttachmentHandler
implements EmbeddedDocumentExtractor {
98 private final Path outputDirectory;
99 private final NewResourceWatcher watcher;
100 private final Long parentID;
101 private Integer attachmentCount;
103 public EmbeddedAttachmentHandler(Path outputDirectory,
long parentID, NewResourceWatcher watcher) {
104 this.outputDirectory = outputDirectory;
105 this.watcher = watcher;
106 this.parentID = parentID;
111 public boolean shouldParseEmbedded(Metadata mtdt) {
117 public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt,
boolean bln)
throws SAXException, IOException {
119 String uniqueExtractedName = parentID +
"_attch_" + attachmentCount++;
121 String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
122 String ext = FilenameUtils.getExtension(name);
126 name = uniqueExtractedName;
127 }
else if(!ext.isEmpty()) {
128 uniqueExtractedName +=
"." + ext;
131 Path outputFile = outputDirectory.resolve(uniqueExtractedName);
133 try (EncodedFileOutputStream outputStream =
new EncodedFileOutputStream(
134 new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
135 IOUtils.copy(in, outputStream);
136 watcher.notify(name, outputFile);
137 }
catch (IOException ex) {
138 logger.log(Level.WARNING, String.format(
"Could not extract attachment %s into directory %s",
139 uniqueExtractedName, outputFile), ex);
151 static class NewResourceWatcher {
153 private final Map<String, Path> newResourcePaths;
155 public NewResourceWatcher() {
156 newResourcePaths =
new HashMap<>();
159 public void notify(String name, Path newResource) {
160 newResourcePaths.put(name, newResource);
163 public Map<String, Path> getSnapshot() {
164 return newResourcePaths;
172 static class ExtractionPreconditions {
174 public static void checkArgument(
boolean expression, String msg)
throws IOException {
176 throw new IOException(msg);
180 private ExtractionPreconditions(){