19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
21 import java.io.FileOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.file.Files;
25 import java.nio.file.Path;
26 import java.util.HashMap;
28 import java.util.logging.Level;
29 import org.apache.commons.io.FilenameUtils;
30 import org.apache.commons.io.IOUtils;
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
33 import org.apache.tika.metadata.Metadata;
34 import org.apache.tika.parser.AutoDetectParser;
35 import org.apache.tika.parser.ParseContext;
36 import org.apache.tika.parser.Parser;
37 import org.apache.tika.parser.pdf.PDFParserConfig;
38 import org.apache.tika.sax.BodyContentHandler;
39 import org.xml.sax.ContentHandler;
40 import org.xml.sax.SAXException;
51 final class PDFAttachmentExtractor {
/** Logger named after this class; used to report attachments that could not be written out. */
private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
/** Parser that processes the input stream and is also registered in the parse context under {@code Parser.class}. */
private final AutoDetectParser parser;
/**
 * Creates an extractor backed by a freshly constructed {@link AutoDetectParser}.
 */
public PDFAttachmentExtractor() {
    this.parser = new AutoDetectParser();
}
/**
 * Creates an extractor that parses with the supplied parser instead of a
 * default-constructed one.
 *
 * @param parser parser used for the top-level parse and registered in the
 *               parse context for embedded documents
 */
public PDFAttachmentExtractor(AutoDetectParser parser) {
    // The field is final, so this constructor has to assign it.
    this.parser = parser;
}
76 public Map<String, NewResourceData> extract(InputStream input,
long parentID, Path outputDir)
throws IOException, SAXException, TikaException {
77 ExtractionPreconditions.checkArgument(Files.exists(outputDir),
78 String.format(
"Output directory: %s, does not exist.", outputDir.toString()));
80 ParseContext parseContext =
new ParseContext();
81 parseContext.set(Parser.class, parser);
83 PDFParserConfig pdfConfig =
new PDFParserConfig();
84 pdfConfig.setExtractInlineImages(
true);
85 pdfConfig.setExtractUniqueInlineImagesOnly(
true);
87 parseContext.set(PDFParserConfig.class, pdfConfig);
90 NewResourceWatcher watcher =
new NewResourceWatcher();
91 parseContext.set(EmbeddedDocumentExtractor.class,
new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
94 parser.parse(input,
new BodyContentHandler(-1),
new Metadata(), parseContext);
96 return watcher.getSnapshot();
103 static class EmbeddedAttachmentHandler
implements EmbeddedDocumentExtractor {
105 private final Path outputDirectory;
106 private final NewResourceWatcher watcher;
107 private final Long parentID;
108 private Integer attachmentCount;
110 public EmbeddedAttachmentHandler(Path outputDirectory,
long parentID, NewResourceWatcher watcher) {
111 this.outputDirectory = outputDirectory;
112 this.watcher = watcher;
113 this.parentID = parentID;
118 public boolean shouldParseEmbedded(Metadata mtdt) {
124 public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt,
boolean bln)
throws SAXException, IOException {
126 String uniqueExtractedName =
"extract_" + attachmentCount++;
128 String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
129 String ext = FilenameUtils.getExtension(name);
133 name = uniqueExtractedName;
134 }
else if(!ext.isEmpty()) {
135 uniqueExtractedName +=
"." + ext;
138 Path outputFile = outputDirectory.resolve(uniqueExtractedName);
140 try (EncodedFileOutputStream outputStream =
new EncodedFileOutputStream(
141 new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
142 int bytesCopied = IOUtils.copy(in, outputStream);
143 watcher.notify(name, outputFile, bytesCopied);
144 }
catch (IOException ex) {
145 logger.log(Level.WARNING, String.format(
"Could not extract attachment %s into directory %s",
146 uniqueExtractedName, outputFile), ex);
156 static class NewResourceData {
157 private final Path path;
158 private final int length;
160 NewResourceData(Path path,
int length) {
162 this.length = length;
181 static class NewResourceWatcher {
183 private final Map<String, NewResourceData> newResourcePaths;
185 public NewResourceWatcher() {
186 newResourcePaths =
new HashMap<>();
189 public void notify(String name, Path localPath,
int length) {
190 newResourcePaths.put(name,
new NewResourceData(localPath, length));
193 public Map<String, NewResourceData> getSnapshot() {
194 return newResourcePaths;
202 static class ExtractionPreconditions {
204 public static void checkArgument(
boolean expression, String msg)
throws IOException {
206 throw new IOException(msg);
210 private ExtractionPreconditions(){