19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
21 import java.io.FileOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.file.Files;
25 import java.nio.file.Path;
26 import java.util.HashMap;
28 import java.util.logging.Level;
29 import org.apache.commons.io.FilenameUtils;
30 import org.apache.commons.io.IOUtils;
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
33 import org.apache.tika.metadata.TikaCoreProperties;
34 import org.apache.tika.metadata.Metadata;
35 import org.apache.tika.parser.AutoDetectParser;
36 import org.apache.tika.parser.ParseContext;
37 import org.apache.tika.parser.Parser;
38 import org.apache.tika.parser.pdf.PDFParserConfig;
39 import org.apache.tika.sax.BodyContentHandler;
40 import org.xml.sax.ContentHandler;
41 import org.xml.sax.SAXException;
52 final class PDFAttachmentExtractor {
54 private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
55 private final AutoDetectParser parser;
57 public PDFAttachmentExtractor() {
58 parser =
new AutoDetectParser();
61 public PDFAttachmentExtractor(AutoDetectParser parser) {
77 public Map<String, NewResourceData> extract(InputStream input,
long parentID, Path outputDir)
throws IOException, SAXException, TikaException {
78 ExtractionPreconditions.checkArgument(Files.exists(outputDir),
79 String.format(
"Output directory: %s, does not exist.", outputDir.toString()));
81 ParseContext parseContext =
new ParseContext();
82 parseContext.set(Parser.class, parser);
84 PDFParserConfig pdfConfig =
new PDFParserConfig();
85 pdfConfig.setExtractInlineImages(
true);
86 pdfConfig.setExtractUniqueInlineImagesOnly(
true);
88 parseContext.set(PDFParserConfig.class, pdfConfig);
91 NewResourceWatcher watcher =
new NewResourceWatcher();
92 parseContext.set(EmbeddedDocumentExtractor.class,
new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
95 parser.parse(input,
new BodyContentHandler(-1),
new Metadata(), parseContext);
97 return watcher.getSnapshot();
104 static class EmbeddedAttachmentHandler
implements EmbeddedDocumentExtractor {
106 private final Path outputDirectory;
107 private final NewResourceWatcher watcher;
108 private final Long parentID;
109 private Integer attachmentCount;
111 public EmbeddedAttachmentHandler(Path outputDirectory,
long parentID, NewResourceWatcher watcher) {
112 this.outputDirectory = outputDirectory;
113 this.watcher = watcher;
114 this.parentID = parentID;
119 public boolean shouldParseEmbedded(Metadata mtdt) {
125 public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt,
boolean bln)
throws SAXException, IOException {
127 String uniqueExtractedName =
"extract_" + attachmentCount++;
129 String name = mtdt.get(TikaCoreProperties.RESOURCE_NAME_KEY);
130 String ext = FilenameUtils.getExtension(name);
134 name = uniqueExtractedName;
135 }
else if(!ext.isEmpty()) {
136 uniqueExtractedName +=
"." + ext;
139 Path outputFile = outputDirectory.resolve(uniqueExtractedName);
141 try (EncodedFileOutputStream outputStream =
new EncodedFileOutputStream(
142 new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
143 int bytesCopied = IOUtils.copy(in, outputStream);
144 watcher.notify(name, outputFile, bytesCopied);
145 }
catch (IOException ex) {
146 logger.log(Level.WARNING, String.format(
"Could not extract attachment %s into directory %s",
147 uniqueExtractedName, outputFile), ex);
157 static class NewResourceData {
158 private final Path path;
159 private final int length;
161 NewResourceData(Path path,
int length) {
163 this.length = length;
182 static class NewResourceWatcher {
184 private final Map<String, NewResourceData> newResourcePaths;
186 public NewResourceWatcher() {
187 newResourcePaths =
new HashMap<>();
190 public void notify(String name, Path localPath,
int length) {
191 newResourcePaths.put(name,
new NewResourceData(localPath, length));
194 public Map<String, NewResourceData> getSnapshot() {
195 return newResourcePaths;
203 static class ExtractionPreconditions {
205 public static void checkArgument(
boolean expression, String msg)
throws IOException {
207 throw new IOException(msg);
211 private ExtractionPreconditions(){