Autopsy  4.21.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
PDFAttachmentExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
20 
21 import java.io.FileOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.file.Files;
25 import java.nio.file.Path;
26 import java.util.HashMap;
27 import java.util.Map;
28 import java.util.logging.Level;
29 import org.apache.commons.io.FilenameUtils;
30 import org.apache.commons.io.IOUtils;
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
33 import org.apache.tika.metadata.Metadata;
34 import org.apache.tika.parser.AutoDetectParser;
35 import org.apache.tika.parser.ParseContext;
36 import org.apache.tika.parser.Parser;
37 import org.apache.tika.parser.pdf.PDFParserConfig;
38 import org.apache.tika.sax.BodyContentHandler;
39 import org.xml.sax.ContentHandler;
40 import org.xml.sax.SAXException;
42 import org.sleuthkit.datamodel.EncodedFileOutputStream;
43 import org.sleuthkit.datamodel.TskData;
44 
51 final class PDFAttachmentExtractor {
52 
/**
 * Logger used by this extractor and by the nested attachment handler.
 */
private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());

/**
 * Tika parser that parses the PDF and is also registered in the parse
 * context under {@code Parser.class}.
 */
private final AutoDetectParser parser;

/**
 * Creates an extractor backed by a freshly constructed
 * {@link AutoDetectParser}.
 */
public PDFAttachmentExtractor() {
    this(new AutoDetectParser());
}

/**
 * Creates an extractor that parses with the supplied parser.
 *
 * @param parser Parser used for every call to
 *               {@code extract(InputStream, long, Path)}.
 */
public PDFAttachmentExtractor(AutoDetectParser parser) {
    this.parser = parser;
}
63 
76  public Map<String, NewResourceData> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
77  ExtractionPreconditions.checkArgument(Files.exists(outputDir),
78  String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS
79 
80  ParseContext parseContext = new ParseContext();
81  parseContext.set(Parser.class, parser);
82 
83  PDFParserConfig pdfConfig = new PDFParserConfig();
84  pdfConfig.setExtractInlineImages(true);
85  pdfConfig.setExtractUniqueInlineImagesOnly(true);
86 
87  parseContext.set(PDFParserConfig.class, pdfConfig);
88 
89  //Keep track of the attachment files as they are being extracted and written to disk.
90  NewResourceWatcher watcher = new NewResourceWatcher();
91  parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
92 
93  //Parse input with default params, except for our ParseContext
94  parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);
95 
96  return watcher.getSnapshot();
97  }
98 
103  static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {
104 
105  private final Path outputDirectory;
106  private final NewResourceWatcher watcher;
107  private final Long parentID;
108  private Integer attachmentCount;
109 
110  public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
111  this.outputDirectory = outputDirectory;
112  this.watcher = watcher;
113  this.parentID = parentID;
114  attachmentCount = 0;
115  }
116 
117  @Override
118  public boolean shouldParseEmbedded(Metadata mtdt) {
119  //Grab every available attachment
120  return true;
121  }
122 
123  @Override
124  public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
125  //Resource naming scheme is used internally in autopsy, therefore we can guarentee uniqueness.
126  String uniqueExtractedName = "extract_" + attachmentCount++; //NON-NLS
127 
128  String name = mtdt.get(Metadata.RESOURCE_NAME_KEY);
129  String ext = FilenameUtils.getExtension(name);
130 
131  //Append the extension if we can.
132  if(ext == null) {
133  name = uniqueExtractedName;
134  } else if(!ext.isEmpty()) {
135  uniqueExtractedName += "." + ext;
136  }
137 
138  Path outputFile = outputDirectory.resolve(uniqueExtractedName);
139 
140  try (EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
141  new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
142  int bytesCopied = IOUtils.copy(in, outputStream);
143  watcher.notify(name, outputFile, bytesCopied);
144  } catch (IOException ex) {
145  logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
146  uniqueExtractedName, outputFile), ex);
147  }
148  }
149  }
150 
156  static class NewResourceData {
157  private final Path path;
158  private final int length;
159 
160  NewResourceData(Path path, int length) {
161  this.path = path;
162  this.length = length;
163  }
164 
165  Path getPath() {
166  return path;
167  }
168 
169  int getLength() {
170  return length;
171  }
172  }
173 
181  static class NewResourceWatcher {
182 
183  private final Map<String, NewResourceData> newResourcePaths;
184 
185  public NewResourceWatcher() {
186  newResourcePaths = new HashMap<>();
187  }
188 
189  public void notify(String name, Path localPath, int length) {
190  newResourcePaths.put(name, new NewResourceData(localPath, length));
191  }
192 
193  public Map<String, NewResourceData> getSnapshot() {
194  return newResourcePaths;
195  }
196  }
197 
202  static class ExtractionPreconditions {
203 
204  public static void checkArgument(boolean expression, String msg) throws IOException {
205  if (!expression) {
206  throw new IOException(msg);
207  }
208  }
209 
210  private ExtractionPreconditions(){
211  }
212  }
213 }

Copyright © 2012-2022 Basis Technology. Generated on: Tue Feb 6 2024
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.