Autopsy  4.21.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
PDFAttachmentExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
20 
21 import java.io.FileOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.file.Files;
25 import java.nio.file.Path;
26 import java.util.HashMap;
27 import java.util.Map;
28 import java.util.logging.Level;
29 import org.apache.commons.io.FilenameUtils;
30 import org.apache.commons.io.IOUtils;
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
33 import org.apache.tika.metadata.TikaCoreProperties;
34 import org.apache.tika.metadata.Metadata;
35 import org.apache.tika.parser.AutoDetectParser;
36 import org.apache.tika.parser.ParseContext;
37 import org.apache.tika.parser.Parser;
38 import org.apache.tika.parser.pdf.PDFParserConfig;
39 import org.apache.tika.sax.BodyContentHandler;
40 import org.xml.sax.ContentHandler;
41 import org.xml.sax.SAXException;
45 
52 final class PDFAttachmentExtractor {
53 
54  private static final Logger logger = Logger.getLogger(PDFAttachmentExtractor.class.getName());
55  private final AutoDetectParser parser;
56 
57  public PDFAttachmentExtractor() {
58  parser = new AutoDetectParser();
59  }
60 
61  public PDFAttachmentExtractor(AutoDetectParser parser) {
62  this.parser = parser;
63  }
64 
77  public Map<String, NewResourceData> extract(InputStream input, long parentID, Path outputDir) throws IOException, SAXException, TikaException {
78  ExtractionPreconditions.checkArgument(Files.exists(outputDir),
79  String.format("Output directory: %s, does not exist.", outputDir.toString())); //NON-NLS
80 
81  ParseContext parseContext = new ParseContext();
82  parseContext.set(Parser.class, parser);
83 
84  PDFParserConfig pdfConfig = new PDFParserConfig();
85  pdfConfig.setExtractInlineImages(true);
86  pdfConfig.setExtractUniqueInlineImagesOnly(true);
87 
88  parseContext.set(PDFParserConfig.class, pdfConfig);
89 
90  //Keep track of the attachment files as they are being extracted and written to disk.
91  NewResourceWatcher watcher = new NewResourceWatcher();
92  parseContext.set(EmbeddedDocumentExtractor.class, new EmbeddedAttachmentHandler(outputDir, parentID, watcher));
93 
94  //Parse input with default params, except for our ParseContext
95  parser.parse(input, new BodyContentHandler(-1), new Metadata(), parseContext);
96 
97  return watcher.getSnapshot();
98  }
99 
104  static class EmbeddedAttachmentHandler implements EmbeddedDocumentExtractor {
105 
106  private final Path outputDirectory;
107  private final NewResourceWatcher watcher;
108  private final Long parentID;
109  private Integer attachmentCount;
110 
111  public EmbeddedAttachmentHandler(Path outputDirectory, long parentID, NewResourceWatcher watcher) {
112  this.outputDirectory = outputDirectory;
113  this.watcher = watcher;
114  this.parentID = parentID;
115  attachmentCount = 0;
116  }
117 
118  @Override
119  public boolean shouldParseEmbedded(Metadata mtdt) {
120  //Grab every available attachment
121  return true;
122  }
123 
124  @Override
125  public void parseEmbedded(InputStream in, ContentHandler ch, Metadata mtdt, boolean bln) throws SAXException, IOException {
126  //Resource naming scheme is used internally in autopsy, therefore we can guarentee uniqueness.
127  String uniqueExtractedName = "extract_" + attachmentCount++; //NON-NLS
128 
129  String name = mtdt.get(TikaCoreProperties.RESOURCE_NAME_KEY);
130  String ext = FilenameUtils.getExtension(name);
131 
132  //Append the extension if we can.
133  if(ext == null) {
134  name = uniqueExtractedName;
135  } else if(!ext.isEmpty()) {
136  uniqueExtractedName += "." + ext;
137  }
138 
139  Path outputFile = outputDirectory.resolve(uniqueExtractedName);
140 
141  try (EncodedFileOutputStream outputStream = new EncodedFileOutputStream(
142  new FileOutputStream(outputFile.toFile()), TskData.EncodingType.XOR1)){
143  int bytesCopied = IOUtils.copy(in, outputStream);
144  watcher.notify(name, outputFile, bytesCopied);
145  } catch (IOException ex) {
146  logger.log(Level.WARNING, String.format("Could not extract attachment %s into directory %s", //NON-NLS
147  uniqueExtractedName, outputFile), ex);
148  }
149  }
150  }
151 
157  static class NewResourceData {
158  private final Path path;
159  private final int length;
160 
161  NewResourceData(Path path, int length) {
162  this.path = path;
163  this.length = length;
164  }
165 
166  Path getPath() {
167  return path;
168  }
169 
170  int getLength() {
171  return length;
172  }
173  }
174 
182  static class NewResourceWatcher {
183 
184  private final Map<String, NewResourceData> newResourcePaths;
185 
186  public NewResourceWatcher() {
187  newResourcePaths = new HashMap<>();
188  }
189 
190  public void notify(String name, Path localPath, int length) {
191  newResourcePaths.put(name, new NewResourceData(localPath, length));
192  }
193 
194  public Map<String, NewResourceData> getSnapshot() {
195  return newResourcePaths;
196  }
197  }
198 
203  static class ExtractionPreconditions {
204 
205  public static void checkArgument(boolean expression, String msg) throws IOException {
206  if (!expression) {
207  throw new IOException(msg);
208  }
209  }
210 
211  private ExtractionPreconditions(){
212  }
213  }
214 }

Copyright © 2012-2024 Sleuth Kit Labs. Generated on: Mon Mar 17 2025
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.