Autopsy  4.9.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
MSOfficeEmbeddedContentExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2015 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
20 
21 import java.io.File;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.file.Paths;
26 import java.util.ArrayList;
27 import java.util.HashMap;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.logging.Level;
31 import org.apache.commons.io.FilenameUtils;
32 import org.apache.commons.io.IOUtils;
33 import org.apache.poi.hwpf.usermodel.Picture;
34 import org.apache.poi.hslf.usermodel.HSLFPictureData;
35 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
36 import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
37 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
38 import org.apache.poi.hwpf.HWPFDocument;
39 import org.apache.poi.hwpf.model.PicturesTable;
40 import org.apache.poi.sl.usermodel.PictureData.PictureType;
41 import org.apache.poi.ss.usermodel.Workbook;
42 import org.apache.poi.util.RecordFormatException;
43 import org.apache.tika.config.TikaConfig;
44 import org.apache.tika.detect.Detector;
45 import org.apache.tika.exception.TikaException;
46 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
47 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
48 import org.apache.tika.metadata.Metadata;
49 import org.apache.tika.mime.MediaType;
50 import org.apache.tika.mime.MimeTypeException;
51 import org.apache.tika.parser.AutoDetectParser;
52 import org.apache.tika.parser.ParseContext;
53 import org.apache.tika.parser.Parser;
54 import org.apache.tika.parser.microsoft.OfficeParserConfig;
55 import org.apache.tika.sax.BodyContentHandler;
56 import org.openide.util.NbBundle;
65 import org.sleuthkit.datamodel.AbstractFile;
66 import org.sleuthkit.datamodel.EncodedFileOutputStream;
67 import org.sleuthkit.datamodel.ReadContentInputStream;
68 import org.sleuthkit.datamodel.TskCoreException;
69 import org.sleuthkit.datamodel.TskData;
70 import org.xml.sax.ContentHandler;
71 import org.xml.sax.SAXException;
72 
77 class MSOfficeEmbeddedContentExtractor {
78 
79  private final FileManager fileManager;
80  private final IngestServices services;
81  private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
82  private final IngestJobContext context;
83  private String parentFileName;
84  private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
85  private final FileTypeDetector fileTypeDetector;
86 
87  private String moduleDirRelative;
88  private String moduleDirAbsolute;
89 
90  private AutoDetectParser parser = new AutoDetectParser();
91  private Detector detector = parser.getDetector();
92  private TikaConfig config = TikaConfig.getDefaultConfig();
93 
/**
 * The MS Office document formats this extractor handles. Each constant wraps
 * the MIME type string that identifies the format, and toString() returns
 * that string.
 */
enum SupportedExtractionFormats {

    DOC("application/msword"), //NON-NLS
    DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), //NON-NLS
    PPT("application/vnd.ms-powerpoint"), //NON-NLS
    PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
    XLS("application/vnd.ms-excel"), //NON-NLS
    XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS

    /** MIME type string associated with the constant. */
    private final String mimeTypeName;

    /**
     * @param mimeType The MIME type string of the format.
     */
    SupportedExtractionFormats(final String mimeType) {
        mimeTypeName = mimeType;
    }

    /**
     * @return The MIME type string of this format.
     */
    @Override
    public String toString() {
        return mimeTypeName;
    }
}
// Format matched by the most recent successful call to
// isContentExtractionSupported(); extractEmbeddedContent() switches on it to
// pick the extraction routine. It stays null until a match has been found.
117  private SupportedExtractionFormats abstractFileExtractionFormat;
118 
119  MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
120 
121  this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
122  this.services = IngestServices.getInstance();
123  this.context = context;
124  this.fileTypeDetector = fileTypeDetector;
125  this.moduleDirRelative = moduleDirRelative;
126  this.moduleDirAbsolute = moduleDirAbsolute;
127  }
128 
138  boolean isContentExtractionSupported(AbstractFile abstractFile) {
139  String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
140  for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
141  if (s.toString().equals(abstractFileMimeType)) {
142  abstractFileExtractionFormat = s;
143  return true;
144  }
145  }
146  return false;
147  }
148 
158  void extractEmbeddedContent(AbstractFile abstractFile) {
159  List<ExtractedFile> listOfExtractedImages = null;
160  List<AbstractFile> listOfExtractedImageAbstractFiles = null;
161  this.parentFileName = EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile);
162 
163  // Skip files that already have been unpacked.
164  try {
165  if (abstractFile.hasChildren()) {
166  //check if local unpacked dir exists
167  if (new File(getOutputFolderPath(parentFileName)).exists()) {
168  LOGGER.log(Level.INFO, "File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName()); //NON-NLS
169  return;
170  }
171  }
172  } catch (TskCoreException e) {
173  LOGGER.log(Level.SEVERE, String.format("Error checking if file already has been processed, skipping: %s", parentFileName), e); //NON-NLS
174  return;
175  }
176 
177  // Call the appropriate extraction method based on mime type
178  switch (abstractFileExtractionFormat) {
179  case DOCX:
180  case PPTX:
181  case XLSX:
182  listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
183  break;
184  case DOC:
185  listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
186  break;
187  case PPT:
188  listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
189  break;
190  case XLS:
191  listOfExtractedImages = extractImagesFromXls(abstractFile);
192  break;
193  default:
194  break;
195  }
196 
197  if (listOfExtractedImages == null) {
198  return;
199  }
200  // the common task of adding abstractFile to derivedfiles is performed.
201  listOfExtractedImageAbstractFiles = new ArrayList<>();
202  for (ExtractedFile extractedImage : listOfExtractedImages) {
203  try {
204  listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
205  extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
206  true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
207  } catch (TskCoreException ex) {
208  LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); //NON-NLS
209  }
210  }
211  if (!listOfExtractedImages.isEmpty()) {
212  services.fireModuleContentEvent(new ModuleContentEvent(abstractFile));
213  context.addFilesToJob(listOfExtractedImageAbstractFiles);
214  }
215  }
216 
226  private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
227  Metadata metadata = new Metadata();
228 
229  ParseContext parseContext = new ParseContext();
230  parseContext.set(Parser.class, parser);
231 
232  // Passing -1 to the BodyContentHandler constructor disables the Tika
233  // write limit (which defaults to 100,000 characters.
234  ContentHandler contentHandler = new BodyContentHandler(-1);
235 
236  // Use the more memory efficient Tika SAX parsers for DOCX and
237  // PPTX files (it already uses SAX for XLSX).
238  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
239  officeParserConfig.setUseSAXPptxExtractor(true);
240  officeParserConfig.setUseSAXDocxExtractor(true);
241  parseContext.set(OfficeParserConfig.class, officeParserConfig);
242 
243  EmbeddedDocumentExtractor extractor = new EmbeddedContentExtractor(parseContext);
244  parseContext.set(EmbeddedDocumentExtractor.class, extractor);
245  ReadContentInputStream stream = new ReadContentInputStream(abstractFile);
246 
247  try {
248  parser.parse(stream, contentHandler, metadata, parseContext);
249  } catch (IOException | SAXException | TikaException ex) {
250  LOGGER.log(Level.WARNING, "Error while parsing file, skipping: " + abstractFile.getName(), ex); //NON-NLS
251  return null;
252  }
253 
254  return ((EmbeddedContentExtractor) extractor).getExtractedImages();
255  }
256 
265  private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
266  List<Picture> listOfAllPictures;
267 
268  try {
269  HWPFDocument doc = new HWPFDocument(new ReadContentInputStream(af));
270  PicturesTable pictureTable = doc.getPicturesTable();
271  listOfAllPictures = pictureTable.getAllPictures();
272  } catch (Exception ex) {
273  // IOException:
274  // Thrown when the document has issues being read.
275 
276  // IllegalArgumentException:
277  // This will catch OldFileFormatException, which is thrown when the
278  // document's format is Word 95 or older. Alternatively, this is
279  // thrown when attempting to load an RTF file as a DOC file.
280  // However, our code verifies the file format before ever running it
281  // through the EmbeddedContentExtractor. This exception gets thrown in the
282  // "IN10-0137.E01" image regardless. The reason is unknown.
283  // IndexOutOfBoundsException:
284  // NullPointerException:
285  // These get thrown in certain images. The reason is unknown. It is
286  // likely due to problems with the file formats that POI is poorly
287  // handling.
288 
289  //Any runtime exception escaping
290  LOGGER.log(Level.WARNING, "Word document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
291  return null;
292  }
293 
294  String outputFolderPath;
295  if (listOfAllPictures.isEmpty()) {
296  return null;
297  } else {
298  outputFolderPath = getOutputFolderPath(this.parentFileName);
299  }
300  if (outputFolderPath == null) {
301  return null;
302  }
303  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
304  byte[] data = null;
305  for (Picture picture : listOfAllPictures) {
306  String fileName = picture.suggestFullFileName();
307  try {
308  data = picture.getContent();
309  } catch (Exception ex) {
310  return null;
311  }
312  writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
313  // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime
314  listOfExtractedImages.add(new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
315  }
316 
317  return listOfExtractedImages;
318  }
319 
328  private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
329  List<HSLFPictureData> listOfAllPictures = null;
330 
331  try {
332  HSLFSlideShow ppt = new HSLFSlideShow(new ReadContentInputStream(af));
333  listOfAllPictures = ppt.getPictureData();
334  } catch (Exception ex) {
335  // IllegalArgumentException:
336  // This will catch OldFileFormatException, which is thrown when the
337  // document version is unsupported. The IllegalArgumentException may
338  // also get thrown for unknown reasons.
339 
340  // IOException:
341  // Thrown when the document has issues being read.
342  // IndexOutOfBoundsException:
343  // This gets thrown in certain images. The reason is unknown. It is
344  // likely due to problems with the file formats that POI is poorly
345  // handling.
346  LOGGER.log(Level.WARNING, "PPT container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
347  return null;
348  }
349 
350  // if no images are extracted from the PPT, return null, else initialize
351  // the output folder for image extraction.
352  String outputFolderPath;
353  if (listOfAllPictures.isEmpty()) {
354  return null;
355  } else {
356  outputFolderPath = getOutputFolderPath(this.parentFileName);
357  }
358  if (outputFolderPath == null) {
359  return null;
360  }
361 
362  // extract the content to the above initialized outputFolder.
363  // extraction path - outputFolder/image_number.ext
364  int i = 0;
365  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
366  byte[] data = null;
367  for (HSLFPictureData pictureData : listOfAllPictures) {
368 
369  // Get image extension, generate image name, write image to the module
370  // output folder, add it to the listOfExtractedImageAbstractFiles
371  PictureType type = pictureData.getType();
372  String ext;
373  switch (type) {
374  case JPEG:
375  ext = ".jpg"; //NON-NLS
376  break;
377  case PNG:
378  ext = ".png"; //NON-NLS
379  break;
380  case WMF:
381  ext = ".wmf"; //NON-NLS
382  break;
383  case EMF:
384  ext = ".emf"; //NON-NLS
385  break;
386  case PICT:
387  ext = ".pict"; //NON-NLS
388  break;
389  default:
390  continue;
391  }
392  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; //NON-NLS
393  try {
394  data = pictureData.getData();
395  } catch (Exception ex) {
396  return null;
397  }
398  writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
399  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
400  i++;
401  }
402  return listOfExtractedImages;
403  }
404 
413  private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
414  List<? extends org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
415 
416  try {
417  Workbook xls = new HSSFWorkbook(new ReadContentInputStream(af));
418  listOfAllPictures = xls.getAllPictures();
419  } catch (Exception ex) {
420  // IllegalArgumentException:
421  // This will catch OldFileFormatException, which is thrown when the
422  // document version is unsupported. The IllegalArgumentException may
423  // also get thrown for unknown reasons.
424 
425  // IOException:
426  // Thrown when the document has issues being read.
427  // LeftoverDataException:
428  // This is thrown for poorly formatted files that have more data
429  // than expected.
430  // RecordFormatException:
431  // This is thrown for poorly formatted files that have less data
432  // that expected.
433  // IllegalArgumentException:
434  // IndexOutOfBoundsException:
435  // These get thrown in certain images. The reason is unknown. It is
436  // likely due to problems with the file formats that POI is poorly
437  // handling.
438  LOGGER.log(Level.WARNING, "Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
439  return null;
440  }
441 
442  // if no images are extracted from the PPT, return null, else initialize
443  // the output folder for image extraction.
444  String outputFolderPath;
445  if (listOfAllPictures.isEmpty()) {
446  return null;
447  } else {
448  outputFolderPath = getOutputFolderPath(this.parentFileName);
449  }
450  if (outputFolderPath == null) {
451  return null;
452  }
453 
454  int i = 0;
455  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
456  byte[] data = null;
457  for (org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
458  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + "." + pictureData.suggestFileExtension(); //NON-NLS
459  try {
460  data = pictureData.getData();
461  } catch (Exception ex) {
462  return null;
463  }
464  writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
465  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
466  i++;
467  }
468  return listOfExtractedImages;
469 
470  }
471 
479  private void writeExtractedImage(String outputPath, byte[] data) {
480  try (EncodedFileOutputStream fos = new EncodedFileOutputStream(new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
481  fos.write(data);
482  } catch (IOException ex) {
483  LOGGER.log(Level.WARNING, "Could not write to the provided location: " + outputPath, ex); //NON-NLS
484  }
485  }
486 
495  private String getOutputFolderPath(String parentFileName) {
496  String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
497  File outputFilePath = new File(outputFolderPath);
498  if (!outputFilePath.exists()) {
499  try {
500  outputFilePath.mkdirs();
501  } catch (SecurityException ex) {
502  LOGGER.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
503  return null;
504  }
505  }
506  return outputFolderPath;
507  }
508 
518  private String getFileRelativePath(String fileName) {
519  // Used explicit FWD slashes to maintain DB consistency across operating systems.
520  return "/" + moduleDirRelative + "/" + this.parentFileName + "/" + fileName; //NON-NLS
521  }
522 
528  private static class ExtractedFile {
529  //String fileName, String localPath, long size, long ctime, long crtime,
530  //long atime, long mtime, boolean isFile, AbstractFile parentFile, String rederiveDetails, String toolName, String toolVersion, String otherDetails
531 
532  private final String fileName;
533  private final String localPath;
534  private final long size;
535  private final long ctime;
536  private final long crtime;
537  private final long atime;
538  private final long mtime;
539 
540  ExtractedFile(String fileName, String localPath, long size) {
541  this(fileName, localPath, size, 0, 0, 0, 0);
542  }
543 
544  ExtractedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime) {
545  this.fileName = fileName;
546  this.localPath = localPath;
547  this.size = size;
548  this.ctime = ctime;
549  this.crtime = crtime;
550  this.atime = atime;
551  this.mtime = mtime;
552  }
553 
554  public String getFileName() {
555  return fileName;
556  }
557 
558  public String getLocalPath() {
559  return localPath;
560  }
561 
562  public long getSize() {
563  return size;
564  }
565 
566  public long getCtime() {
567  return ctime;
568  }
569 
570  public long getCrtime() {
571  return crtime;
572  }
573 
574  public long getAtime() {
575  return atime;
576  }
577 
578  public long getMtime() {
579  return mtime;
580  }
581  }
582 
588  private class EmbeddedContentExtractor extends ParsingEmbeddedDocumentExtractor {
589 
590  private int fileCount = 0;
591  // Map of file name to ExtractedFile instance. This can revert to a
592  // plain old list after we upgrade to Tika 1.16 or above.
593  private final Map<String, ExtractedFile> nameToExtractedFileMap = new HashMap<>();
594 
595  public EmbeddedContentExtractor(ParseContext context) {
596  super(context);
597  }
598 
599  @Override
600  public boolean shouldParseEmbedded(Metadata metadata) {
601  return true;
602  }
603 
604  @Override
605  public void parseEmbedded(InputStream stream, ContentHandler handler,
606  Metadata metadata, boolean outputHtml) throws SAXException, IOException {
607 
608  // Get the mime type for the embedded document
609  MediaType contentType = detector.detect(stream, metadata);
610 
611  if (!contentType.getType().equalsIgnoreCase("image") //NON-NLS
612  && !contentType.getType().equalsIgnoreCase("video") //NON-NLS
613  && !contentType.getType().equalsIgnoreCase("application") //NON-NLS
614  && !contentType.getType().equalsIgnoreCase("audio")) { //NON-NLS
615  return;
616  }
617 
618  // try to get the name of the embedded file from the metadata
619  String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
620 
621  // TODO: This can be removed after we upgrade to Tika 1.16 or
622  // above. The 1.16 version of Tika keeps track of files that
623  // have been seen before.
624  if (nameToExtractedFileMap.containsKey(name)) {
625  return;
626  }
627 
628  if (name == null) {
629  name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
630  } else {
631  //make sure to select only the file name (not any directory paths
632  //that might be included in the name) and make sure
633  //to normalize the name
634  name = FilenameUtils.normalize(FilenameUtils.getName(name));
635  }
636 
637  // Get the suggested extension based on mime type.
638  if (name.indexOf('.') == -1) {
639  try {
640  name += config.getMimeRepository().forName(contentType.toString()).getExtension();
641  } catch (MimeTypeException ex) {
642  LOGGER.log(Level.WARNING, "Failed to get suggested extension for the following type: " + contentType.toString(), ex); //NON-NLS
643  }
644  }
645 
646  File extractedFile = new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
647  byte[] fileData = IOUtils.toByteArray(stream);
648  writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
649  nameToExtractedFileMap.put(name, new ExtractedFile(name, getFileRelativePath(name), fileData.length));
650  }
651 
657  public List<ExtractedFile> getExtractedImages() {
658  return new ArrayList<>(nameToExtractedFileMap.values());
659  }
660  }
661 }
void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)

Copyright © 2012-2018 Basis Technology. Generated on: Tue Dec 18 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.