Autopsy  4.10.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
MSOfficeEmbeddedContentExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2015 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
20 
21 import java.io.File;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.file.Paths;
26 import java.util.ArrayList;
27 import java.util.HashMap;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.logging.Level;
31 import org.apache.commons.io.FilenameUtils;
32 import org.apache.commons.io.IOUtils;
33 import org.apache.poi.hwpf.usermodel.Picture;
34 import org.apache.poi.hslf.usermodel.HSLFPictureData;
35 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
36 import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
37 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
38 import org.apache.poi.hwpf.HWPFDocument;
39 import org.apache.poi.hwpf.model.PicturesTable;
40 import org.apache.poi.sl.usermodel.PictureData.PictureType;
41 import org.apache.poi.ss.usermodel.Workbook;
42 import org.apache.poi.util.RecordFormatException;
43 import org.apache.tika.config.TikaConfig;
44 import org.apache.tika.detect.Detector;
45 import org.apache.tika.exception.TikaException;
46 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
47 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
48 import org.apache.tika.metadata.Metadata;
49 import org.apache.tika.mime.MediaType;
50 import org.apache.tika.mime.MimeTypeException;
51 import org.apache.tika.parser.AutoDetectParser;
52 import org.apache.tika.parser.ParseContext;
53 import org.apache.tika.parser.Parser;
54 import org.apache.tika.parser.microsoft.OfficeParserConfig;
55 import org.apache.tika.sax.BodyContentHandler;
56 import org.openide.util.NbBundle;
65 import org.sleuthkit.datamodel.AbstractFile;
66 import org.sleuthkit.datamodel.EncodedFileOutputStream;
67 import org.sleuthkit.datamodel.ReadContentInputStream;
68 import org.sleuthkit.datamodel.TskCoreException;
69 import org.sleuthkit.datamodel.TskData;
70 import org.xml.sax.ContentHandler;
71 import org.xml.sax.SAXException;
72 
77 class MSOfficeEmbeddedContentExtractor {
78 
79  private final FileManager fileManager;
80  private final IngestServices services;
81  private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
82  private final IngestJobContext context;
83  private String parentFileName;
84  private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
85  private final FileTypeDetector fileTypeDetector;
86 
87  private String moduleDirRelative;
88  private String moduleDirAbsolute;
89 
90  private AutoDetectParser parser = new AutoDetectParser();
91  private Detector detector = parser.getDetector();
92  private TikaConfig config = TikaConfig.getDefaultConfig();
93 
97  enum SupportedExtractionFormats {
98 
99  DOC("application/msword"), //NON-NLS
100  DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), //NON-NLS
101  PPT("application/vnd.ms-powerpoint"), //NON-NLS
102  PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
103  XLS("application/vnd.ms-excel"), //NON-NLS
104  XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); //NON-NLS
105 
106  private final String mimeType;
107 
108  SupportedExtractionFormats(final String mimeType) {
109  this.mimeType = mimeType;
110  }
111 
112  @Override
113  public String toString() {
114  return this.mimeType;
115  }
116  }
117  private SupportedExtractionFormats abstractFileExtractionFormat;
118 
119  MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute) throws NoCurrentCaseException {
120 
121  this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
122  this.services = IngestServices.getInstance();
123  this.context = context;
124  this.fileTypeDetector = fileTypeDetector;
125  this.moduleDirRelative = moduleDirRelative;
126  this.moduleDirAbsolute = moduleDirAbsolute;
127  }
128 
138  boolean isContentExtractionSupported(AbstractFile abstractFile) {
139  String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
140  for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
141  if (s.toString().equals(abstractFileMimeType)) {
142  abstractFileExtractionFormat = s;
143  return true;
144  }
145  }
146  return false;
147  }
148 
158  void extractEmbeddedContent(AbstractFile abstractFile) {
159  List<ExtractedFile> listOfExtractedImages = null;
160  List<AbstractFile> listOfExtractedImageAbstractFiles = null;
161  this.parentFileName = EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile);
162 
163  // Skip files that already have been unpacked.
164  try {
165  if (abstractFile.hasChildren()) {
166  //check if local unpacked dir exists
167  if (new File(getOutputFolderPath(parentFileName)).exists()) {
168  LOGGER.log(Level.INFO, "File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName()); //NON-NLS
169  return;
170  }
171  }
172  } catch (TskCoreException e) {
173  LOGGER.log(Level.SEVERE, String.format("Error checking if file already has been processed, skipping: %s", parentFileName), e); //NON-NLS
174  return;
175  }
176 
177  // Call the appropriate extraction method based on mime type
178  switch (abstractFileExtractionFormat) {
179  case DOCX:
180  case PPTX:
181  case XLSX:
182  listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
183  break;
184  case DOC:
185  listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
186  break;
187  case PPT:
188  listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
189  break;
190  case XLS:
191  listOfExtractedImages = extractImagesFromXls(abstractFile);
192  break;
193  default:
194  break;
195  }
196 
197  if (listOfExtractedImages == null) {
198  return;
199  }
200  // the common task of adding abstractFile to derivedfiles is performed.
201  listOfExtractedImageAbstractFiles = new ArrayList<>();
202  for (ExtractedFile extractedImage : listOfExtractedImages) {
203  try {
204  listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
205  extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
206  true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
207  } catch (TskCoreException ex) {
208  LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); //NON-NLS
209  }
210  }
211  if (!listOfExtractedImages.isEmpty()) {
212  services.fireModuleContentEvent(new ModuleContentEvent(abstractFile));
213  context.addFilesToJob(listOfExtractedImageAbstractFiles);
214  }
215  }
216 
226  private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
227  Metadata metadata = new Metadata();
228 
229  ParseContext parseContext = new ParseContext();
230  parseContext.set(Parser.class, parser);
231 
232  // Passing -1 to the BodyContentHandler constructor disables the Tika
233  // write limit (which defaults to 100,000 characters.
234  ContentHandler contentHandler = new BodyContentHandler(-1);
235 
236  // Use the more memory efficient Tika SAX parsers for DOCX and
237  // PPTX files (it already uses SAX for XLSX).
238  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
239  officeParserConfig.setUseSAXPptxExtractor(true);
240  officeParserConfig.setUseSAXDocxExtractor(true);
241  parseContext.set(OfficeParserConfig.class, officeParserConfig);
242 
243  EmbeddedDocumentExtractor extractor = new EmbeddedContentExtractor(parseContext);
244  parseContext.set(EmbeddedDocumentExtractor.class, extractor);
245  ReadContentInputStream stream = new ReadContentInputStream(abstractFile);
246 
247  try {
248  parser.parse(stream, contentHandler, metadata, parseContext);
249  } catch (IOException | SAXException | TikaException ex) {
250  LOGGER.log(Level.WARNING, "Error while parsing file, skipping: " + abstractFile.getName(), ex); //NON-NLS
251  return null;
252  }
253 
254  return ((EmbeddedContentExtractor) extractor).getExtractedImages();
255  }
256 
265  private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
266  List<Picture> listOfAllPictures;
267 
268  try {
269  HWPFDocument doc = new HWPFDocument(new ReadContentInputStream(af));
270  PicturesTable pictureTable = doc.getPicturesTable();
271  listOfAllPictures = pictureTable.getAllPictures();
272  } catch (Exception ex) {
273  // IOException:
274  // Thrown when the document has issues being read.
275 
276  // IllegalArgumentException:
277  // This will catch OldFileFormatException, which is thrown when the
278  // document's format is Word 95 or older. Alternatively, this is
279  // thrown when attempting to load an RTF file as a DOC file.
280  // However, our code verifies the file format before ever running it
281  // through the EmbeddedContentExtractor. This exception gets thrown in the
282  // "IN10-0137.E01" image regardless. The reason is unknown.
283  // IndexOutOfBoundsException:
284  // NullPointerException:
285  // These get thrown in certain images. The reason is unknown. It is
286  // likely due to problems with the file formats that POI is poorly
287  // handling.
288 
289  //Any runtime exception escaping
290  LOGGER.log(Level.WARNING, "Word document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
291  return null;
292  }
293 
294  String outputFolderPath;
295  if (listOfAllPictures.isEmpty()) {
296  return null;
297  } else {
298  outputFolderPath = getOutputFolderPath(this.parentFileName);
299  }
300  if (outputFolderPath == null) {
301  return null;
302  }
303  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
304  byte[] data = null;
305  int pictureNumber = 0; //added to ensure uniqueness in cases where suggestFullFileName returns duplicates
306  for (Picture picture : listOfAllPictures) {
307  String fileName = UNKNOWN_IMAGE_NAME_PREFIX +pictureNumber +"."+ picture.suggestFileExtension();
308  try {
309  data = picture.getContent();
310  } catch (Exception ex) {
311  return null;
312  }
313  writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
314  // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime
315  listOfExtractedImages.add(new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
316  pictureNumber++;
317  }
318 
319  return listOfExtractedImages;
320  }
321 
330  private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
331  List<HSLFPictureData> listOfAllPictures = null;
332 
333  try {
334  HSLFSlideShow ppt = new HSLFSlideShow(new ReadContentInputStream(af));
335  listOfAllPictures = ppt.getPictureData();
336  } catch (Exception ex) {
337  // IllegalArgumentException:
338  // This will catch OldFileFormatException, which is thrown when the
339  // document version is unsupported. The IllegalArgumentException may
340  // also get thrown for unknown reasons.
341 
342  // IOException:
343  // Thrown when the document has issues being read.
344  // IndexOutOfBoundsException:
345  // This gets thrown in certain images. The reason is unknown. It is
346  // likely due to problems with the file formats that POI is poorly
347  // handling.
348  LOGGER.log(Level.WARNING, "PPT container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
349  return null;
350  }
351 
352  // if no images are extracted from the PPT, return null, else initialize
353  // the output folder for image extraction.
354  String outputFolderPath;
355  if (listOfAllPictures.isEmpty()) {
356  return null;
357  } else {
358  outputFolderPath = getOutputFolderPath(this.parentFileName);
359  }
360  if (outputFolderPath == null) {
361  return null;
362  }
363 
364  // extract the content to the above initialized outputFolder.
365  // extraction path - outputFolder/image_number.ext
366  int i = 0;
367  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
368  byte[] data = null;
369  for (HSLFPictureData pictureData : listOfAllPictures) {
370 
371  // Get image extension, generate image name, write image to the module
372  // output folder, add it to the listOfExtractedImageAbstractFiles
373  PictureType type = pictureData.getType();
374  String ext;
375  switch (type) {
376  case JPEG:
377  ext = ".jpg"; //NON-NLS
378  break;
379  case PNG:
380  ext = ".png"; //NON-NLS
381  break;
382  case WMF:
383  ext = ".wmf"; //NON-NLS
384  break;
385  case EMF:
386  ext = ".emf"; //NON-NLS
387  break;
388  case PICT:
389  ext = ".pict"; //NON-NLS
390  break;
391  default:
392  continue;
393  }
394  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; //NON-NLS
395  try {
396  data = pictureData.getData();
397  } catch (Exception ex) {
398  return null;
399  }
400  writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
401  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
402  i++;
403  }
404  return listOfExtractedImages;
405  }
406 
415  private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
416  List<? extends org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
417 
418  try {
419  Workbook xls = new HSSFWorkbook(new ReadContentInputStream(af));
420  listOfAllPictures = xls.getAllPictures();
421  } catch (Exception ex) {
422  // IllegalArgumentException:
423  // This will catch OldFileFormatException, which is thrown when the
424  // document version is unsupported. The IllegalArgumentException may
425  // also get thrown for unknown reasons.
426 
427  // IOException:
428  // Thrown when the document has issues being read.
429  // LeftoverDataException:
430  // This is thrown for poorly formatted files that have more data
431  // than expected.
432  // RecordFormatException:
433  // This is thrown for poorly formatted files that have less data
434  // that expected.
435  // IllegalArgumentException:
436  // IndexOutOfBoundsException:
437  // These get thrown in certain images. The reason is unknown. It is
438  // likely due to problems with the file formats that POI is poorly
439  // handling.
440  LOGGER.log(Level.WARNING, "Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
441  return null;
442  }
443 
444  // if no images are extracted from the PPT, return null, else initialize
445  // the output folder for image extraction.
446  String outputFolderPath;
447  if (listOfAllPictures.isEmpty()) {
448  return null;
449  } else {
450  outputFolderPath = getOutputFolderPath(this.parentFileName);
451  }
452  if (outputFolderPath == null) {
453  return null;
454  }
455 
456  int i = 0;
457  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
458  byte[] data = null;
459  for (org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
460  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + "." + pictureData.suggestFileExtension(); //NON-NLS
461  try {
462  data = pictureData.getData();
463  } catch (Exception ex) {
464  return null;
465  }
466  writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
467  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
468  i++;
469  }
470  return listOfExtractedImages;
471 
472  }
473 
481  private void writeExtractedImage(String outputPath, byte[] data) {
482  try (EncodedFileOutputStream fos = new EncodedFileOutputStream(new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
483  fos.write(data);
484  } catch (IOException ex) {
485  LOGGER.log(Level.WARNING, "Could not write to the provided location: " + outputPath, ex); //NON-NLS
486  }
487  }
488 
497  private String getOutputFolderPath(String parentFileName) {
498  String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
499  File outputFilePath = new File(outputFolderPath);
500  if (!outputFilePath.exists()) {
501  try {
502  outputFilePath.mkdirs();
503  } catch (SecurityException ex) {
504  LOGGER.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
505  return null;
506  }
507  }
508  return outputFolderPath;
509  }
510 
520  private String getFileRelativePath(String fileName) {
521  return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
522  }
523 
529  private static class ExtractedFile {
530  //String fileName, String localPath, long size, long ctime, long crtime,
531  //long atime, long mtime, boolean isFile, AbstractFile parentFile, String rederiveDetails, String toolName, String toolVersion, String otherDetails
532 
533  private final String fileName;
534  private final String localPath;
535  private final long size;
536  private final long ctime;
537  private final long crtime;
538  private final long atime;
539  private final long mtime;
540 
541  ExtractedFile(String fileName, String localPath, long size) {
542  this(fileName, localPath, size, 0, 0, 0, 0);
543  }
544 
545  ExtractedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime) {
546  this.fileName = fileName;
547  this.localPath = localPath;
548  this.size = size;
549  this.ctime = ctime;
550  this.crtime = crtime;
551  this.atime = atime;
552  this.mtime = mtime;
553  }
554 
555  public String getFileName() {
556  return fileName;
557  }
558 
559  public String getLocalPath() {
560  return localPath;
561  }
562 
563  public long getSize() {
564  return size;
565  }
566 
567  public long getCtime() {
568  return ctime;
569  }
570 
571  public long getCrtime() {
572  return crtime;
573  }
574 
575  public long getAtime() {
576  return atime;
577  }
578 
579  public long getMtime() {
580  return mtime;
581  }
582  }
583 
589  private class EmbeddedContentExtractor extends ParsingEmbeddedDocumentExtractor {
590 
591  private int fileCount = 0;
592  // Map of file name to ExtractedFile instance. This can revert to a
593  // plain old list after we upgrade to Tika 1.16 or above.
594  private final Map<String, ExtractedFile> nameToExtractedFileMap = new HashMap<>();
595 
596  public EmbeddedContentExtractor(ParseContext context) {
597  super(context);
598  }
599 
600  @Override
601  public boolean shouldParseEmbedded(Metadata metadata) {
602  return true;
603  }
604 
605  @Override
606  public void parseEmbedded(InputStream stream, ContentHandler handler,
607  Metadata metadata, boolean outputHtml) throws SAXException, IOException {
608 
609  // Get the mime type for the embedded document
610  MediaType contentType = detector.detect(stream, metadata);
611 
612  if (!contentType.getType().equalsIgnoreCase("image") //NON-NLS
613  && !contentType.getType().equalsIgnoreCase("video") //NON-NLS
614  && !contentType.getType().equalsIgnoreCase("application") //NON-NLS
615  && !contentType.getType().equalsIgnoreCase("audio")) { //NON-NLS
616  return;
617  }
618 
619  // try to get the name of the embedded file from the metadata
620  String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
621 
622  // TODO: This can be removed after we upgrade to Tika 1.16 or
623  // above. The 1.16 version of Tika keeps track of files that
624  // have been seen before.
625  if (nameToExtractedFileMap.containsKey(name)) {
626  return;
627  }
628 
629  if (name == null) {
630  name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
631  } else {
632  //make sure to select only the file name (not any directory paths
633  //that might be included in the name) and make sure
634  //to normalize the name
635  name = FilenameUtils.normalize(FilenameUtils.getName(name));
636  }
637 
638  // Get the suggested extension based on mime type.
639  if (name.indexOf('.') == -1) {
640  try {
641  name += config.getMimeRepository().forName(contentType.toString()).getExtension();
642  } catch (MimeTypeException ex) {
643  LOGGER.log(Level.WARNING, "Failed to get suggested extension for the following type: " + contentType.toString(), ex); //NON-NLS
644  }
645  }
646 
647  File extractedFile = new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
648  byte[] fileData = IOUtils.toByteArray(stream);
649  writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
650  nameToExtractedFileMap.put(name, new ExtractedFile(name, getFileRelativePath(name), fileData.length));
651  }
652 
658  public List<ExtractedFile> getExtractedImages() {
659  return new ArrayList<>(nameToExtractedFileMap.values());
660  }
661  }
662 }
void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)

Copyright © 2012-2018 Basis Technology. Generated on: Fri Mar 22 2019
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.