19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.file.Paths;
26 import java.util.ArrayList;
27 import java.util.HashMap;
28 import java.util.List;
30 import java.util.logging.Level;
31 import org.apache.commons.io.FilenameUtils;
32 import org.apache.commons.io.IOUtils;
33 import org.apache.poi.hwpf.usermodel.Picture;
34 import org.apache.poi.hslf.usermodel.HSLFPictureData;
35 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
36 import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
37 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
38 import org.apache.poi.hwpf.HWPFDocument;
39 import org.apache.poi.hwpf.model.PicturesTable;
40 import org.apache.poi.sl.usermodel.PictureData.PictureType;
41 import org.apache.poi.ss.usermodel.Workbook;
42 import org.apache.poi.util.RecordFormatException;
43 import org.apache.tika.config.TikaConfig;
44 import org.apache.tika.detect.Detector;
45 import org.apache.tika.exception.TikaException;
46 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
47 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
48 import org.apache.tika.metadata.Metadata;
49 import org.apache.tika.mime.MediaType;
50 import org.apache.tika.mime.MimeTypeException;
51 import org.apache.tika.parser.AutoDetectParser;
52 import org.apache.tika.parser.ParseContext;
53 import org.apache.tika.parser.Parser;
54 import org.apache.tika.parser.microsoft.OfficeParserConfig;
55 import org.apache.tika.sax.BodyContentHandler;
56 import org.openide.util.NbBundle;
70 import org.xml.sax.ContentHandler;
71 import org.xml.sax.SAXException;
/**
 * Extracts content embedded in Microsoft Office documents. The legacy binary
 * formats (DOC, PPT, XLS) are read with Apache POI; the OOXML formats are
 * parsed with Apache Tika.
 */
77 class MSOfficeEmbeddedContentExtractor {
// Used to add each extracted file to the case as a derived file.
79 private final FileManager fileManager;
// Used to fire a ModuleContentEvent once extracted files have been found.
80 private final IngestServices services;
81 private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
// Ingest job that the newly added derived files are handed to.
82 private final IngestJobContext context;
// Unique name of the file currently being processed; it names the output
// sub-folder and is part of every extracted file's relative path.
83 private String parentFileName;
// Prefix of generated file names; a counter is appended to it.
84 private final String UNKNOWN_IMAGE_NAME_PREFIX =
"image_";
// Supplies the MIME type used to decide whether a file is supported.
85 private final FileTypeDetector fileTypeDetector;
// Module output directory in relative form; prefix of the local path that is
// recorded for each extracted file.
87 private String moduleDirRelative;
// Module output directory in absolute form; extracted files are written
// beneath it.
88 private String moduleDirAbsolute;
// Tika parser used for OOXML documents; it is also registered in the parse
// context handed to Tika.
90 private AutoDetectParser parser =
new AutoDetectParser();
// Detector obtained from the parser; classifies each embedded stream.
91 private Detector detector = parser.getDetector();
// Default Tika configuration; its MIME repository supplies file extensions.
92 private TikaConfig config = TikaConfig.getDefaultConfig();
/**
 * Document formats from which embedded content can be extracted, each paired
 * with the MIME type string used to recognize it.
 */
97 enum SupportedExtractionFormats {
99 DOC(
"application/msword"),
100 DOCX(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
101 PPT(
"application/vnd.ms-powerpoint"),
102 PPTX(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
103 XLS(
"application/vnd.ms-excel"),
104 XLSX(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
// MIME type string associated with this format.
106 private final String mimeType;
108 SupportedExtractionFormats(
final String mimeType) {
109 this.mimeType = mimeType;
// Returns the MIME type rather than the constant's name, so the result can be
// compared directly against a detected MIME type.
113 public String toString() {
114 return this.mimeType;
// Format matched by the most recent successful MIME comparison in
// isContentExtractionSupported(); the switch in extractEmbeddedContent()
// dispatches on it.
117 private SupportedExtractionFormats abstractFileExtractionFormat;
/**
 * Creates an extractor bound to the current case's file manager and the
 * shared ingest services instance.
 *
 * @param context           ingest job that extracted files are added to
 * @param fileTypeDetector  detector used to obtain a file's MIME type
 * @param moduleDirRelative module output directory in relative form
 * @param moduleDirAbsolute module output directory in absolute form
 *
 * @throws NoCurrentCaseException presumably raised by
 *                                Case.getCurrentCaseThrows() when no case is
 *                                open - confirm
 */
119 MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute)
throws NoCurrentCaseException {
121 this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
122 this.services = IngestServices.getInstance();
123 this.context = context;
124 this.fileTypeDetector = fileTypeDetector;
125 this.moduleDirRelative = moduleDirRelative;
126 this.moduleDirAbsolute = moduleDirAbsolute;
/**
 * Checks the file's detected MIME type against every supported format and,
 * on a match, remembers the matching format for the later extraction call.
 *
 * @param abstractFile file whose MIME type is examined
 *
 * @return presumably true when a supported format matched and false
 *         otherwise - the return statements are not visible here, confirm
 */
138 boolean isContentExtractionSupported(AbstractFile abstractFile) {
139 String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
140 for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
// toString() yields the format's MIME type string.
141 if (s.toString().equals(abstractFileMimeType)) {
// Side effect: extractEmbeddedContent() later switches on this field.
142 abstractFileExtractionFormat = s;
/**
 * Extracts the content embedded in the given file, adds every extracted item
 * to the case as an XOR1-encoded derived file of that file, and hands the new
 * files to the ingest job.
 *
 * The format-specific extractor is chosen by switching on
 * abstractFileExtractionFormat, which isContentExtractionSupported() sets.
 *
 * @param abstractFile the Office document to process
 */
158 void extractEmbeddedContent(AbstractFile abstractFile) {
159 List<ExtractedFile> listOfExtractedImages = null;
160 List<AbstractFile> listOfExtractedImageAbstractFiles = null;
// The unique name is used for the output folder and in the relative paths.
161 this.parentFileName = EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile);
// A file that already has children and an output folder is treated as
// already processed.
// NOTE(review): getOutputFolderPath() creates the folder when it is missing,
// so this exists() check looks like it is always true - confirm.
165 if (abstractFile.hasChildren()) {
167 if (
new File(getOutputFolderPath(parentFileName)).exists()) {
168 LOGGER.log(Level.INFO,
"File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName());
172 }
catch (TskCoreException e) {
173 LOGGER.log(Level.SEVERE, String.format(
"Error checking if file already has been processed, skipping: %s", parentFileName), e);
// Dispatch to the extractor for the detected format.
178 switch (abstractFileExtractionFormat) {
182 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
185 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
188 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
191 listOfExtractedImages = extractImagesFromXls(abstractFile);
// A null list means the extractor produced no result.
197 if (listOfExtractedImages == null) {
201 listOfExtractedImageAbstractFiles =
new ArrayList<>();
202 for (ExtractedFile extractedImage : listOfExtractedImages) {
// Timestamp arguments are passed in the order ctime, crtime, atime, mtime;
// the last one must be the modified time, not the access time again.
204 listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
205 extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getMtime(),
206 true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
207 }
catch (TskCoreException ex) {
208 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex);
// Announce the new children only when something was extracted.
211 if (!listOfExtractedImages.isEmpty()) {
212 services.fireModuleContentEvent(
new ModuleContentEvent(abstractFile));
213 context.addFilesToJob(listOfExtractedImageAbstractFiles);
/**
 * Parses an OOXML document with Tika and collects the embedded documents
 * that Tika reports to a custom EmbeddedContentExtractor.
 *
 * @param abstractFile the OOXML file to parse
 *
 * @return the files gathered by the embedded-content extractor
 */
226 private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
227 Metadata metadata =
new Metadata();
229 ParseContext parseContext =
new ParseContext();
// Make the auto-detect parser available to Tika through the context.
230 parseContext.set(Parser.class, parser);
// NOTE(review): -1 presumably lifts BodyContentHandler's write limit -
// confirm against the Tika documentation.
234 ContentHandler contentHandler =
new BodyContentHandler(-1);
// Switch the PPTX and DOCX extractors to their SAX-based variants.
238 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
239 officeParserConfig.setUseSAXPptxExtractor(
true);
240 officeParserConfig.setUseSAXDocxExtractor(
true);
241 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Register the custom extractor so embedded documents are routed to it.
243 EmbeddedDocumentExtractor extractor =
new EmbeddedContentExtractor(parseContext);
244 parseContext.set(EmbeddedDocumentExtractor.class, extractor);
245 ReadContentInputStream stream =
new ReadContentInputStream(abstractFile);
248 parser.parse(stream, contentHandler, metadata, parseContext);
249 }
catch (IOException | SAXException | TikaException ex) {
// Parse failures are logged as warnings.
250 LOGGER.log(Level.WARNING,
"Error while parsing file, skipping: " + abstractFile.getName(), ex);
// The extractor accumulated the embedded files during the parse.
254 return ((EmbeddedContentExtractor) extractor).getExtractedImages();
/**
 * Extracts the pictures of a legacy Word (DOC) file using POI's HWPF model
 * and writes each one into the output folder of the parent file.
 *
 * @param af the DOC file to read
 *
 * @return the extracted pictures described as ExtractedFile entries
 */
265 private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
266 List<Picture> listOfAllPictures;
// Load the document and read every picture from its pictures table.
269 HWPFDocument doc =
new HWPFDocument(
new ReadContentInputStream(af));
270 PicturesTable pictureTable = doc.getPicturesTable();
271 listOfAllPictures = pictureTable.getAllPictures();
272 }
// Exception types handled separately from the catch-all below.
catch (IOException | IllegalArgumentException
273 | IndexOutOfBoundsException | NullPointerException ex) {
290 }
// Any other failure while opening the document is logged as severe.
catch (Throwable ex) {
292 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.docContainer.init.err", af.getName()), ex);
296 String outputFolderPath;
297 if (listOfAllPictures.isEmpty()) {
300 outputFolderPath = getOutputFolderPath(this.parentFileName);
302 if (outputFolderPath == null) {
305 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
307 for (Picture picture : listOfAllPictures) {
// The file name is the one the picture itself suggests.
308 String fileName = picture.suggestFullFileName();
310 data = picture.getContent();
311 }
catch (Exception ex) {
314 writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
// Record the name, the relative path and the size reported by the picture.
316 listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
319 return listOfExtractedImages;
/**
 * Extracts the pictures of a legacy PowerPoint (PPT) file using POI's HSLF
 * model and writes each one into the output folder of the parent file.
 *
 * @param af the PPT file to read
 *
 * @return the extracted pictures described as ExtractedFile entries
 */
330 private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
331 List<HSLFPictureData> listOfAllPictures = null;
// Load the slide show and read all of its picture data.
334 HSLFSlideShow ppt =
new HSLFSlideShow(
new ReadContentInputStream(af));
335 listOfAllPictures = ppt.getPictureData();
336 }
// Exception types handled separately from the catch-all below.
catch (IOException | IllegalArgumentException
337 | IndexOutOfBoundsException ex) {
350 }
// Any other failure while opening the slide show is logged as severe.
catch (Throwable ex) {
352 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.pptContainer.init.err", af.getName()), ex);
358 String outputFolderPath;
359 if (listOfAllPictures.isEmpty()) {
362 outputFolderPath = getOutputFolderPath(this.parentFileName);
364 if (outputFolderPath == null) {
371 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
373 for (HSLFPictureData pictureData : listOfAllPictures) {
377 PictureType type = pictureData.getType();
// Generated name: common prefix, a counter and an extension.
398 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext;
400 data = pictureData.getData();
401 }
catch (Exception ex) {
404 writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
// NOTE(review): getData() is called a second time here only for its length
// although the bytes were already read above - consider reusing them.
405 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
408 return listOfExtractedImages;
/**
 * Extracts the pictures of a legacy Excel (XLS) file using POI's HSSF
 * workbook and writes each one into the output folder of the parent file.
 *
 * @param af the XLS file to read
 *
 * @return the extracted pictures described as ExtractedFile entries
 */
419 private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
420 List<? extends
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
// Load the workbook and read all of its pictures.
423 Workbook xls =
new HSSFWorkbook(
new ReadContentInputStream(af));
424 listOfAllPictures = xls.getAllPictures();
425 }
// Exception types handled separately from the catch-all below.
catch (IOException | LeftoverDataException
426 | RecordFormatException | IllegalArgumentException
427 | IndexOutOfBoundsException ex) {
447 }
// Any other failure while opening the workbook is logged as severe; the
// file name is appended to the bundle message.
catch (Throwable ex) {
449 LOGGER.log(Level.SEVERE, String.format(
"%s%s", NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.xlsContainer.init.err", af.getName()), af.getName()), ex);
455 String outputFolderPath;
456 if (listOfAllPictures.isEmpty()) {
459 outputFolderPath = getOutputFolderPath(this.parentFileName);
461 if (outputFolderPath == null) {
466 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
468 for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
// Generated name: common prefix, a counter and the extension POI suggests.
469 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i +
"." + pictureData.suggestFileExtension();
471 data = pictureData.getData();
472 }
catch (Exception ex) {
475 writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
// NOTE(review): getData() is called a second time here only for its length
// although the bytes were already read above - consider reusing them.
476 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
479 return listOfExtractedImages;
/**
 * Writes extracted bytes to disk through an XOR1-encoding output stream.
 * An IOException is logged as a warning in the visible handler.
 *
 * @param outputPath destination file path
 * @param data       bytes of the extracted item
 */
490 private void writeExtractedImage(String outputPath, byte[] data) {
// try-with-resources closes the encoded stream and the wrapped file stream.
491 try (EncodedFileOutputStream fos =
new EncodedFileOutputStream(
new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
493 }
catch (IOException ex) {
494 LOGGER.log(Level.WARNING,
"Could not write to the provided location: " + outputPath, ex);
/**
 * Builds the absolute output folder path for a parent file and creates the
 * folder, including missing parent directories, when it does not exist yet.
 *
 * @param parentFileName unique name of the file being processed
 *
 * @return the absolute folder path. NOTE(review): callers compare the result
 *         with null; a null return is not visible here - confirm.
 */
506 private String getOutputFolderPath(String parentFileName) {
507 String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
508 File outputFilePath =
new File(outputFolderPath);
509 if (!outputFilePath.exists()) {
511 outputFilePath.mkdirs();
512 }
// A SecurityException from directory creation is logged as a warning.
catch (SecurityException ex) {
513 LOGGER.log(Level.WARNING, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
517 return outputFolderPath;
/**
 * Builds the relative path of an extracted file from the relative module
 * directory, the parent file's unique name and the file name, joined with
 * forward slashes and starting with one.
 *
 * @param fileName name of the extracted file
 *
 * @return the path "/moduleDirRelative/parentFileName/fileName"
 */
529 private String getFileRelativePath(String fileName) {
531 return "/" + moduleDirRelative +
"/" + this.parentFileName +
"/" + fileName;
// Constructor taking only the file name, its local path and its size.
551 ExtractedFile(String fileName, String localPath,
long size) {
// Constructor that additionally takes the four timestamps, in the order
// ctime, crtime, atime, mtime.
555 ExtractedFile(String fileName, String localPath,
long size,
long ctime,
long crtime,
long atime,
long mtime) {
// Callback that receives one embedded document stream, picks a file name for
// it, writes it to the parent file's output folder and records it in
// nameToExtractedFileMap.
// NOTE(review): the head of this signature is not visible here; it is
// presumably Tika's parseEmbedded(...) override - confirm.
617 Metadata metadata,
boolean outputHtml)
throws SAXException, IOException {
// Classify the embedded stream by its detected media type.
620 MediaType contentType = detector.detect(stream, metadata);
// True when the top-level type is none of image, video, application, audio.
// NOTE(review): the branch body is not visible; presumably such content is
// skipped - confirm.
622 if (!contentType.getType().equalsIgnoreCase(
"image")
623 && !contentType.getType().equalsIgnoreCase(
"video")
624 && !contentType.getType().equalsIgnoreCase(
"application")
625 && !contentType.getType().equalsIgnoreCase(
"audio")) {
// Name reported by Tika for the embedded resource.
630 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
// Detects a name that has already been recorded.
635 if (nameToExtractedFileMap.containsKey(name)) {
// Generated name: common prefix followed by a running counter.
640 name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
// Keep only the final path component of the name and normalize it.
645 name = FilenameUtils.normalize(FilenameUtils.getName(name));
// A name without a dot gets the extension registered for the detected type.
649 if (name.indexOf(
'.') == -1) {
651 name += config.getMimeRepository().forName(contentType.toString()).getExtension();
652 }
catch (MimeTypeException ex) {
653 LOGGER.log(Level.WARNING,
"Failed to get suggested extension for the following type: " + contentType.toString(), ex);
// Read the whole stream into memory, write it out and record the result
// under its name together with the byte count.
657 File extractedFile =
new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
658 byte[] fileData = IOUtils.toByteArray(stream);
659 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
660 nameToExtractedFileMap.put(name,
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
// Returns a new list holding the values currently in nameToExtractedFileMap.
// NOTE(review): the enclosing method header is not visible here; presumably
// this is getExtractedImages() - confirm.
669 return new ArrayList<>(nameToExtractedFileMap.values());