19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.file.Paths;
26 import java.util.ArrayList;
27 import java.util.HashMap;
28 import java.util.List;
30 import java.util.logging.Level;
31 import org.apache.commons.io.FilenameUtils;
32 import org.apache.commons.io.IOUtils;
33 import org.apache.poi.hwpf.usermodel.Picture;
34 import org.apache.poi.hslf.usermodel.HSLFPictureData;
35 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
36 import org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException;
37 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
38 import org.apache.poi.hwpf.HWPFDocument;
39 import org.apache.poi.hwpf.model.PicturesTable;
40 import org.apache.poi.sl.usermodel.PictureData.PictureType;
41 import org.apache.poi.ss.usermodel.Workbook;
42 import org.apache.poi.util.RecordFormatException;
43 import org.apache.tika.config.TikaConfig;
44 import org.apache.tika.detect.Detector;
45 import org.apache.tika.exception.TikaException;
46 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
47 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
48 import org.apache.tika.metadata.Metadata;
49 import org.apache.tika.mime.MediaType;
50 import org.apache.tika.mime.MimeTypeException;
51 import org.apache.tika.parser.AutoDetectParser;
52 import org.apache.tika.parser.ParseContext;
53 import org.apache.tika.parser.Parser;
54 import org.apache.tika.parser.microsoft.OfficeParserConfig;
55 import org.apache.tika.sax.BodyContentHandler;
56 import org.openide.util.NbBundle;
70 import org.xml.sax.ContentHandler;
71 import org.xml.sax.SAXException;
/**
 * Extracts content embedded in MS Office documents (the formats listed in
 * SupportedExtractionFormats), writes it under the module output directory
 * and adds the extracted files to the case as derived files of the document
 * they came from.
 */
77 class MSOfficeEmbeddedContentExtractor {
// Used to add each extracted file to the case as a derived file.
79 private final FileManager fileManager;
// Used to fire a ModuleContentEvent for a parent file that yielded content.
80 private final IngestServices services;
81 private static final Logger LOGGER = Logger.getLogger(MSOfficeEmbeddedContentExtractor.class.getName());
// Ingest job that the newly added derived files are handed to.
82 private final IngestJobContext context;
// Unique name of the file currently being processed. It is assigned at the
// start of extractEmbeddedContent() and used as the per-file sub-folder name.
83 private String parentFileName;
// Prefix of the file names this extractor generates (a counter is appended).
84 private final String UNKNOWN_IMAGE_NAME_PREFIX =
"image_";
// Supplies the MIME type of candidate files.
85 private final FileTypeDetector fileTypeDetector;
// Module output directory. The relative form is used to build the local path
// stored for derived files; the absolute form is used to build the on-disk
// output folder.
87 private String moduleDirRelative;
88 private String moduleDirAbsolute;
// Tika parser, the detector obtained from it, and the default Tika
// configuration (the latter is used to look up a file extension by MIME type).
90 private AutoDetectParser parser =
new AutoDetectParser();
91 private Detector detector = parser.getDetector();
92 private TikaConfig config = TikaConfig.getDefaultConfig();
/**
 * The MS Office formats this extractor handles, each paired with the MIME
 * type string used to recognise it.
 */
97 enum SupportedExtractionFormats {
99 DOC(
"application/msword"),
100 DOCX(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
101 PPT(
"application/vnd.ms-powerpoint"),
102 PPTX(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
103 XLS(
"application/vnd.ms-excel"),
104 XLSX(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
// MIME type string associated with the constant.
106 private final String mimeType;
108 SupportedExtractionFormats(
final String mimeType) {
109 this.mimeType = mimeType;
/**
 * Returns the MIME type string of this format (not the constant's name), so
 * the result can be compared directly against a detected MIME type.
 */
113 public String toString() {
114 return this.mimeType;
// Format recorded by isContentExtractionSupported() when a file's MIME type
// matches; extractEmbeddedContent() switches on it to pick the extraction
// routine.
117 private SupportedExtractionFormats abstractFileExtractionFormat;
/**
 * Creates an extractor bound to the current case and the given ingest job.
 *
 * @param context           Ingest job context that extracted files are added to.
 * @param fileTypeDetector  Detector used to obtain the MIME type of files.
 * @param moduleDirRelative Module output directory in relative form.
 * @param moduleDirAbsolute Module output directory in absolute form.
 *
 * @throws NoCurrentCaseException Propagated from Case.getCurrentCaseThrows().
 */
119 MSOfficeEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute)
throws NoCurrentCaseException {
// The file manager is taken from the case that is current at construction time.
121 this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
122 this.services = IngestServices.getInstance();
123 this.context = context;
124 this.fileTypeDetector = fileTypeDetector;
125 this.moduleDirRelative = moduleDirRelative;
126 this.moduleDirAbsolute = moduleDirAbsolute;
/**
 * Checks whether the MIME type detected for the file equals the MIME type of
 * one of the SupportedExtractionFormats. On a match the matching format is
 * stored in abstractFileExtractionFormat as a side effect.
 *
 * @param abstractFile The file to check.
 *
 * @return NOTE(review): the return statements are not visible in this
 *         excerpt; presumably true on a match and false otherwise - confirm.
 */
138 boolean isContentExtractionSupported(AbstractFile abstractFile) {
139 String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
140 for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
// toString() of a format is its MIME type string.
141 if (s.toString().equals(abstractFileMimeType)) {
142 abstractFileExtractionFormat = s;
158 void extractEmbeddedContent(AbstractFile abstractFile) {
159 List<ExtractedFile> listOfExtractedImages = null;
160 List<AbstractFile> listOfExtractedImageAbstractFiles = null;
161 this.parentFileName = EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile);
165 if (abstractFile.hasChildren()) {
167 if (
new File(getOutputFolderPath(parentFileName)).exists()) {
168 LOGGER.log(Level.INFO,
"File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName());
172 }
catch (TskCoreException e) {
173 LOGGER.log(Level.SEVERE, String.format(
"Error checking if file already has been processed, skipping: %s", parentFileName), e);
178 switch (abstractFileExtractionFormat) {
182 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
185 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
188 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
191 listOfExtractedImages = extractImagesFromXls(abstractFile);
197 if (listOfExtractedImages == null) {
201 listOfExtractedImageAbstractFiles =
new ArrayList<>();
202 for (ExtractedFile extractedImage : listOfExtractedImages) {
204 listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
205 extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
206 true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
207 }
catch (TskCoreException ex) {
208 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex);
211 if (!listOfExtractedImages.isEmpty()) {
212 services.fireModuleContentEvent(
new ModuleContentEvent(abstractFile));
213 context.addFilesToJob(listOfExtractedImageAbstractFiles);
/**
 * Extracts embedded content from an OOXML document by parsing it with Tika.
 * An EmbeddedContentExtractor is registered in the parse context as the
 * EmbeddedDocumentExtractor, and the files it collected are returned.
 *
 * @param abstractFile The OOXML file to parse.
 *
 * @return The files reported by EmbeddedContentExtractor.getExtractedImages().
 *         NOTE(review): what is returned after a parse failure is not visible
 *         in this excerpt (the log message says the file is skipped).
 */
226 private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
227 Metadata metadata =
new Metadata();
229 ParseContext parseContext =
new ParseContext();
// Register the auto-detect parser in the context under Parser.class.
230 parseContext.set(Parser.class, parser);
// NOTE(review): -1 presumably disables the handler's write limit - confirm
// against the Tika BodyContentHandler documentation.
234 ContentHandler contentHandler =
new BodyContentHandler(-1);
// Opt in to the SAX based PPTX and DOCX extractors.
238 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
239 officeParserConfig.setUseSAXPptxExtractor(
true);
240 officeParserConfig.setUseSAXDocxExtractor(
true);
241 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Replace the embedded-document handling with this module's own extractor.
243 EmbeddedDocumentExtractor extractor =
new EmbeddedContentExtractor(parseContext);
244 parseContext.set(EmbeddedDocumentExtractor.class, extractor);
245 ReadContentInputStream stream =
new ReadContentInputStream(abstractFile);
248 parser.parse(stream, contentHandler, metadata, parseContext);
249 }
catch (IOException | SAXException | TikaException ex) {
// Parse failures are logged with the stack trace at WARNING level.
250 LOGGER.log(Level.WARNING,
"Error while parsing file, skipping: " + abstractFile.getName(), ex);
// The variable is typed as the interface, hence the cast back to the concrete
// class that exposes getExtractedImages().
254 return ((EmbeddedContentExtractor) extractor).getExtractedImages();
/**
 * Extracts the pictures of a legacy Word (.doc) file using POI's HWPFDocument
 * and PicturesTable, writes each one into the output folder of the current
 * parent file and describes it with an ExtractedFile.
 *
 * @param af The Word file to extract pictures from.
 *
 * @return The list of extracted files. NOTE(review): the statements executed
 *         when the document cannot be opened, when it has no pictures, when
 *         the output folder is null or when a picture cannot be read are not
 *         visible in this excerpt - confirm the early-exit values.
 */
265 private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
266 List<Picture> listOfAllPictures;
269 HWPFDocument doc =
new HWPFDocument(
new ReadContentInputStream(af));
270 PicturesTable pictureTable = doc.getPicturesTable();
271 listOfAllPictures = pictureTable.getAllPictures();
272 }
catch (Exception ex) {
// Only the exception message is logged here, not the stack trace.
290 LOGGER.log(Level.WARNING,
"Word document container could not be initialized. Reason: {0}", ex.getMessage());
294 String outputFolderPath;
295 if (listOfAllPictures.isEmpty()) {
298 outputFolderPath = getOutputFolderPath(this.parentFileName);
300 if (outputFolderPath == null) {
303 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
305 for (Picture picture : listOfAllPictures) {
// The file name comes from POI's suggestion for the picture.
306 String fileName = picture.suggestFullFileName();
308 data = picture.getContent();
309 }
catch (Exception ex) {
312 writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
// The recorded size is the size reported by the Picture object.
314 listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
317 return listOfExtractedImages;
/**
 * Extracts the pictures of a legacy PowerPoint (.ppt) file using POI's
 * HSLFSlideShow, writes each one into the output folder of the current
 * parent file and describes it with an ExtractedFile.
 *
 * @param af The PowerPoint file to extract pictures from.
 *
 * @return The list of extracted files. NOTE(review): the early-exit
 *         statements and the code that derives the extension from the picture
 *         type are not visible in this excerpt.
 */
328 private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
329 List<HSLFPictureData> listOfAllPictures = null;
332 HSLFSlideShow ppt =
new HSLFSlideShow(
new ReadContentInputStream(af));
333 listOfAllPictures = ppt.getPictureData();
334 }
catch (Exception ex) {
// Only the exception message is logged here, not the stack trace.
346 LOGGER.log(Level.WARNING,
"PPT container could not be initialized. Reason: {0}", ex.getMessage());
352 String outputFolderPath;
353 if (listOfAllPictures.isEmpty()) {
356 outputFolderPath = getOutputFolderPath(this.parentFileName);
358 if (outputFolderPath == null) {
365 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
367 for (HSLFPictureData pictureData : listOfAllPictures) {
371 PictureType type = pictureData.getType();
// Names are generated: prefix + counter + extension.
392 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext;
394 data = pictureData.getData();
395 }
catch (Exception ex) {
398 writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
// NOTE(review): getData() is called a second time here, outside the try block
// above, only to obtain the length.
399 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
402 return listOfExtractedImages;
/**
 * Extracts the pictures of a legacy Excel (.xls) file using POI's
 * HSSFWorkbook, writes each one into the output folder of the current parent
 * file and describes it with an ExtractedFile.
 *
 * @param af The Excel file to extract pictures from.
 *
 * @return The list of extracted files. NOTE(review): the statements executed
 *         when the workbook cannot be opened, when it has no pictures, when
 *         the output folder is null or when picture data cannot be read are
 *         not visible in this excerpt.
 */
413 private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
414 List<? extends
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
417 Workbook xls =
new HSSFWorkbook(
new ReadContentInputStream(af));
418 listOfAllPictures = xls.getAllPictures();
419 }
catch (Exception ex) {
// Only the exception message is logged here, not the stack trace.
438 LOGGER.log(Level.WARNING,
"Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage());
444 String outputFolderPath;
445 if (listOfAllPictures.isEmpty()) {
448 outputFolderPath = getOutputFolderPath(this.parentFileName);
450 if (outputFolderPath == null) {
455 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
457 for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
// Names are generated: prefix + counter + "." + the extension POI suggests.
458 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i +
"." + pictureData.suggestFileExtension();
460 data = pictureData.getData();
461 }
catch (Exception ex) {
464 writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
// NOTE(review): getData() is called a second time here, outside the try block
// above, only to obtain the length.
465 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
468 return listOfExtractedImages;
/**
 * Writes extracted content to disk through an EncodedFileOutputStream that
 * uses the XOR1 encoding. An IOException is logged at WARNING level and not
 * rethrown.
 *
 * @param outputPath Path of the file to create.
 * @param data       Bytes to write. NOTE(review): the write call itself is
 *                   not visible in this excerpt.
 */
479 private void writeExtractedImage(String outputPath, byte[] data) {
// try-with-resources closes the encoded stream when the block exits.
480 try (EncodedFileOutputStream fos =
new EncodedFileOutputStream(
new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
482 }
catch (IOException ex) {
483 LOGGER.log(Level.WARNING,
"Could not write to the provided location: " + outputPath, ex);
/**
 * Builds the output folder path for a parent file (absolute module directory
 * + separator + parent file name) and creates the folder, including missing
 * parent directories, when it does not exist yet.
 *
 * @param parentFileName Unique name of the file being processed.
 *
 * @return The output folder path. NOTE(review): callers null-check the
 *         result, so the SecurityException path presumably returns null, but
 *         that statement is not visible in this excerpt.
 */
495 private String getOutputFolderPath(String parentFileName) {
496 String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
497 File outputFilePath =
new File(outputFolderPath);
498 if (!outputFilePath.exists()) {
// The boolean result of mkdirs() is not inspected.
500 outputFilePath.mkdirs();
501 }
catch (SecurityException ex) {
502 LOGGER.log(Level.WARNING, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
506 return outputFolderPath;
/**
 * Builds the local path recorded for an extracted file:
 * "/" + relative module directory + "/" + current parent file name + "/" +
 * file name. Forward slashes are used regardless of platform.
 *
 * @param fileName Name of the extracted file.
 *
 * @return The path described above.
 */
518 private String getFileRelativePath(String fileName) {
520 return "/" + moduleDirRelative +
"/" + this.parentFileName +
"/" + fileName;
// Constructor taking only the name, local path and size of an extracted file.
// NOTE(review): the body is not visible in this excerpt; confirm which values
// the time fields receive.
540 ExtractedFile(String fileName, String localPath,
long size) {
// Constructor that additionally takes the four time values, in the order
// ctime, crtime, atime, mtime.
544 ExtractedFile(String fileName, String localPath,
long size,
long ctime,
long crtime,
long atime,
long mtime) {
// Tail of the signature of the embedded-document callback (NOTE(review): the
// first line of the signature is not visible; presumably parseEmbedded of the
// EmbeddedDocumentExtractor created in extractEmbeddedContentFromOOXML).
// Visible steps: detect the content type, derive a file name, write the bytes
// to the output folder and record an ExtractedFile under that name.
606 Metadata metadata,
boolean outputHtml)
throws SAXException, IOException {
609 MediaType contentType = detector.detect(stream, metadata);
// True only when the top-level type is none of image, video, application or
// audio. NOTE(review): the action taken in that case is not visible here.
611 if (!contentType.getType().equalsIgnoreCase(
"image")
612 && !contentType.getType().equalsIgnoreCase(
"video")
613 && !contentType.getType().equalsIgnoreCase(
"application")
614 && !contentType.getType().equalsIgnoreCase(
"audio")) {
// Name of the embedded resource as reported in the metadata.
619 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
// Checks whether a file with this name has already been recorded.
624 if (nameToExtractedFileMap.containsKey(name)) {
// Generated name: prefix + running counter. NOTE(review): the condition
// guarding this assignment is not visible in this excerpt.
629 name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
// Keep only the last path component and normalize it.
634 name = FilenameUtils.normalize(FilenameUtils.getName(name));
// No dot in the name: append the extension Tika associates with the detected
// MIME type.
638 if (name.indexOf(
'.') == -1) {
640 name += config.getMimeRepository().forName(contentType.toString()).getExtension();
641 }
catch (MimeTypeException ex) {
642 LOGGER.log(Level.WARNING,
"Failed to get suggested extension for the following type: " + contentType.toString(), ex);
646 File extractedFile =
new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
// The whole embedded stream is read into memory before it is written out.
647 byte[] fileData = IOUtils.toByteArray(stream);
648 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
649 nameToExtractedFileMap.put(name,
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
// Returns a new list holding the ExtractedFile values recorded in the map, so
// the caller does not get the map's own values view.
658 return new ArrayList<>(nameToExtractedFileMap.values());