19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.charset.Charset;
26 import java.nio.charset.StandardCharsets;
27 import java.nio.file.InvalidPathException;
28 import java.nio.file.Path;
29 import java.nio.file.Paths;
30 import java.util.ArrayList;
31 import java.util.Collections;
32 import java.util.HashMap;
33 import java.util.List;
35 import java.util.Map.Entry;
36 import java.util.logging.Level;
37 import org.apache.commons.io.FilenameUtils;
38 import org.apache.commons.io.IOUtils;
39 import org.apache.poi.hwpf.usermodel.Picture;
40 import org.apache.poi.hslf.usermodel.HSLFPictureData;
41 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
42 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
43 import org.apache.poi.hwpf.HWPFDocument;
44 import org.apache.poi.hwpf.model.PicturesTable;
45 import org.apache.poi.sl.usermodel.PictureData.PictureType;
46 import org.apache.poi.ss.usermodel.Workbook;
47 import org.apache.tika.config.TikaConfig;
48 import org.apache.tika.detect.Detector;
49 import org.apache.tika.exception.TikaException;
50 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
51 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
52 import org.apache.tika.metadata.Metadata;
53 import org.apache.tika.mime.MediaType;
54 import org.apache.tika.mime.MimeTypeException;
55 import org.apache.tika.parser.AutoDetectParser;
56 import org.apache.tika.parser.ParseContext;
57 import org.apache.tika.parser.Parser;
58 import org.apache.tika.parser.microsoft.OfficeParserConfig;
59 import org.apache.tika.sax.BodyContentHandler;
60 import org.openide.util.NbBundle;
76 import org.xml.sax.ContentHandler;
77 import org.xml.sax.SAXException;
/**
 * Extracts content embedded in document files whose MIME type is listed in
 * SupportedExtractionFormats and writes the extracted items beneath the
 * module output directory, in a subfolder named after the parent file.
 */
83 class DocumentEmbeddedContentExtractor {
// Class-wide logger.
87 private static final Logger LOGGER =
Logger.
getLogger(DocumentEmbeddedContentExtractor.class.getName());
// Name of the per-document subfolder used under both module directories.
89 private String parentFileName;
// Prefix for generated names of extracted items; a counter and an extension are appended by the callers.
// NOTE(review): this is a per-instance constant; it could be static.
90 private final String UNKNOWN_IMAGE_NAME_PREFIX =
"image_";
// Used for the exists()/mkdirs() checks on the output folder.
92 private final FileTaskExecutor fileTaskExecutor;
// Module output directory, relative form; used to build the local paths recorded for extracted files.
94 private String moduleDirRelative;
// Module output directory, absolute form; used to build the on-disk write location.
95 private String moduleDirAbsolute;
// Tika parser used for OOXML parsing and handed to the PDF attachment extractor.
97 private AutoDetectParser parser =
new AutoDetectParser();
// Detector obtained from the parser; used to type embedded streams.
98 private Detector detector = parser.getDetector();
// Default Tika configuration; its MIME repository supplies file extensions for extension-less names.
99 private TikaConfig config = TikaConfig.getDefaultConfig();
/**
 * Document formats handled by this extractor, each paired with its MIME type
 * string. isContentExtractionSupported() compares toString() of each constant
 * against the detected MIME type of a file.
 */
104 enum SupportedExtractionFormats {
106 DOC(
"application/msword"),
107 DOCX(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
108 PPT(
"application/vnd.ms-powerpoint"),
109 PPTX(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
110 XLS(
"application/vnd.ms-excel"),
111 XLSX(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
112 PDF(
"application/pdf");
// MIME type string associated with this format.
114 private final String mimeType;
// Stores the MIME type string for the constant being constructed.
116 SupportedExtractionFormats(
final String mimeType) {
117 this.mimeType = mimeType;
// Returns the MIME type string rather than the constant name.
121 public String toString() {
122 return this.mimeType;
// Format recorded by isContentExtractionSupported() when a MIME type matches;
// it selects the extraction routine in extractEmbeddedContent().
125 private SupportedExtractionFormats abstractFileExtractionFormat;
// Constructor body (the signature is not visible in this listing): stores the
// supplied collaborators and the relative/absolute module output directories.
131 this.context = context;
132 this.fileTypeDetector = fileTypeDetector;
133 this.moduleDirRelative = moduleDirRelative;
134 this.moduleDirAbsolute = moduleDirAbsolute;
135 this.fileTaskExecutor = fileTaskExecutor;
/**
 * Checks whether the detected MIME type of the given file equals the MIME
 * type of one of the SupportedExtractionFormats constants. On a match the
 * constant is recorded in abstractFileExtractionFormat.
 *
 * @param abstractFile The file whose MIME type is checked.
 *
 * @return The return statements are not visible in this listing; presumably
 *         true on a match and false otherwise (to be confirmed).
 */
147 boolean isContentExtractionSupported(AbstractFile abstractFile) {
148 String abstractFileMimeType = fileTypeDetector.
getMIMEType(abstractFile);
149 for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
// Cancellation is polled on every iteration; the action taken is not visible here.
150 if (checkForIngestCancellation(abstractFile)) {
153 if (s.toString().equals(abstractFileMimeType)) {
// Remember the matched format for the later extractEmbeddedContent() call.
154 abstractFileExtractionFormat = s;
/**
 * Logs an INFO message naming the file (name and object ID) to record that
 * ingest was cancelled and results for it may be incomplete. The condition
 * guarding the log call and the return statements are not visible in this
 * listing; callers treat a true result as "stop working on this file".
 *
 * @param file The file being processed; used only for the log message in
 *             the visible code.
 *
 * @return Presumably true when ingest has been cancelled (to be confirmed).
 */
172 private boolean checkForIngestCancellation(AbstractFile file) {
// NOTE(review): the message has no separator between {0} and "Object ID",
// so the file name and the label run together in the log output.
174 LOGGER.log(Level.INFO,
"Ingest was cancelled. Results extracted from the following document file may be incomplete. Name: {0}Object ID: {1}",
new Object[]{file.getName(), file.getId()});
/**
 * Extracts the content embedded in the given document using the routine
 * selected by abstractFileExtractionFormat, then registers each extracted
 * item through fileManager.addDerivedFile().
 *
 * Several statements of this method (try headers, case labels, returns) are
 * not visible in this listing; the comments below describe only what is.
 *
 * @param abstractFile The document to extract embedded content from.
 */
189 void extractEmbeddedContent(AbstractFile abstractFile) {
190 List<ExtractedFile> listOfExtractedImages = null;
191 List<AbstractFile> listOfExtractedImageAbstractFiles = null;
// A file that already has children and an existing output folder looks like
// one that was processed before; the action taken is not visible here.
204 if (abstractFile.hasChildren()) {
206 File outputFolder = Paths.get(moduleDirAbsolute, parentFileName).toFile();
207 if (fileTaskExecutor.exists(outputFolder)) {
211 }
// NOTE(review): InterruptedException is caught here and no re-interrupt of
// the current thread is visible in this listing - confirm.
catch (TskCoreException | FileTaskExecutor.FileTaskFailedException | InterruptedException e) {
212 LOGGER.log(Level.SEVERE, String.format(
"Error checking if %s (objID = %d) has already has been processed, skipping", abstractFile.getName(), abstractFile.getId()), e);
215 if (checkForIngestCancellation(abstractFile)) {
// Dispatch on the format recorded by isContentExtractionSupported(); the
// case labels are not visible in this listing.
219 switch (abstractFileExtractionFormat) {
223 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
226 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
229 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
232 listOfExtractedImages = extractImagesFromXls(abstractFile);
235 listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
// A null list is checked for explicitly; the action taken is not visible here.
241 if (listOfExtractedImages == null) {
245 listOfExtractedImageAbstractFiles =
new ArrayList<>();
246 for (ExtractedFile extractedImage : listOfExtractedImages) {
247 if (checkForIngestCancellation(abstractFile)) {
// NOTE(review): getAtime() is passed for both the sixth and seventh timestamp
// arguments below. The seventh parameter of addDerivedFile is mtime, so the
// modified time is recorded as the access time; an mtime getter on
// ExtractedFile was presumably intended - confirm and fix.
251 listOfExtractedImageAbstractFiles.add(fileManager.
addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
252 extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
254 }
// A failure to add a derived file is logged at SEVERE with a bundled message.
catch (TskCoreException ex) {
255 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex);
// Follow-up work for a non-empty result; the body is not visible here.
258 if (!listOfExtractedImages.isEmpty()) {
/**
 * Extracts embedded content by running the Tika parser over the file with an
 * EmbeddedContentExtractor registered in the parse context, so that embedded
 * documents are routed to that extractor.
 *
 * @param abstractFile The file to parse.
 *
 * @return The files collected by the EmbeddedContentExtractor. What is
 *         returned on cancellation or on a parse error is not visible in
 *         this listing.
 */
273 private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
274 Metadata metadata =
new Metadata();
276 ParseContext parseContext =
new ParseContext();
277 parseContext.set(Parser.class, parser);
// A write limit of -1 disables the handler's character limit (Tika BodyContentHandler).
281 ContentHandler contentHandler =
new BodyContentHandler(-1);
// Enable the SAX-based extractors for PPTX and DOCX.
285 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
286 officeParserConfig.setUseSAXPptxExtractor(
true);
287 officeParserConfig.setUseSAXDocxExtractor(
true);
288 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Register the custom extractor that receives each embedded document.
289 EmbeddedDocumentExtractor extractor =
new EmbeddedContentExtractor(parseContext);
290 parseContext.set(EmbeddedDocumentExtractor.class, extractor);
// NOTE(review): no close() or try-with-resources for this stream is visible
// in this listing - confirm the stream is released.
291 ReadContentInputStream stream =
new ReadContentInputStream(abstractFile);
292 if (checkForIngestCancellation(abstractFile)) {
296 parser.parse(stream, contentHandler, metadata, parseContext);
297 }
// Parse failures are logged at WARNING with the file name.
catch (IOException | SAXException | TikaException ex) {
298 LOGGER.log(Level.WARNING,
"Error while parsing file, skipping: " + abstractFile.getName(), ex);
302 return ((EmbeddedContentExtractor) extractor).getExtractedImages();
/**
 * Extracts the pictures of a Word (.doc) file: opens it as an HWPFDocument,
 * reads all pictures from its pictures table, writes each one to the output
 * folder and records it as an ExtractedFile.
 *
 * @param af The file to extract pictures from.
 *
 * @return The list of extracted files when the loop completes. The values
 *         returned on the early-exit paths (load failure, no pictures, no
 *         output folder, cancellation) are not visible in this listing.
 */
313 private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
314 List<Picture> listOfAllPictures;
317 HWPFDocument doc =
new HWPFDocument(
new ReadContentInputStream(af));
318 PicturesTable pictureTable = doc.getPicturesTable();
319 listOfAllPictures = pictureTable.getAllPictures();
320 }
// Any failure while loading the document is logged at WARNING; only the
// exception message is logged, not the stack trace.
catch (Exception ex) {
337 LOGGER.log(Level.WARNING,
"Word document container could not be initialized. Reason: {0}", ex.getMessage());
341 Path outputFolderPath;
342 if (listOfAllPictures.isEmpty()) {
345 outputFolderPath = getOutputFolderPath(this.parentFileName);
// getOutputFolderPath() yields null when the folder could not be created.
347 if (outputFolderPath == null) {
350 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
// Counter used in generated file names; its increment is not visible here.
352 int pictureNumber = 0;
353 for (Picture picture : listOfAllPictures) {
354 if (checkForIngestCancellation(af)) {
// Generated name: prefix, counter, then the extension suggested by POI.
357 String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber +
"." + picture.suggestFileExtension();
359 data = picture.getContent();
360 }
// Handling of a failed getContent() call is not visible in this listing.
catch (Exception ex) {
363 writeExtractedImage(Paths.get(outputFolderPath.toString(), fileName).toString(), data);
365 listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
369 return listOfExtractedImages;
/**
 * Extracts the pictures of a PowerPoint (.ppt) file: opens it as an
 * HSLFSlideShow, reads its picture data, writes each picture to the output
 * folder and records it as an ExtractedFile.
 *
 * @param af The file to extract pictures from.
 *
 * @return The list of extracted files when the loop completes. The values
 *         returned on the early-exit paths are not visible in this listing.
 */
380 private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
381 List<HSLFPictureData> listOfAllPictures = null;
384 HSLFSlideShow ppt =
new HSLFSlideShow(
new ReadContentInputStream(af));
385 listOfAllPictures = ppt.getPictureData();
386 }
// Any failure while loading the slide show is logged at WARNING; only the
// exception message is logged, not the stack trace.
catch (Exception ex) {
398 LOGGER.log(Level.WARNING,
"PPT container could not be initialized. Reason: {0}", ex.getMessage());
404 Path outputFolderPath;
// NOTE(review): listOfAllPictures starts as null; this dereference relies on
// the catch block above leaving the method (not visible here) - confirm.
405 if (listOfAllPictures.isEmpty()) {
408 outputFolderPath = getOutputFolderPath(this.parentFileName);
// getOutputFolderPath() yields null when the folder could not be created.
410 if (outputFolderPath == null) {
417 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
419 for (HSLFPictureData pictureData : listOfAllPictures) {
420 if (checkForIngestCancellation(af)) {
// The picture type is read here; the mapping from type to the extension
// "ext" used below is not visible in this listing.
425 PictureType type = pictureData.getType();
// Generated name: prefix, counter "i" (declared outside the visible lines), extension.
446 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext;
448 data = pictureData.getData();
449 }
// Handling of a failed getData() call is not visible in this listing.
catch (Exception ex) {
452 writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
// NOTE(review): getData() is called a second time only to obtain the length,
// outside the try block above; "data" presumably holds the same bytes.
453 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
456 return listOfExtractedImages;
/**
 * Extracts the pictures of an Excel (.xls) file: opens it as an HSSFWorkbook,
 * reads all of its pictures, writes each one to the output folder and
 * records it as an ExtractedFile.
 *
 * @param af The file to extract pictures from.
 *
 * @return The list of extracted files when the loop completes. The values
 *         returned on the early-exit paths are not visible in this listing.
 */
467 private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
468 List<? extends
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
471 Workbook xls =
new HSSFWorkbook(
new ReadContentInputStream(af));
472 listOfAllPictures = xls.getAllPictures();
473 }
// Any failure while loading the workbook is logged at WARNING; only the
// exception message is logged, not the stack trace.
catch (Exception ex) {
492 LOGGER.log(Level.WARNING,
"Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage());
498 Path outputFolderPath;
// NOTE(review): listOfAllPictures starts as null; this dereference relies on
// the catch block above leaving the method (not visible here) - confirm.
499 if (listOfAllPictures.isEmpty()) {
502 outputFolderPath = getOutputFolderPath(this.parentFileName);
// getOutputFolderPath() yields null when the folder could not be created.
504 if (outputFolderPath == null) {
509 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
511 for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
512 if (checkForIngestCancellation(af)) {
// Generated name: prefix, counter "i" (declared outside the visible lines),
// then the extension suggested by POI.
515 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i +
"." + pictureData.suggestFileExtension();
517 data = pictureData.getData();
518 }
// Handling of a failed getData() call is not visible in this listing.
catch (Exception ex) {
521 writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
// NOTE(review): getData() is called a second time only to obtain the length,
// outside the try block above; "data" presumably holds the same bytes.
522 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
525 return listOfExtractedImages;
/**
 * Extracts the attachments of a PDF file using a PDFAttachmentExtractor and
 * converts each reported attachment into an ExtractedFile.
 *
 * @param abstractFile The PDF file to extract attachments from.
 *
 * @return The extracted files; an empty list when the output folder is
 *         unavailable or when extraction throws one of the caught exceptions.
 */
536 private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
537 Path outputDirectory = getOutputFolderPath(parentFileName);
// Without an output folder there is nowhere to write, so nothing is extracted.
538 if (outputDirectory == null) {
539 return Collections.emptyList();
541 PDFAttachmentExtractor pdfExtractor =
new PDFAttachmentExtractor(parser);
// The map is keyed by attachment file name; each value carries the written
// path and the length. The remaining extract() arguments are not visible here.
544 Map<String, PDFAttachmentExtractor.NewResourceData> extractedAttachments = pdfExtractor.extract(
545 new ReadContentInputStream(abstractFile), abstractFile.getId(),
549 List<ExtractedFile> extractedFiles =
new ArrayList<>();
550 for (Entry<String, PDFAttachmentExtractor.NewResourceData> pathEntry : extractedAttachments.entrySet()) {
551 if (checkForIngestCancellation(abstractFile)) {
554 String fileName = pathEntry.getKey();
555 Path writeLocation = pathEntry.getValue().getPath();
556 int fileSize = pathEntry.getValue().getLength();
// The recorded local path uses the on-disk file name of the write location,
// which may differ from the attachment name used as the map key.
557 extractedFiles.add(
new ExtractedFile(fileName,
558 getFileRelativePath(writeLocation.getFileName().toString()),
561 return extractedFiles;
562 }
// Extraction failures are logged at WARNING and reported as "nothing extracted".
catch (IOException | SAXException | TikaException | InvalidPathException ex) {
563 LOGGER.log(Level.WARNING,
"Error attempting to extract attachments from PDFs for file Name: " + abstractFile.getName() +
" ID: " + abstractFile.getId(), ex);
565 return Collections.emptyList();
/**
 * Writes the given bytes to the given path through an EncodedFileOutputStream
 * configured with XOR1 encoding. The stream is opened in a
 * try-with-resources statement; the write call itself is not visible in this
 * listing. An IOException is logged at WARNING.
 *
 * @param outputPath Path of the file to write.
 * @param data       Bytes to write.
 */
575 private void writeExtractedImage(String outputPath, byte[] data) {
576 try (EncodedFileOutputStream fos =
new EncodedFileOutputStream(
new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
578 }
catch (IOException ex) {
579 LOGGER.log(Level.WARNING,
"Could not write to the provided location: " + outputPath, ex);
/**
 * Resolves the output folder for a parent file (moduleDirAbsolute joined with
 * the parent file name) and creates it through the FileTaskExecutor if it
 * does not exist yet.
 *
 * @param parentFileName Name of the subfolder under the absolute module
 *                       directory.
 *
 * @return The folder path, or null when the folder could not be created. The
 *         value returned after a caught exception is not visible in this
 *         listing; callers check for null.
 */
593 private Path getOutputFolderPath(String parentFileName) {
594 Path outputFolderPath = Paths.get(moduleDirAbsolute, parentFileName);
596 File outputFolder = outputFolderPath.toFile();
597 if (!fileTaskExecutor.exists(outputFolder)) {
// A failed mkdirs() is signalled to the caller with a null path.
598 if (!fileTaskExecutor.mkdirs(outputFolder)) {
599 outputFolderPath = null;
602 return outputFolderPath;
603 }
// NOTE(review): InterruptedException is caught here and no re-interrupt of
// the current thread is visible in this listing - confirm.
catch (SecurityException | FileTaskFailedException | InterruptedException ex) {
604 LOGGER.log(Level.SEVERE, String.format(
"Failed to find or create %s", outputFolderPath), ex);
/**
 * Builds the path of an extracted file relative to the module directory by
 * joining moduleDirRelative, the parent file name and the given file name.
 *
 * @param fileName Name of the extracted file.
 *
 * @return The joined path as a string.
 */
618 private String getFileRelativePath(String fileName) {
619 return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
/**
 * Escapes the given file name with escapeFileName() and then round-trips the
 * result through UTF-8 (encode followed by decode). Charset.encode() and
 * Charset.decode() replace malformed or unmappable sequences, so the
 * returned string contains only characters that survive UTF-8 encoding.
 *
 * @param fileName The file name to sanitize.
 *
 * @return The escaped, UTF-8 round-tripped file name.
 */
630 private static String utf8SanitizeFileName(String fileName) {
631 Charset charset = StandardCharsets.UTF_8;
632 return charset.decode(charset.encode(escapeFileName(fileName))).toString();
// Constructors of ExtractedFile (the class header and the constructor bodies
// are not visible in this listing).

// Creates an entry from a file name, its local path and its size; no
// timestamps are supplied by the caller.
652 ExtractedFile(String fileName, String localPath,
long size) {
// Creates an entry from a file name, its local path, its size and the four
// timestamps (changed, created, accessed, modified).
656 ExtractedFile(String fileName, String localPath,
long size,
long ctime,
long crtime,
long atime,
long mtime) {
// Tail of the embedded-document callback of EmbeddedContentExtractor (the
// method name and its leading parameters are not visible in this listing).
// It types the embedded stream, derives a safe file name, writes the bytes to
// the output folder and records the result in nameToExtractedFileMap.
718 Metadata metadata,
boolean outputHtml)
throws SAXException, IOException {
// Detect the embedded stream's media type.
721 MediaType contentType = detector.detect(stream, metadata);
// True when the top-level type is none of image, video, application or
// audio; the action taken for such content is not visible here.
723 if (!contentType.getType().equalsIgnoreCase(
"image")
724 && !contentType.getType().equalsIgnoreCase(
"video")
725 && !contentType.getType().equalsIgnoreCase(
"application")
726 && !contentType.getType().equalsIgnoreCase(
"audio")) {
// Resource name reported by Tika for the embedded item.
731 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
// A name that was already recorded is detected here; the handling is not
// visible in this listing.
736 if (nameToExtractedFileMap.containsKey(name)) {
// Generated fallback name: prefix followed by the running file count. The
// condition under which it applies is not visible here.
742 name = UNKNOWN_IMAGE_NAME_PREFIX +
fileCount;
// Keep only the final path component, normalized, so the embedded name
// cannot carry directory parts.
747 name = FilenameUtils.normalize(FilenameUtils.getName(name));
749 name = utf8SanitizeFileName(name);
// Names without a dot get the extension registered for the detected type.
753 if (name.indexOf(
'.') == -1) {
755 name += config.getMimeRepository().forName(contentType.toString()).getExtension();
756 }
// A failed extension lookup is logged at WARNING.
catch (MimeTypeException ex) {
757 LOGGER.log(Level.WARNING,
"Failed to get suggested extension for the following type: " + contentType.toString(), ex);
761 Path outputFolderPath = getOutputFolderPath(parentFileName);
// Nothing is written or recorded when the output folder is unavailable.
762 if (outputFolderPath != null) {
763 File extractedFile =
new File(Paths.get(outputFolderPath.toString(), name).toString());
// The whole embedded stream is read into memory before it is written out.
764 byte[] fileData = IOUtils.toByteArray(stream);
765 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
766 nameToExtractedFileMap.put(name,
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
// Return statement of a separate accessor (its header is not visible here):
// a new list holding the recorded ExtractedFile values.
776 return new ArrayList<>(nameToExtractedFileMap.values());
FileManager getFileManager()
String getMIMEType(AbstractFile file)
void addFilesToJob(List< AbstractFile > files)
boolean fileIngestIsCancelled()
void fireModuleContentEvent(ModuleContentEvent moduleContentEvent)
static String escapeFileName(String fileName)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
DerivedFile addDerivedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime, boolean isFile, Content parentObj, String rederiveDetails, String toolName, String toolVersion, String otherDetails, TskData.EncodingType encodingType)
static synchronized IngestServices getInstance()