19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.charset.Charset;
26 import java.nio.charset.StandardCharsets;
27 import java.nio.file.InvalidPathException;
28 import java.nio.file.Path;
29 import java.nio.file.Paths;
30 import java.util.ArrayList;
31 import java.util.Collections;
32 import java.util.HashMap;
33 import java.util.List;
35 import java.util.Map.Entry;
36 import java.util.logging.Level;
37 import org.apache.commons.io.FilenameUtils;
38 import org.apache.commons.io.IOUtils;
39 import org.apache.poi.hwpf.usermodel.Picture;
40 import org.apache.poi.hslf.usermodel.HSLFPictureData;
41 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
42 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
43 import org.apache.poi.hwpf.HWPFDocument;
44 import org.apache.poi.hwpf.model.PicturesTable;
45 import org.apache.poi.sl.usermodel.PictureData.PictureType;
46 import org.apache.poi.ss.usermodel.Workbook;
47 import org.apache.tika.config.TikaConfig;
48 import org.apache.tika.detect.Detector;
49 import org.apache.tika.exception.TikaException;
50 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
51 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
52 import org.apache.tika.metadata.Metadata;
53 import org.apache.tika.metadata.TikaCoreProperties;
54 import org.apache.tika.mime.MediaType;
55 import org.apache.tika.mime.MimeTypeException;
56 import org.apache.tika.parser.AutoDetectParser;
57 import org.apache.tika.parser.ParseContext;
58 import org.apache.tika.parser.Parser;
59 import org.apache.tika.parser.microsoft.OfficeParserConfig;
60 import org.apache.tika.sax.BodyContentHandler;
61 import org.openide.util.NbBundle;
77 import org.xml.sax.ContentHandler;
78 import org.xml.sax.SAXException;
84 class DocumentEmbeddedContentExtractor {
// Logger shared by all instances of this class.
88 private static final Logger LOGGER =
Logger.
getLogger(DocumentEmbeddedContentExtractor.class.getName());
// Path component joined onto moduleDirAbsolute / moduleDirRelative when output paths are built.
// Where it is assigned is not visible in this excerpt.
90 private String parentFileName;
// Prefix for file names generated by this class (followed by a counter and an extension).
91 private final String UNKNOWN_IMAGE_NAME_PREFIX =
"image_";
// Used for the exists()/mkdirs() calls made on output folders.
93 private final FileTaskExecutor fileTaskExecutor;
// Roots under which extracted files are addressed: one for paths recorded with
// extracted files (relative) and one for the folders that are checked/created (absolute).
95 private String moduleDirRelative;
96 private String moduleDirAbsolute;
// Tika parser, passed to parse contexts and to the PDF attachment extractor.
98 private AutoDetectParser parser =
new AutoDetectParser();
// Detector obtained from the parser above; used to type embedded streams.
99 private Detector detector = parser.getDetector();
// Default Tika configuration; its MIME repository is used to look up a file extension.
100 private TikaConfig config = TikaConfig.getDefaultConfig();
/**
 * Document formats from which embedded content can be extracted, each paired
 * with the MIME type string that identifies it.
 */
enum SupportedExtractionFormats {

    DOC("application/msword"),
    DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
    PPT("application/vnd.ms-powerpoint"),
    PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
    XLS("application/vnd.ms-excel"),
    XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
    PDF("application/pdf");

    private final String mimeType;

    /**
     * @param mimeType MIME type string associated with the format
     */
    SupportedExtractionFormats(final String mimeType) {
        this.mimeType = mimeType;
    }

    /**
     * Returns the MIME type string rather than the constant name, so the
     * result can be compared directly against a detected MIME type.
     *
     * @return the MIME type of this format
     */
    @Override
    public String toString() {
        return this.mimeType;
    }
}
// Format matched by the most recent isContentExtractionSupported() call;
// extractEmbeddedContent() switches on it to pick an extraction routine.
126 private SupportedExtractionFormats abstractFileExtractionFormat;
// Constructor body (its signature is not visible in this excerpt): stores the
// collaborators and the two module directory roots handed in by the caller.
132 this.context = context;
133 this.fileTypeDetector = fileTypeDetector;
134 this.moduleDirRelative = moduleDirRelative;
135 this.moduleDirAbsolute = moduleDirAbsolute;
136 this.fileTaskExecutor = fileTaskExecutor;
/**
 * Checks the file's detected MIME type against every SupportedExtractionFormats
 * value and remembers the matching format in abstractFileExtractionFormat.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably true on a match and false otherwise (including on
 * cancellation) - confirm against the full file.
 *
 * @param abstractFile the file whose MIME type is examined
 */
148 boolean isContentExtractionSupported(
AbstractFile abstractFile) {
149 String abstractFileMimeType = fileTypeDetector.
getMIMEType(abstractFile);
150 for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
// Cancellation is polled once per candidate format.
151 if (checkForIngestCancellation(abstractFile)) {
// toString() of the enum yields its MIME type string, so this compares MIME types.
154 if (s.toString().equals(abstractFileMimeType)) {
155 abstractFileExtractionFormat = s;
/**
 * Reports whether ingest has been cancelled and logs an INFO message naming
 * the file whose results may be incomplete.
 *
 * NOTE(review): the condition guarding the log call and the return
 * statements are not visible in this excerpt.
 *
 * @param file the file being processed; its name and object ID go into the log
 */
173 private boolean checkForIngestCancellation(
AbstractFile file) {
// NOTE(review): the message template has no separator between "{0}" and
// "Object ID", so the file name and the label run together in the log.
175 LOGGER.log(Level.INFO,
"Ingest was cancelled. Results extracted from the following document file may be incomplete. Name: {0}Object ID: {1}",
new Object[]{file.getName(), file.getId()});
/**
 * Extracts embedded content from the given document using the routine that
 * matches abstractFileExtractionFormat, then registers each extracted file
 * as a derived file of the document.
 *
 * NOTE(review): several lines (try blocks, returns, case labels, the tail of
 * the addDerivedFile call) are not visible in this excerpt; the comments
 * below describe only what the visible lines establish.
 *
 * @param abstractFile the document to extract embedded content from
 */
190 void extractEmbeddedContent(
AbstractFile abstractFile) {
191 List<ExtractedFile> listOfExtractedImages = null;
192 List<AbstractFile> listOfExtractedImageAbstractFiles = null;
// The output folder for this document is moduleDirAbsolute/parentFileName.
// Judging by the error message below, an existing folder is taken to mean
// the document was already processed - the handling itself is not visible.
207 File outputFolder = Paths.get(moduleDirAbsolute, parentFileName).toFile();
208 if (fileTaskExecutor.exists(outputFolder)) {
212 }
catch (
TskCoreException | FileTaskExecutor.FileTaskFailedException | InterruptedException e) {
// NOTE(review): the message reads "has already has been processed" (duplicated "has").
// NOTE(review): InterruptedException is caught here; no call restoring the
// thread's interrupt status is visible in this excerpt.
213 LOGGER.log(Level.SEVERE, String.format(
"Error checking if %s (objID = %d) has already has been processed, skipping", abstractFile.
getName(), abstractFile.
getId()), e);
216 if (checkForIngestCancellation(abstractFile)) {
// Dispatch to the format-specific extraction routine.
220 switch (abstractFileExtractionFormat) {
224 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
227 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
230 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
233 listOfExtractedImages = extractImagesFromXls(abstractFile);
236 listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
// A null list is handled separately from an empty one (handling not visible).
242 if (listOfExtractedImages == null) {
246 listOfExtractedImageAbstractFiles =
new ArrayList<>();
247 for (ExtractedFile extractedImage : listOfExtractedImages) {
248 if (checkForIngestCancellation(abstractFile)) {
// NOTE(review): getAtime() is passed for two consecutive arguments. The
// addDerivedFile signature lists (..., ctime, crtime, atime, mtime, ...), so
// the second one lands in the mtime slot - looks like it should be the
// modified time instead. Confirm and fix in the full file.
252 listOfExtractedImageAbstractFiles.add(fileManager.
addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
253 extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
256 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex);
// Follow-up work for a non-empty result is not visible in this excerpt.
259 if (!listOfExtractedImages.isEmpty()) {
/**
 * Parses the file with Tika and collects the embedded documents that the
 * registered EmbeddedContentExtractor wrote out during the parse.
 *
 * @param abstractFile the file to parse
 *
 * @return the files gathered by the EmbeddedContentExtractor
 */
274 private List<ExtractedFile> extractEmbeddedContentFromOOXML(
AbstractFile abstractFile) {
275 Metadata metadata =
new Metadata();
277 ParseContext parseContext =
new ParseContext();
// Register the parser itself in the context it will be invoked with.
278 parseContext.set(Parser.class, parser);
// NOTE(review): -1 is the write limit argument; per the Tika documentation
// this disables the limit - confirm for the Tika version in use.
282 ContentHandler contentHandler =
new BodyContentHandler(-1);
// Switch on the SAX-based extractors for pptx and docx.
286 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
287 officeParserConfig.setUseSAXPptxExtractor(
true);
288 officeParserConfig.setUseSAXDocxExtractor(
true);
289 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Embedded documents are routed to this class's own extractor.
290 EmbeddedDocumentExtractor extractor =
new EmbeddedContentExtractor(parseContext);
291 parseContext.set(EmbeddedDocumentExtractor.class, extractor);
293 if (checkForIngestCancellation(abstractFile)) {
// The stream being parsed is opened on a line not visible in this excerpt.
297 parser.parse(stream, contentHandler, metadata, parseContext);
298 }
catch (IOException | SAXException | TikaException ex) {
299 LOGGER.log(Level.WARNING,
"Error while parsing file, skipping: " + abstractFile.
getName(), ex);
303 return ((EmbeddedContentExtractor) extractor).getExtractedImages();
/**
 * Writes every picture from a Word document's pictures table to the output
 * folder and returns one ExtractedFile per picture.
 *
 * NOTE(review): the document is opened, and the early returns are taken, on
 * lines not visible in this excerpt.
 *
 * @param af the Word document
 *
 * @return the extracted pictures
 */
314 private List<ExtractedFile> extractEmbeddedImagesFromDoc(
AbstractFile af) {
315 List<Picture> listOfAllPictures;
319 PicturesTable pictureTable = doc.getPicturesTable();
320 listOfAllPictures = pictureTable.getAllPictures();
321 }
catch (Exception ex) {
// Only the exception message is logged here, not the exception itself.
338 LOGGER.log(Level.WARNING,
"Word document container could not be initialized. Reason: {0}", ex.getMessage());
342 Path outputFolderPath;
343 if (listOfAllPictures.isEmpty()) {
346 outputFolderPath = getOutputFolderPath(this.parentFileName);
// getOutputFolderPath() signals failure with null.
348 if (outputFolderPath == null) {
351 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
353 int pictureNumber = 0;
354 for (Picture picture : listOfAllPictures) {
355 if (checkForIngestCancellation(af)) {
// Generated name: prefix + counter + "." + the extension suggested for the picture.
358 String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber +
"." + picture.suggestFileExtension();
360 data = picture.getContent();
361 }
catch (Exception ex) {
364 writeExtractedImage(Paths.get(outputFolderPath.toString(), fileName).toString(), data);
366 listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
370 return listOfExtractedImages;
/**
 * Writes every picture of a PowerPoint slide show to the output folder and
 * returns one ExtractedFile per picture.
 *
 * NOTE(review): the slide show is opened, the counter i and the extension
 * ext are set, and the early returns are taken, on lines not visible in this
 * excerpt.
 *
 * @param af the PowerPoint file
 *
 * @return the extracted pictures
 */
381 private List<ExtractedFile> extractEmbeddedImagesFromPpt(
AbstractFile af) {
382 List<HSLFPictureData> listOfAllPictures = null;
386 listOfAllPictures = ppt.getPictureData();
387 }
catch (Exception ex) {
// Only the exception message is logged here, not the exception itself.
399 LOGGER.log(Level.WARNING,
"PPT container could not be initialized. Reason: {0}", ex.getMessage());
405 Path outputFolderPath;
406 if (listOfAllPictures.isEmpty()) {
409 outputFolderPath = getOutputFolderPath(this.parentFileName);
// getOutputFolderPath() signals failure with null.
411 if (outputFolderPath == null) {
418 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
420 for (HSLFPictureData pictureData : listOfAllPictures) {
421 if (checkForIngestCancellation(af)) {
// The picture type is read here; presumably it selects ext on lines not shown.
426 PictureType type = pictureData.getType();
447 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext;
449 data = pictureData.getData();
450 }
catch (Exception ex) {
453 writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
// NOTE(review): getData() is invoked again just for the length although
// data was assigned from the same call above - data.length looks
// equivalent; confirm data's declaration in the full file.
454 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
457 return listOfExtractedImages;
/**
 * Writes every picture of an Excel workbook to the output folder and returns
 * one ExtractedFile per picture.
 *
 * NOTE(review): the workbook is opened, the counter i is maintained, and the
 * early returns are taken, on lines not visible in this excerpt.
 *
 * @param af the Excel file
 *
 * @return the extracted pictures
 */
468 private List<ExtractedFile> extractImagesFromXls(
AbstractFile af) {
469 List<? extends
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
473 listOfAllPictures = xls.getAllPictures();
474 }
catch (Exception ex) {
// Only the exception message is logged here, not the exception itself.
493 LOGGER.log(Level.WARNING,
"Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage());
499 Path outputFolderPath;
500 if (listOfAllPictures.isEmpty()) {
503 outputFolderPath = getOutputFolderPath(this.parentFileName);
// getOutputFolderPath() signals failure with null.
505 if (outputFolderPath == null) {
510 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
512 for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
513 if (checkForIngestCancellation(af)) {
// Generated name: prefix + counter + "." + the extension suggested for the picture.
516 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i +
"." + pictureData.suggestFileExtension();
518 data = pictureData.getData();
519 }
catch (Exception ex) {
522 writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
// NOTE(review): getData() is invoked again just for the length although
// data was assigned from the same call above - confirm whether data.length
// can be used instead.
523 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
526 return listOfExtractedImages;
/**
 * Extracts the attachments of a PDF through PDFAttachmentExtractor and
 * returns one ExtractedFile per attachment.
 *
 * @param abstractFile the PDF file
 *
 * @return the extracted attachments; an empty list when the output folder is
 *         unavailable or extraction fails with one of the caught exceptions
 */
537 private List<ExtractedFile> extractEmbeddedContentFromPDF(
AbstractFile abstractFile) {
538 Path outputDirectory = getOutputFolderPath(parentFileName);
// No usable output folder: nothing can be extracted.
539 if (outputDirectory == null) {
540 return Collections.emptyList();
542 PDFAttachmentExtractor pdfExtractor =
new PDFAttachmentExtractor(parser);
// Map from attachment name to the data (path and length) of the written resource.
// The arguments of extract() are on lines not visible in this excerpt.
545 Map<String, PDFAttachmentExtractor.NewResourceData> extractedAttachments = pdfExtractor.extract(
550 List<ExtractedFile> extractedFiles =
new ArrayList<>();
551 for (Entry<String, PDFAttachmentExtractor.NewResourceData> pathEntry : extractedAttachments.entrySet()) {
552 if (checkForIngestCancellation(abstractFile)) {
555 String fileName = pathEntry.getKey();
556 Path writeLocation = pathEntry.getValue().getPath();
557 int fileSize = pathEntry.getValue().getLength();
// The recorded path uses the name of the file actually written, which may
// differ from the map key used as the display name.
558 extractedFiles.add(
new ExtractedFile(fileName,
559 getFileRelativePath(writeLocation.getFileName().toString()),
562 return extractedFiles;
563 }
catch (IOException | SAXException | TikaException | InvalidPathException ex) {
564 LOGGER.log(Level.WARNING,
"Error attempting to extract attachments from PDFs for file Name: " + abstractFile.
getName() +
" ID: " + abstractFile.
getId(), ex);
566 return Collections.emptyList();
/**
 * Writes the given bytes to the given location. An IOException is logged as
 * a warning; no rethrow is visible in this excerpt.
 *
 * NOTE(review): the statements that open the stream and write the data are
 * not visible in this excerpt.
 *
 * @param outputPath location to write to
 * @param data       bytes to write
 */
576 private void writeExtractedImage(String outputPath, byte[] data) {
579 }
catch (IOException ex) {
580 LOGGER.log(Level.WARNING,
"Could not write to the provided location: " + outputPath, ex);
594 private Path getOutputFolderPath(String parentFileName) {
595 Path outputFolderPath = Paths.get(moduleDirAbsolute, parentFileName);
597 File outputFolder = outputFolderPath.toFile();
598 if (!fileTaskExecutor.exists(outputFolder)) {
599 if (!fileTaskExecutor.mkdirs(outputFolder)) {
600 outputFolderPath = null;
603 return outputFolderPath;
604 }
catch (SecurityException | FileTaskFailedException | InterruptedException ex) {
605 LOGGER.log(Level.SEVERE, String.format(
"Failed to find or create %s", outputFolderPath), ex);
619 private String getFileRelativePath(String fileName) {
620 return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
631 private static String utf8SanitizeFileName(String fileName) {
632 Charset charset = StandardCharsets.UTF_8;
633 return charset.decode(charset.encode(escapeFileName(fileName))).toString();
// Constructor taking only the file name, local path and size.
// NOTE(review): the body is not visible in this excerpt, so the values used
// for the four timestamps in this case cannot be stated from here.
653 ExtractedFile(String fileName, String localPath,
long size) {
// Constructor taking the name, local path and size plus the four timestamps
// (ctime, crtime, atime, mtime). The body is not visible in this excerpt.
657 ExtractedFile(String fileName, String localPath,
long size,
long ctime,
long crtime,
long atime,
long mtime) {
// Tail of the embedded-document callback's parameter list; the start of the
// signature is not visible in this excerpt. The visible body types the
// embedded stream, derives a file name for it, writes it to the output folder
// and records it in nameToExtractedFileMap.
719 Metadata metadata,
boolean outputHtml)
throws SAXException, IOException {
722 MediaType contentType = detector.detect(stream, metadata);
// Top-level types other than image, video, application and audio take a
// separate branch, which is not visible in this excerpt.
724 if (!contentType.getType().equalsIgnoreCase(
"image")
725 && !contentType.getType().equalsIgnoreCase(
"video")
726 && !contentType.getType().equalsIgnoreCase(
"application")
727 && !contentType.getType().equalsIgnoreCase(
"audio")) {
// Name of the embedded resource as reported in the Tika metadata.
732 String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
// A name that was already recorded is handled on lines not visible here.
737 if (nameToExtractedFileMap.containsKey(name)) {
// Generated name: prefix + running file count (the guarding condition is not visible).
743 name = UNKNOWN_IMAGE_NAME_PREFIX +
fileCount;
// Keep only the last path component, then normalize it.
// NOTE(review): per the Commons IO documentation FilenameUtils.normalize()
// can return null (e.g. for ".."); no null check is visible before the
// name is used below - confirm.
748 name = FilenameUtils.normalize(FilenameUtils.getName(name));
750 name = utf8SanitizeFileName(name);
// A name without a dot gets the extension registered for the detected MIME type.
754 if (name.indexOf(
'.') == -1) {
756 name += config.getMimeRepository().forName(contentType.toString()).getExtension();
757 }
catch (MimeTypeException ex) {
758 LOGGER.log(Level.WARNING,
"Failed to get suggested extension for the following type: " + contentType.toString(), ex);
762 Path outputFolderPath = getOutputFolderPath(parentFileName);
// Nothing is written or recorded when the output folder is unavailable (null).
763 if (outputFolderPath != null) {
764 File extractedFile =
new File(Paths.get(outputFolderPath.toString(), name).toString());
// The whole embedded stream is read into memory before it is written out.
765 byte[] fileData = IOUtils.toByteArray(stream);
766 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
767 nameToExtractedFileMap.put(name,
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
// Return statement of a separate accessor whose header is not visible in this
// excerpt: hands back a new list copied from the recorded files.
777 return new ArrayList<>(nameToExtractedFileMap.values());
FileManager getFileManager()
String getMIMEType(AbstractFile file)
void addFilesToJob(List< AbstractFile > files)
boolean fileIngestIsCancelled()
void fireModuleContentEvent(ModuleContentEvent moduleContentEvent)
static String escapeFileName(String fileName)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
DerivedFile addDerivedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime, boolean isFile, Content parentObj, String rederiveDetails, String toolName, String toolVersion, String otherDetails, TskData.EncodingType encodingType)
static synchronized IngestServices getInstance()