19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.charset.Charset;
26 import java.nio.charset.StandardCharsets;
27 import java.nio.file.InvalidPathException;
28 import java.nio.file.Path;
29 import java.nio.file.Paths;
30 import java.util.ArrayList;
31 import java.util.Collections;
32 import java.util.HashMap;
33 import java.util.List;
35 import java.util.logging.Level;
36 import org.apache.commons.io.FilenameUtils;
37 import org.apache.commons.io.IOUtils;
38 import org.apache.poi.hwpf.usermodel.Picture;
39 import org.apache.poi.hslf.usermodel.HSLFPictureData;
40 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
41 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
42 import org.apache.poi.hwpf.HWPFDocument;
43 import org.apache.poi.hwpf.model.PicturesTable;
44 import org.apache.poi.sl.usermodel.PictureData.PictureType;
45 import org.apache.poi.ss.usermodel.Workbook;
46 import org.apache.tika.config.TikaConfig;
47 import org.apache.tika.detect.Detector;
48 import org.apache.tika.exception.TikaException;
49 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
50 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
51 import org.apache.tika.metadata.Metadata;
52 import org.apache.tika.mime.MediaType;
53 import org.apache.tika.mime.MimeTypeException;
54 import org.apache.tika.parser.AutoDetectParser;
55 import org.apache.tika.parser.ParseContext;
56 import org.apache.tika.parser.Parser;
57 import org.apache.tika.parser.microsoft.OfficeParserConfig;
58 import org.apache.tika.sax.BodyContentHandler;
59 import org.openide.util.NbBundle;
75 import org.xml.sax.ContentHandler;
76 import org.xml.sax.SAXException;
/**
 * Extracts content embedded in document files, using Apache POI readers
 * (HWPFDocument, HSLFSlideShow, HSSFWorkbook) and an Apache Tika parser.
 * NOTE(review): this listing omits some source lines, so a few members used
 * further down are declared on lines that are not shown here.
 */
82 class DocumentEmbeddedContentExtractor {
// Class-wide logger.
86 private static final Logger LOGGER =
Logger.
getLogger(DocumentEmbeddedContentExtractor.class.getName());
// Name of the sub-folder (under the module directories) used for the current file's output.
88 private String parentFileName;
// Prefix of generated names for extracted items; a counter is appended to it.
89 private final String UNKNOWN_IMAGE_NAME_PREFIX =
"image_";
// Used for the output-folder existence checks and directory creation.
91 private final FileTaskExecutor fileTaskExecutor;
// Roots from which the relative and absolute output paths are built.
93 private String moduleDirRelative;
94 private String moduleDirAbsolute;
// Tika parser, the detector obtained from it, and the default Tika configuration
// (its MIME repository is used to look up a file extension for a detected type).
96 private AutoDetectParser parser =
new AutoDetectParser();
97 private Detector detector = parser.getDetector();
98 private TikaConfig config = TikaConfig.getDefaultConfig();
/**
 * MIME types of the document formats this extractor recognises. Each constant
 * carries its MIME type string, and toString() returns that string, which is
 * what isContentExtractionSupported compares against.
 */
103 enum SupportedExtractionFormats {
105 DOC(
"application/msword"),
106 DOCX(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
107 PPT(
"application/vnd.ms-powerpoint"),
108 PPTX(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
109 XLS(
"application/vnd.ms-excel"),
110 XLSX(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
111 PDF(
"application/pdf");
// MIME type string associated with the constant.
113 private final String mimeType;
115 SupportedExtractionFormats(
final String mimeType) {
116 this.mimeType = mimeType;
// Returns the MIME type string rather than the constant name.
120 public String toString() {
121 return this.mimeType;
// Format matched by isContentExtractionSupported; extractEmbeddedContent switches on it.
124 private SupportedExtractionFormats abstractFileExtractionFormat;
// Stores the supplied collaborators and module output directories in fields.
// NOTE(review): the enclosing signature is not part of this listing; these
// assignments are presumably the constructor body — confirm against the full file.
130 this.context = context;
131 this.fileTypeDetector = fileTypeDetector;
132 this.moduleDirRelative = moduleDirRelative;
133 this.moduleDirAbsolute = moduleDirAbsolute;
134 this.fileTaskExecutor = fileTaskExecutor;
/**
 * Checks whether the file's detected MIME type matches one of the
 * SupportedExtractionFormats values. On a match the format is remembered in
 * abstractFileExtractionFormat for the later extraction call.
 *
 * @param abstractFile the file whose MIME type is looked up via fileTypeDetector
 *
 * @return NOTE(review): the return statements are on lines not shown here;
 *         presumably true on a match and false otherwise — confirm.
 */
146 boolean isContentExtractionSupported(AbstractFile abstractFile) {
147 String abstractFileMimeType = fileTypeDetector.
getMIMEType(abstractFile);
// Compare against the MIME type string of every supported format.
148 for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
149 if (s.toString().equals(abstractFileMimeType)) {
150 abstractFileExtractionFormat = s;
/**
 * Extracts the embedded content of the given file with the extractor that
 * corresponds to abstractFileExtractionFormat, then registers every extracted
 * item as a derived file through fileManager.addDerivedFile.
 * NOTE(review): several lines of this method (try keywords, case labels,
 * early returns, trailing call arguments) are not part of this listing.
 *
 * @param abstractFile the document to extract embedded content from
 */
166 void extractEmbeddedContent(AbstractFile abstractFile) {
167 List<ExtractedFile> listOfExtractedImages = null;
168 List<AbstractFile> listOfExtractedImageAbstractFiles = null;
// Looks at whether the file already has children and whether its output folder
// exists; per the log message below this is an "already processed" check.
// The action taken when both hold is on lines not shown.
181 if (abstractFile.hasChildren()) {
183 File outputFolder = Paths.get(moduleDirAbsolute, parentFileName).toFile();
184 if (fileTaskExecutor.exists(outputFolder)) {
188 }
catch (TskCoreException | FileTaskExecutor.FileTaskFailedException | InterruptedException e) {
189 LOGGER.log(Level.SEVERE, String.format(
"Error checking if %s (objID = %d) has already has been processed, skipping", abstractFile.getName(), abstractFile.getId()), e);
// Dispatch to the format-specific extractor. The case labels are not shown, so
// which format maps to which call cannot be read off this listing.
194 switch (abstractFileExtractionFormat) {
198 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
201 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
204 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
207 listOfExtractedImages = extractImagesFromXls(abstractFile);
210 listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
// A null result is handled specially (body not shown; presumably an early return).
216 if (listOfExtractedImages == null) {
220 listOfExtractedImageAbstractFiles =
new ArrayList<>();
221 for (ExtractedFile extractedImage : listOfExtractedImages) {
// NOTE(review): getAtime() is passed twice below. In the addDerivedFile
// signature the argument after atime is mtime, so the modified-time slot
// receives the access time. This looks like it should be the item's
// modified time — confirm and fix against the full file.
223 listOfExtractedImageAbstractFiles.add(fileManager.
addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
224 extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
226 }
catch (TskCoreException ex) {
// A failure to add a derived file is logged at SEVERE with the bundle message.
227 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(),
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex);
// Follow-up work happens only when something was extracted (body not shown).
230 if (!listOfExtractedImages.isEmpty()) {
/**
 * Parses the file with the Tika parser after registering an
 * EmbeddedContentExtractor in the parse context, and returns the files that
 * extractor collected.
 *
 * @param abstractFile the file to parse
 *
 * @return the list obtained from the EmbeddedContentExtractor.
 *         NOTE(review): what happens after a parse failure is logged is on
 *         lines not shown here.
 */
245 private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
246 Metadata metadata =
new Metadata();
248 ParseContext parseContext =
new ParseContext();
// Register the parser itself in the context.
249 parseContext.set(Parser.class, parser);
// NOTE(review): -1 is presumably Tika's "no write limit" value for
// BodyContentHandler — confirm against the Tika documentation.
253 ContentHandler contentHandler =
new BodyContentHandler(-1);
// Switch the PPTX and DOCX handling to the SAX-based extractors.
257 OfficeParserConfig officeParserConfig =
new OfficeParserConfig();
258 officeParserConfig.setUseSAXPptxExtractor(
true);
259 officeParserConfig.setUseSAXDocxExtractor(
true);
260 parseContext.set(OfficeParserConfig.class, officeParserConfig);
// Install this class's extractor as the EmbeddedDocumentExtractor for the parse.
262 EmbeddedDocumentExtractor extractor =
new EmbeddedContentExtractor(parseContext);
263 parseContext.set(EmbeddedDocumentExtractor.class, extractor);
264 ReadContentInputStream stream =
new ReadContentInputStream(abstractFile);
267 parser.parse(stream, contentHandler, metadata, parseContext);
268 }
catch (IOException | SAXException | TikaException ex) {
// Parse failures are logged at WARNING together with the exception.
269 LOGGER.log(Level.WARNING,
"Error while parsing file, skipping: " + abstractFile.getName(), ex);
273 return ((EmbeddedContentExtractor) extractor).getExtractedImages();
/**
 * Reads the file as an HWPFDocument, takes all pictures from its pictures
 * table, writes each one into the output folder and records it as an
 * ExtractedFile.
 * NOTE(review): the try keywords, the early returns, the declaration of
 * "data" and the update of pictureNumber are on lines not shown here.
 *
 * @param af the file to read
 *
 * @return the list of extracted images
 */
284 private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
285 List<Picture> listOfAllPictures;
288 HWPFDocument doc =
new HWPFDocument(
new ReadContentInputStream(af));
289 PicturesTable pictureTable = doc.getPicturesTable();
290 listOfAllPictures = pictureTable.getAllPictures();
291 }
catch (Exception ex) {
// Only the exception message (not the stack trace) is logged here.
308 LOGGER.log(Level.WARNING,
"Word document container could not be initialized. Reason: {0}", ex.getMessage());
312 Path outputFolderPath;
// Empty picture list and unavailable output folder are both special-cased
// (bodies not shown).
313 if (listOfAllPictures.isEmpty()) {
316 outputFolderPath = getOutputFolderPath(this.parentFileName);
318 if (outputFolderPath == null) {
321 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
323 int pictureNumber = 0;
324 for (Picture picture : listOfAllPictures) {
// Generated name: prefix + counter + "." + extension suggested by POI.
325 String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber +
"." + picture.suggestFileExtension();
327 data = picture.getContent();
328 }
catch (Exception ex) {
331 writeExtractedImage(Paths.get(outputFolderPath.toString(), fileName).toString(), data);
333 listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
337 return listOfExtractedImages;
/**
 * Reads the file as an HSLFSlideShow, takes its picture data entries, writes
 * each one into the output folder and records it as an ExtractedFile.
 * NOTE(review): the try keywords, the early returns and the code that
 * defines "i", "ext" and "data" are on lines not shown here.
 *
 * @param af the file to read
 *
 * @return the list of extracted images
 */
348 private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
349 List<HSLFPictureData> listOfAllPictures = null;
352 HSLFSlideShow ppt =
new HSLFSlideShow(
new ReadContentInputStream(af));
353 listOfAllPictures = ppt.getPictureData();
354 }
catch (Exception ex) {
// Only the exception message (not the stack trace) is logged here.
366 LOGGER.log(Level.WARNING,
"PPT container could not be initialized. Reason: {0}", ex.getMessage());
372 Path outputFolderPath;
// Empty picture list and unavailable output folder are both special-cased
// (bodies not shown).
373 if (listOfAllPictures.isEmpty()) {
376 outputFolderPath = getOutputFolderPath(this.parentFileName);
378 if (outputFolderPath == null) {
385 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
387 for (HSLFPictureData pictureData : listOfAllPictures) {
// The picture type is read here; the lines that use it are not shown
// (presumably they choose "ext").
391 PictureType type = pictureData.getType();
412 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext;
414 data = pictureData.getData();
415 }
catch (Exception ex) {
418 writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
// NOTE(review): getData() is called a second time for the size although "data"
// already holds the bytes; this call sits outside the guarded read above.
419 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
422 return listOfExtractedImages;
/**
 * Reads the file as an HSSFWorkbook, takes all of its pictures, writes each
 * one into the output folder and records it as an ExtractedFile.
 * NOTE(review): the try keywords, the early returns and the code that
 * defines "i" and "data" are on lines not shown here.
 *
 * @param af the file to read
 *
 * @return the list of extracted images
 */
433 private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
434 List<? extends
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
437 Workbook xls =
new HSSFWorkbook(
new ReadContentInputStream(af));
438 listOfAllPictures = xls.getAllPictures();
439 }
catch (Exception ex) {
// Only the exception message (not the stack trace) is logged here.
458 LOGGER.log(Level.WARNING,
"Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage());
464 Path outputFolderPath;
// Empty picture list and unavailable output folder are both special-cased
// (bodies not shown).
465 if (listOfAllPictures.isEmpty()) {
468 outputFolderPath = getOutputFolderPath(this.parentFileName);
470 if (outputFolderPath == null) {
475 List<ExtractedFile> listOfExtractedImages =
new ArrayList<>();
477 for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
// Generated name: prefix + counter + "." + extension suggested by POI.
478 String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i +
"." + pictureData.suggestFileExtension();
480 data = pictureData.getData();
481 }
catch (Exception ex) {
484 writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
// NOTE(review): getData() is called a second time for the size although "data"
// already holds the bytes; this call sits outside the guarded read above.
485 listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
488 return listOfExtractedImages;
/**
 * Extracts attachments from the file with a PDFAttachmentExtractor and turns
 * every extracted entry into an ExtractedFile.
 * NOTE(review): the try keyword and the trailing arguments of the extract
 * call and of the ExtractedFile constructor are on lines not shown here.
 *
 * @param abstractFile the file to extract attachments from
 *
 * @return the extracted files; an empty list when the output folder is not
 *         available or when extraction throws one of the caught exceptions
 */
499 private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
500 Path outputDirectory = getOutputFolderPath(parentFileName);
// Nothing can be written without an output folder.
501 if (outputDirectory == null) {
502 return Collections.emptyList();
504 PDFAttachmentExtractor pdfExtractor =
new PDFAttachmentExtractor(parser);
// Result map: key is used as the file name, value supplies the written path and length.
507 Map<String, PDFAttachmentExtractor.NewResourceData> extractedAttachments = pdfExtractor.extract(
508 new ReadContentInputStream(abstractFile), abstractFile.getId(),
512 List<ExtractedFile> extractedFiles =
new ArrayList<>();
513 extractedAttachments.entrySet().forEach((pathEntry) -> {
514 String fileName = pathEntry.getKey();
515 Path writeLocation = pathEntry.getValue().getPath();
516 int fileSize = pathEntry.getValue().getLength();
// The relative path is built from the name of the written file, not from the map key.
517 extractedFiles.add(
new ExtractedFile(fileName,
518 getFileRelativePath(writeLocation.getFileName().toString()),
522 return extractedFiles;
523 }
catch (IOException | SAXException | TikaException | InvalidPathException ex) {
// Failures are logged at WARNING with the file name and object id.
524 LOGGER.log(Level.WARNING,
"Error attempting to extract attachments from PDFs for file Name: " + abstractFile.getName() +
" ID: " + abstractFile.getId(), ex);
526 return Collections.emptyList();
/**
 * Opens an EncodedFileOutputStream (XOR1 encoding) over a FileOutputStream at
 * the given path inside a try-with-resources block. An IOException is caught
 * and logged at WARNING instead of being thrown to the caller.
 * NOTE(review): the statement that writes "data" to the stream is on a line
 * not shown here.
 *
 * @param outputPath path of the file to create
 * @param data       bytes of the extracted item
 */
536 private void writeExtractedImage(String outputPath, byte[] data) {
537 try (EncodedFileOutputStream fos =
new EncodedFileOutputStream(
new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
539 }
catch (IOException ex) {
540 LOGGER.log(Level.WARNING,
"Could not write to the provided location: " + outputPath, ex);
/**
 * Resolves the output folder moduleDirAbsolute/parentFileName, creating it
 * through fileTaskExecutor when it does not exist yet.
 *
 * @param parentFileName name of the sub-folder under moduleDirAbsolute
 *
 * @return the folder path, or null when the folder is missing and could not
 *         be created. NOTE(review): the value returned after the catch block
 *         is on lines not shown; callers in this file test the result for
 *         null, so it is presumably null — confirm.
 */
554 private Path getOutputFolderPath(String parentFileName) {
555 Path outputFolderPath = Paths.get(moduleDirAbsolute, parentFileName);
557 File outputFolder = outputFolderPath.toFile();
// Create the folder only when it is missing; a failed mkdirs yields null.
558 if (!fileTaskExecutor.exists(outputFolder)) {
559 if (!fileTaskExecutor.mkdirs(outputFolder)) {
560 outputFolderPath = null;
563 return outputFolderPath;
564 }
catch (SecurityException | FileTaskFailedException | InterruptedException ex) {
// NOTE(review): InterruptedException is handled together with the other
// failures and no re-interrupt is visible here — confirm whether the
// thread's interrupt status should be restored.
565 LOGGER.log(Level.SEVERE, String.format(
"Failed to find or create %s", outputFolderPath), ex);
/**
 * Builds the path moduleDirRelative/parentFileName/fileName.
 *
 * @param fileName name of the extracted file
 *
 * @return the joined path as a string
 */
579 private String getFileRelativePath(String fileName) {
580 return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
/**
 * Passes the name through escapeFileName and then round-trips the result
 * through a UTF-8 encode followed by a UTF-8 decode.
 * NOTE(review): presumably the round trip replaces characters that cannot be
 * encoded as UTF-8 — confirm against the Charset.encode documentation.
 *
 * @param fileName the name to sanitize
 *
 * @return the escaped, re-decoded name
 */
591 private static String utf8SanitizeFileName(String fileName) {
592 Charset charset = StandardCharsets.UTF_8;
593 return charset.decode(charset.encode(escapeFileName(fileName))).toString();
// Constructor taking the file name, the local path and the size.
// NOTE(review): the constructor bodies and the enclosing class header are not
// part of this listing.
613 ExtractedFile(String fileName, String localPath,
long size) {
// Constructor that additionally takes the four timestamps
// (ctime, crtime, atime, mtime), in that order.
617 ExtractedFile(String fileName, String localPath,
long size,
long ctime,
long crtime,
long atime,
long mtime) {
// Handles one embedded document: detects its type, derives a file name,
// writes the bytes to the output folder and records the result in
// nameToExtractedFileMap.
// NOTE(review): the start of this method's signature, several braces and the
// bodies of some conditionals are not part of this listing; the method is
// presumably the EmbeddedDocumentExtractor callback — confirm.
679 Metadata metadata,
boolean outputHtml)
throws SAXException, IOException {
682 MediaType contentType = detector.detect(stream, metadata);
// Top-level types other than image, video, application and audio are
// special-cased (body not shown; presumably skipped).
684 if (!contentType.getType().equalsIgnoreCase(
"image")
685 && !contentType.getType().equalsIgnoreCase(
"video")
686 && !contentType.getType().equalsIgnoreCase(
"application")
687 && !contentType.getType().equalsIgnoreCase(
"audio")) {
// Name as reported in the metadata under the resource-name key.
692 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
// A name that was already recorded is special-cased (body not shown).
697 if (nameToExtractedFileMap.containsKey(name)) {
// Generated fallback name; the condition guarding it is on lines not shown.
703 name = UNKNOWN_IMAGE_NAME_PREFIX +
fileCount;
// Keep only the file-name part, normalize it, then sanitize it.
708 name = FilenameUtils.normalize(FilenameUtils.getName(name));
710 name = utf8SanitizeFileName(name);
// Names without a '.' get the extension that the MIME repository lists for
// the detected type.
714 if (name.indexOf(
'.') == -1) {
716 name += config.getMimeRepository().forName(contentType.toString()).getExtension();
717 }
catch (MimeTypeException ex) {
718 LOGGER.log(Level.WARNING,
"Failed to get suggested extension for the following type: " + contentType.toString(), ex);
// Write and record the item only when the output folder is available.
722 Path outputFolderPath = getOutputFolderPath(parentFileName);
723 if (outputFolderPath != null) {
724 File extractedFile =
new File(Paths.get(outputFolderPath.toString(), name).toString());
// The whole stream is read into memory before it is written out.
725 byte[] fileData = IOUtils.toByteArray(stream);
726 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
727 nameToExtractedFileMap.put(name,
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
// Returns a new ArrayList holding the current values of nameToExtractedFileMap.
// NOTE(review): the enclosing method header is not part of this listing;
// presumably this is the accessor called from extractEmbeddedContentFromOOXML.
737 return new ArrayList<>(nameToExtractedFileMap.values());
FileManager getFileManager()
String getMIMEType(AbstractFile file)
synchronized DerivedFile addDerivedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime, boolean isFile, Content parentObj, String rederiveDetails, String toolName, String toolVersion, String otherDetails, TskData.EncodingType encodingType)
void addFilesToJob(List< AbstractFile > files)
void fireModuleContentEvent(ModuleContentEvent moduleContentEvent)
static String escapeFileName(String fileName)
synchronized static Logger getLogger(String name)
static Case getCurrentCaseThrows()
static synchronized IngestServices getInstance()