19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
 
   22 import java.io.FileOutputStream;
 
   23 import java.io.IOException;
 
   24 import java.io.InputStream;
 
   25 import java.nio.file.Path;
 
   26 import java.nio.file.Paths;
 
   27 import java.util.ArrayList;
 
   28 import java.util.Collections;
 
   29 import java.util.HashMap;
 
   30 import java.util.List;
 
   32 import java.util.logging.Level;
 
   33 import org.apache.commons.io.FilenameUtils;
 
   34 import org.apache.commons.io.IOUtils;
 
   35 import org.apache.poi.hwpf.usermodel.Picture;
 
   36 import org.apache.poi.hslf.usermodel.HSLFPictureData;
 
   37 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
 
   38 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 
   39 import org.apache.poi.hwpf.HWPFDocument;
 
   40 import org.apache.poi.hwpf.model.PicturesTable;
 
   41 import org.apache.poi.sl.usermodel.PictureData.PictureType;
 
   42 import org.apache.poi.ss.usermodel.Workbook;
 
   43 import org.apache.tika.config.TikaConfig;
 
   44 import org.apache.tika.detect.Detector;
 
   45 import org.apache.tika.exception.TikaException;
 
   46 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 
   47 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 
   48 import org.apache.tika.metadata.Metadata;
 
   49 import org.apache.tika.mime.MediaType;
 
   50 import org.apache.tika.mime.MimeTypeException;
 
   51 import org.apache.tika.parser.AutoDetectParser;
 
   52 import org.apache.tika.parser.ParseContext;
 
   53 import org.apache.tika.parser.Parser;
 
   54 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 
   55 import org.apache.tika.sax.BodyContentHandler;
 
   56 import org.openide.util.NbBundle;
 
   70 import org.xml.sax.ContentHandler;
 
   71 import org.xml.sax.SAXException;
 
   77 class DocumentEmbeddedContentExtractor {
 
   81     private static final Logger LOGGER = 
Logger.
getLogger(DocumentEmbeddedContentExtractor.class.getName());
 
   83     private String parentFileName;
 
   84     private final String UNKNOWN_IMAGE_NAME_PREFIX = 
"image_"; 
 
   87     private String moduleDirRelative;
 
   88     private String moduleDirAbsolute;
 
   90     private AutoDetectParser parser = 
new AutoDetectParser();
 
   91     private Detector detector = parser.getDetector();
 
   92     private TikaConfig config = TikaConfig.getDefaultConfig();
 
   /**
    * Document formats this extractor can process. Each constant carries the
    * MIME type string of its format, which {@link #toString()} returns.
    */
   enum SupportedExtractionFormats {

       DOC("application/msword"),
       DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
       PPT("application/vnd.ms-powerpoint"),
       PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
       XLS("application/vnd.ms-excel"),
       XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
       PDF("application/pdf");

       /** MIME type string associated with this format. */
       private final String mimeType;

       /**
        * @param type MIME type string of the format
        */
       SupportedExtractionFormats(final String type) {
           mimeType = type;
       }

       /**
        * @return the MIME type string of this format
        */
       @Override
       public String toString() {
           return mimeType;
       }
   }
 
  // Format matched by isContentExtractionSupported(); extractEmbeddedContent()
  // switches on it to choose the extraction routine.
  118     private SupportedExtractionFormats abstractFileExtractionFormat;
 
  // NOTE(review): the assignments below look like a constructor body whose
  // signature line is not present in this listing -- confirm against the
  // complete file. Each one stores a same-named value in an instance field.
  124         this.context = context;
 
  125         this.fileTypeDetector = fileTypeDetector;
 
  126         this.moduleDirRelative = moduleDirRelative;
 
  127         this.moduleDirAbsolute = moduleDirAbsolute;
 
  139     boolean isContentExtractionSupported(AbstractFile abstractFile) {
 
  140         String abstractFileMimeType = fileTypeDetector.
getMIMEType(abstractFile);
 
  141         for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
 
  142             if (s.toString().equals(abstractFileMimeType)) {
 
  143                 abstractFileExtractionFormat = s;
 
  /**
   * Extracts the embedded content of the given file with the routine that
   * matches abstractFileExtractionFormat and adds every extracted file to
   * the case as a derived file of abstractFile.
   *
   * Files that already have children and an existing output folder are
   * reported as already processed. A null result from the format-specific
   * routine is checked for before any derived file is added.
   *
   * @param abstractFile the document to extract embedded content from
   */
  159     void extractEmbeddedContent(AbstractFile abstractFile) {
 
  160         List<ExtractedFile> listOfExtractedImages = null;
 
  161         List<AbstractFile> listOfExtractedImageAbstractFiles = null;
 
  166             if (abstractFile.hasChildren()) {
 
  // Test for the unpacked folder without calling getOutputFolderPath(): that
  // helper creates the folder when it is missing, which made this check true
  // for every file that has children. The path is built the same way the
  // helper builds it.
  168                 if (
new File(moduleDirAbsolute + File.separator + parentFileName).exists()) {
 
  169                     LOGGER.log(Level.INFO, 
"File already has been processed as it has children and local unpacked file, skipping: {0}", abstractFile.getName()); 
 
  173         } 
catch (TskCoreException e) {
 
  174             LOGGER.log(Level.SEVERE, String.format(
"Error checking if file already has been processed, skipping: %s", parentFileName), e); 
 
  // Dispatch on the format recorded by isContentExtractionSupported().
  179         switch (abstractFileExtractionFormat) {
 
  183                 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
 
  186                 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
 
  189                 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
 
  192                 listOfExtractedImages = extractImagesFromXls(abstractFile);
 
  195                 listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
 
  201         if (listOfExtractedImages == null) {
 
  205         listOfExtractedImageAbstractFiles = 
new ArrayList<>();
 
  206         for (ExtractedFile extractedImage : listOfExtractedImages) {
 
  // The timestamps follow addDerivedFile's parameter order: ctime, crtime,
  // atime, mtime. The last one used to be getAtime() a second time, which
  // stored the access time as the modification time.
  208                 listOfExtractedImageAbstractFiles.add(fileManager.
addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
 
  209                         extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getMtime(),
 
  211             } 
catch (TskCoreException ex) {
 
  212                 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(), 
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); 
 
  215         if (!listOfExtractedImages.isEmpty()) {
 
  230     private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
 
  231         Metadata metadata = 
new Metadata();
 
  233         ParseContext parseContext = 
new ParseContext();
 
  234         parseContext.set(Parser.class, parser);
 
  238         ContentHandler contentHandler = 
new BodyContentHandler(-1);
 
  242         OfficeParserConfig officeParserConfig = 
new OfficeParserConfig();
 
  243         officeParserConfig.setUseSAXPptxExtractor(
true);
 
  244         officeParserConfig.setUseSAXDocxExtractor(
true);
 
  245         parseContext.set(OfficeParserConfig.class, officeParserConfig);
 
  247         EmbeddedDocumentExtractor extractor = 
new EmbeddedContentExtractor(parseContext);
 
  248         parseContext.set(EmbeddedDocumentExtractor.class, extractor);
 
  249         ReadContentInputStream stream = 
new ReadContentInputStream(abstractFile);
 
  252             parser.parse(stream, contentHandler, metadata, parseContext);
 
  253         } 
catch (IOException | SAXException | TikaException ex) {
 
  254             LOGGER.log(Level.WARNING, 
"Error while parsing file, skipping: " + abstractFile.getName(), ex); 
 
  258         return ((EmbeddedContentExtractor) extractor).getExtractedImages();
 
  /**
   * Extracts the pictures of a Word (.doc) document: the picture table of a
   * POI HWPFDocument is read, each picture's content is written to the
   * output folder and an ExtractedFile entry is recorded for it.
   *
   * @param af the .doc file to read
   *
   * @return the list of ExtractedFile entries built for the pictures
   */
  269     private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
 
  270         List<Picture> listOfAllPictures;
 
  273             HWPFDocument doc = 
new HWPFDocument(
new ReadContentInputStream(af));
 
  274             PicturesTable pictureTable = doc.getPicturesTable();
 
  275             listOfAllPictures = pictureTable.getAllPictures();
 
  // Every exception type is caught here; only its message is logged, not the
  // stack trace.
  276         } 
catch (Exception ex) {
 
  294             LOGGER.log(Level.WARNING, 
"Word document container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  298         String outputFolderPath;
 
  299         if (listOfAllPictures.isEmpty()) {
 
  302             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
  304         if (outputFolderPath == null) {
 
  307         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
  // Counter used to build the generated file names.
  309         int pictureNumber = 0; 
 
  310         for (Picture picture : listOfAllPictures) {
 
  // Generated name: prefix, counter, '.', extension suggested by POI.
  311             String fileName =  UNKNOWN_IMAGE_NAME_PREFIX +pictureNumber +
"."+ picture.suggestFileExtension();
 
  313                 data = picture.getContent();
 
  314             } 
catch (Exception ex) {
 
  317             writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
 
  // NOTE(review): the recorded size is picture.getSize(), not data.length --
  // confirm the two agree for every picture type.
  319             listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
 
  323         return listOfExtractedImages;
 
  /**
   * Extracts the pictures of a PowerPoint (.ppt) document: the picture data
   * of a POI HSLFSlideShow is read, each picture's bytes are written to the
   * output folder and an ExtractedFile entry is recorded for it.
   *
   * @param af the .ppt file to read
   *
   * @return the list of ExtractedFile entries built for the pictures
   */
  334     private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
 
  335         List<HSLFPictureData> listOfAllPictures = null;
 
  338             HSLFSlideShow ppt = 
new HSLFSlideShow(
new ReadContentInputStream(af));
 
  339             listOfAllPictures = ppt.getPictureData();
 
  // Every exception type is caught here; only its message is logged, not the
  // stack trace.
  340         } 
catch (Exception ex) {
 
  352             LOGGER.log(Level.WARNING, 
"PPT container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  358         String outputFolderPath;
 
  359         if (listOfAllPictures.isEmpty()) {
 
  362             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
  364         if (outputFolderPath == null) {
 
  371         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
  373         for (HSLFPictureData pictureData : listOfAllPictures) {
 
  377             PictureType type = pictureData.getType();
 
  398             String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; 
 
  400                 data = pictureData.getData();
 
  401             } 
catch (Exception ex) {
 
  404             writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
 
  // Reuse the bytes fetched above instead of calling getData() a second time
  // outside the try block that guards it.
  405             listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), data.length));
 
  408         return listOfExtractedImages;
 
  /**
   * Extracts the pictures of an Excel (.xls) document: all pictures of a POI
   * HSSFWorkbook are read, each picture's bytes are written to the output
   * folder and an ExtractedFile entry is recorded for it.
   *
   * @param af the .xls file to read
   *
   * @return the list of ExtractedFile entries built for the pictures
   */
  419     private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
 
  420         List<? extends 
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
 
  // NOTE(review): no close() of the workbook is visible in this listing --
  // confirm whether it needs to be released.
  423             Workbook xls = 
new HSSFWorkbook(
new ReadContentInputStream(af));
 
  424             listOfAllPictures = xls.getAllPictures();
 
  // Every exception type is caught here; only its message is logged, not the
  // stack trace.
  425         } 
catch (Exception ex) {
 
  444             LOGGER.log(Level.WARNING, 
"Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  450         String outputFolderPath;
 
  451         if (listOfAllPictures.isEmpty()) {
 
  454             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
  456         if (outputFolderPath == null) {
 
  461         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
  463         for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
 
  // Generated name: prefix, counter, '.', extension suggested by POI.
  464             String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + 
"." + pictureData.suggestFileExtension(); 
 
  466                 data = pictureData.getData();
 
  467             } 
catch (Exception ex) {
 
  470             writeExtractedImage(Paths.get(outputFolderPath, imageName).toString(), data);
 
  // Reuse the bytes fetched above instead of calling getData() a second time
  // outside the try block that guards it.
  471             listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), data.length));
 
  474         return listOfExtractedImages;
 
  /**
   * Extracts the attachments of a PDF with a PDFAttachmentExtractor and
   * converts the resulting name-to-path map into ExtractedFile entries.
   *
   * @param abstractFile the PDF file to read
   *
   * @return the ExtractedFile entries built from the extractor's map, or an
   *         empty list (the final return statement of the method)
   */
  484     private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
 
  485         PDFAttachmentExtractor pdfExtractor = 
new PDFAttachmentExtractor(parser);
 
  // NOTE(review): getOutputFolderPath() is null-checked by other callers;
  // Paths.get would throw a NullPointerException on a null result here.
  487             Path outputDirectory = Paths.get(getOutputFolderPath(parentFileName));
 
  // The remaining arguments of this call are not present in this listing.
  489             Map<String, Path> extractedAttachments = pdfExtractor.extract(
 
  490                     new ReadContentInputStream(abstractFile), abstractFile.getId(),
 
  494             List<ExtractedFile> extractedFiles = 
new ArrayList<>();
 
  // Map key: name given to the ExtractedFile. Map value: path whose file name
  // forms the relative path and whose on-disk length is used as the size.
  495             extractedAttachments.entrySet().forEach((pathEntry) -> {
 
  496                 String fileName = pathEntry.getKey();
 
  497                 Path writeLocation = pathEntry.getValue();
 
  498                 extractedFiles.add(
new ExtractedFile(fileName,
 
  499                         getFileRelativePath(writeLocation.getFileName().toString()), 
 
  500                         writeLocation.toFile().length()));
 
  503             return extractedFiles;
 
  504         } 
catch (IOException | SAXException | TikaException ex) {
 
  505             LOGGER.log(Level.WARNING, 
"Error attempting to extract attachments from PDFs", ex); 
 
  507         return Collections.emptyList();
 
  517     private void writeExtractedImage(String outputPath, byte[] data) {
 
  518         try (EncodedFileOutputStream fos = 
new EncodedFileOutputStream(
new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
 
  520         } 
catch (IOException ex) {
 
  521             LOGGER.log(Level.WARNING, 
"Could not write to the provided location: " + outputPath, ex); 
 
  533     private String getOutputFolderPath(String parentFileName) {
 
  534         String outputFolderPath = moduleDirAbsolute + File.separator + parentFileName;
 
  535         File outputFilePath = 
new File(outputFolderPath);
 
  536         if (!outputFilePath.exists()) {
 
  538                 outputFilePath.mkdirs();
 
  539             } 
catch (SecurityException ex) {
 
  540                 LOGGER.log(Level.WARNING, NbBundle.getMessage(
this.getClass(), 
"EmbeddedFileExtractorIngestModule.ImageExtractor.getOutputFolderPath.exception.msg", parentFileName), ex);
 
  544         return outputFolderPath;
 
  556     private String getFileRelativePath(String fileName) {
 
  557         return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
 
  // Constructor taking only name, local path and size. Its body is not
  // present in this listing.
  // NOTE(review): presumably it supplies default timestamps -- confirm
  // against the complete file.
  577         ExtractedFile(String fileName, String localPath, 
long size) {
 
  // Constructor taking name, local path, size and the four timestamps in the
  // order ctime, crtime, atime, mtime. Its body is not present in this
  // listing.
  581         ExtractedFile(String fileName, String localPath, 
long size, 
long ctime, 
long crtime, 
long atime, 
long mtime) {
 
  643                 Metadata metadata, 
boolean outputHtml) 
throws SAXException, IOException {
 
  // NOTE(review): the first line of this method's signature is not present in
  // this listing; the notes below describe only the visible statements.
  // Detect the media type of the embedded stream.
  646             MediaType contentType = detector.detect(stream, metadata);
 
  // True when the top-level type is none of image, video, application, audio.
  // The body of this branch is not present in this listing.
  648             if (!contentType.getType().equalsIgnoreCase(
"image") 
 
  649                     && !contentType.getType().equalsIgnoreCase(
"video") 
 
  650                     && !contentType.getType().equalsIgnoreCase(
"application") 
 
  651                     && !contentType.getType().equalsIgnoreCase(
"audio")) { 
 
  // Name of the embedded resource as recorded in the Tika metadata.
  656             String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
 
  // A name already present in the map is detected here; the body of this
  // branch is not present in this listing.
  661             if (nameToExtractedFileMap.containsKey(name)) {
 
  // Generated name: prefix followed by a running counter.
  666                 name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount++;
 
  // Keep only the file-name part and normalize it.
  // NOTE(review): FilenameUtils.normalize may return null for some inputs,
  // which would make name.indexOf below throw -- confirm.
  671                 name = FilenameUtils.normalize(FilenameUtils.getName(name));
 
  // A name without a '.' gets the extension that Tika's MIME repository
  // lists for the detected type.
  675             if (name.indexOf(
'.') == -1) {
 
  677                     name += config.getMimeRepository().forName(contentType.toString()).getExtension();
 
  678                 } 
catch (MimeTypeException ex) {
 
  679                     LOGGER.log(Level.WARNING, 
"Failed to get suggested extension for the following type: " + contentType.toString(), ex); 
 
  // Read the whole stream into memory, write it to the output folder and
  // record it in the map under its name.
  683             File extractedFile = 
new File(Paths.get(getOutputFolderPath(parentFileName), name).toString());
 
  684             byte[] fileData = IOUtils.toByteArray(stream);
 
  685             writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
 
  686             nameToExtractedFileMap.put(name, 
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
 
  // Returns a new list filled with the map's current values, so the caller
  // receives a copy rather than a view of the map.
  695             return new ArrayList<>(nameToExtractedFileMap.values());
 
FileManager getFileManager()
 
String getMIMEType(AbstractFile file)
 
synchronized DerivedFile addDerivedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime, boolean isFile, Content parentObj, String rederiveDetails, String toolName, String toolVersion, String otherDetails, TskData.EncodingType encodingType)
 
void addFilesToJob(List< AbstractFile > files)
 
void fireModuleContentEvent(ModuleContentEvent moduleContentEvent)
 
synchronized static Logger getLogger(String name)
 
static Case getCurrentCaseThrows()
 
static synchronized IngestServices getInstance()