19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
 
   22 import java.io.FileOutputStream;
 
   23 import java.io.IOException;
 
   24 import java.io.InputStream;
 
   25 import java.nio.charset.Charset;
 
   26 import java.nio.charset.StandardCharsets;
 
   27 import java.nio.file.InvalidPathException;
 
   28 import java.nio.file.Path;
 
   29 import java.nio.file.Paths;
 
   30 import java.util.ArrayList;
 
   31 import java.util.Collections;
 
   32 import java.util.HashMap;
 
   33 import java.util.List;
 
   35 import java.util.Map.Entry;
 
   36 import java.util.logging.Level;
 
   37 import org.apache.commons.io.FilenameUtils;
 
   38 import org.apache.commons.io.IOUtils;
 
   39 import org.apache.poi.hwpf.usermodel.Picture;
 
   40 import org.apache.poi.hslf.usermodel.HSLFPictureData;
 
   41 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
 
   42 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 
   43 import org.apache.poi.hwpf.HWPFDocument;
 
   44 import org.apache.poi.hwpf.model.PicturesTable;
 
   45 import org.apache.poi.sl.usermodel.PictureData.PictureType;
 
   46 import org.apache.poi.ss.usermodel.Workbook;
 
   47 import org.apache.tika.config.TikaConfig;
 
   48 import org.apache.tika.detect.Detector;
 
   49 import org.apache.tika.exception.TikaException;
 
   50 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 
   51 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 
   52 import org.apache.tika.metadata.Metadata;
 
   53 import org.apache.tika.mime.MediaType;
 
   54 import org.apache.tika.mime.MimeTypeException;
 
   55 import org.apache.tika.parser.AutoDetectParser;
 
   56 import org.apache.tika.parser.ParseContext;
 
   57 import org.apache.tika.parser.Parser;
 
   58 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 
   59 import org.apache.tika.sax.BodyContentHandler;
 
   60 import org.openide.util.NbBundle;
 
   76 import org.xml.sax.ContentHandler;
 
   77 import org.xml.sax.SAXException;
 
   83 class DocumentEmbeddedContentExtractor {
          // Extracts embedded content from document files. The formats handled by
          // the visible code are DOC, DOCX, PPT, PPTX, XLS, XLSX and PDF.
 
   87     private static final Logger LOGGER = 
Logger.
getLogger(DocumentEmbeddedContentExtractor.class.getName());
 
          // Used as the per-document output sub-folder name when building output
          // paths; where it is assigned is not visible in this listing.
   89     private String parentFileName;
 
          // Prefix for generated names of extracted items (prefix + counter + extension).
          // NOTE(review): declared as an instance field despite the constant-style
          // name; `static final` would be the conventional form - confirm nothing
          // relies on it being per-instance.
   90     private final String UNKNOWN_IMAGE_NAME_PREFIX = 
"image_"; 
 
   92     private final FileTaskExecutor fileTaskExecutor;
 
          // Output locations: the relative form is used for paths recorded with
          // extracted files, the absolute form for writing to disk.
   94     private String moduleDirRelative;
 
   95     private String moduleDirAbsolute;
 
          // Tika parser used by the OOXML and PDF extraction paths.
   97     private AutoDetectParser parser = 
new AutoDetectParser();
 
          // The parser's own detector; used to type each embedded stream.
   98     private Detector detector = parser.getDetector();
 
          // Default Tika configuration; used to look up a file extension for a MIME type.
   99     private TikaConfig config = TikaConfig.getDefaultConfig();
 
  104     enum SupportedExtractionFormats {
              // Document formats this extractor handles, each paired with its MIME type string.
 
  106         DOC(
"application/msword"), 
 
  107         DOCX(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"), 
 
  108         PPT(
"application/vnd.ms-powerpoint"), 
 
  109         PPTX(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"), 
 
  110         XLS(
"application/vnd.ms-excel"), 
 
  111         XLSX(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), 
 
  112         PDF(
"application/pdf"); 
 
  114         private final String mimeType;
 
  116         SupportedExtractionFormats(
final String mimeType) {
 
  117             this.mimeType = mimeType;
 
              // Returns the MIME type string rather than the constant name;
              // isContentExtractionSupported compares against this value.
  121         public String toString() {
 
  122             return this.mimeType;
 
  125     private SupportedExtractionFormats abstractFileExtractionFormat;
          // Written by isContentExtractionSupported and read by the switch in
          // extractEmbeddedContent.
 
          // Constructor body (its header is not visible in this listing): stores the
          // collaborators and the module output directories on the instance.
  131         this.context = context;
 
  132         this.fileTypeDetector = fileTypeDetector;
 
  133         this.moduleDirRelative = moduleDirRelative;
 
  134         this.moduleDirAbsolute = moduleDirAbsolute;
 
  135         this.fileTaskExecutor = fileTaskExecutor;
 
  147     boolean isContentExtractionSupported(AbstractFile abstractFile) {
              // Detects the file's MIME type and looks for a matching entry in
              // SupportedExtractionFormats. The return statements are not visible in
              // this listing.
              //
              // Side effect: on a match the format is stored in
              // abstractFileExtractionFormat, which extractEmbeddedContent later
              // switches on.
              // NOTE(review): that makes extractEmbeddedContent dependent on this
              // method having been called first for the same file; a switch on a null
              // enum reference throws NullPointerException - confirm callers always
              // honour that order.
 
  148         String abstractFileMimeType = fileTypeDetector.
getMIMEType(abstractFile);
 
  149         for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
 
  150             if (checkForIngestCancellation(abstractFile)) {
 
  153             if (s.toString().equals(abstractFileMimeType)) {
 
  154                 abstractFileExtractionFormat = s;
 
  172     private boolean checkForIngestCancellation(AbstractFile file) {
              // Helper polled throughout the extraction methods. The condition guarding
              // the log call and the return statements are not visible in this listing;
              // the visible part logs that results for the given file may be incomplete.
              // NOTE(review): the message template has no separator between "{0}" and
              // "Object ID", so the file name and the label run together in the log.
 
  174             LOGGER.log(Level.INFO, 
"Ingest was cancelled. Results extracted from the following document file may be incomplete. Name: {0}Object ID: {1}", 
new Object[]{file.getName(), file.getId()});
 
  189     void extractEmbeddedContent(AbstractFile abstractFile) {
              // Entry point: dispatches on abstractFileExtractionFormat to a
              // format-specific extractor, then registers each extracted item as a
              // derived file via fileManager.addDerivedFile.
 
  190         List<ExtractedFile> listOfExtractedImages = null;
 
  191         List<AbstractFile> listOfExtractedImageAbstractFiles = null;
 
              // Already-processed check: the file has children and its output folder exists.
              // What happens when both hold is not visible in this listing.
  204             if (abstractFile.hasChildren()) {
 
  206                 File outputFolder = Paths.get(moduleDirAbsolute, parentFileName).toFile();
 
  207                 if (fileTaskExecutor.exists(outputFolder)) {
 
  211         } 
catch (TskCoreException | FileTaskExecutor.FileTaskFailedException | InterruptedException e) {
 
              // NOTE(review): the message reads "has already has been processed" (typo).
              // NOTE(review): InterruptedException is handled here; no re-interrupt
              // of the current thread is visible - confirm whether one is needed.
  212             LOGGER.log(Level.SEVERE, String.format(
"Error checking if %s (objID = %d) has already has been processed, skipping", abstractFile.getName(), abstractFile.getId()), e); 
 
  215         if (checkForIngestCancellation(abstractFile)) {
 
              // The case labels are not visible in this listing; each visible
              // assignment calls one format-specific extractor.
  219         switch (abstractFileExtractionFormat) {
 
  223                 listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
 
  226                 listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
 
  229                 listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
 
  232                 listOfExtractedImages = extractImagesFromXls(abstractFile);
 
  235                 listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
 
              // The extractor result may be null, hence this check before iterating.
  241         if (listOfExtractedImages == null) {
 
  245         listOfExtractedImageAbstractFiles = 
new ArrayList<>();
 
  246         for (ExtractedFile extractedImage : listOfExtractedImages) {
 
  247             if (checkForIngestCancellation(abstractFile)) {
 
                  // NOTE(review): getAtime() is passed twice below. If addDerivedFile
                  // takes (..., ctime, crtime, atime, mtime, ...), the second occurrence
                  // lands in the mtime slot, so the derived file's modified time is set
                  // to its access time. This looks like it should be the ExtractedFile's
                  // mtime accessor - confirm that accessor exists and fix.
  251                 listOfExtractedImageAbstractFiles.add(fileManager.
addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
 
  252                         extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
 
  254             } 
catch (TskCoreException ex) {
 
  255                 LOGGER.log(Level.SEVERE, NbBundle.getMessage(
this.getClass(), 
"EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); 
 
  258         if (!listOfExtractedImages.isEmpty()) {
 
  273     private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
              // Runs the shared Tika parser over the file with an
              // EmbeddedContentExtractor registered in the parse context; the extractor
              // collects the embedded items, which are returned at the end.
 
  274         Metadata metadata = 
new Metadata();
 
  276         ParseContext parseContext = 
new ParseContext();
 
  277         parseContext.set(Parser.class, parser);
 
              // -1 disables the handler's write limit (see Tika BodyContentHandler docs).
  281         ContentHandler contentHandler = 
new BodyContentHandler(-1);
 
              // Use the SAX-based extractors for PPTX and DOCX.
  285         OfficeParserConfig officeParserConfig = 
new OfficeParserConfig();
 
  286         officeParserConfig.setUseSAXPptxExtractor(
true);
 
  287         officeParserConfig.setUseSAXDocxExtractor(
true);
 
  288         parseContext.set(OfficeParserConfig.class, officeParserConfig);
 
  289         EmbeddedDocumentExtractor extractor = 
new EmbeddedContentExtractor(parseContext);
 
  290         parseContext.set(EmbeddedDocumentExtractor.class, extractor);
 
              // NOTE(review): the stream is created as a plain local; no close() or
              // try-with-resources is visible in this listing - confirm it is released.
  291         ReadContentInputStream stream = 
new ReadContentInputStream(abstractFile);
 
  292         if (checkForIngestCancellation(abstractFile)) {
 
  296             parser.parse(stream, contentHandler, metadata, parseContext);
 
  297         } 
catch (IOException | SAXException | TikaException ex) {
 
  298             LOGGER.log(Level.WARNING, 
"Error while parsing file, skipping: " + abstractFile.getName(), ex); 
 
  302         return ((EmbeddedContentExtractor) extractor).getExtractedImages();
 
  313     private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
              // Opens the file as an HWPFDocument (POI), takes all pictures from its
              // pictures table, writes each one to the output folder and returns an
              // ExtractedFile entry per picture.
 
  314         List<Picture> listOfAllPictures;
 
              // NOTE(review): the ReadContentInputStream is created inline and no
              // close is visible in this listing.
  317             HWPFDocument doc = 
new HWPFDocument(
new ReadContentInputStream(af));
 
  318             PicturesTable pictureTable = doc.getPicturesTable();
 
  319             listOfAllPictures = pictureTable.getAllPictures();
 
  320         } 
catch (Exception ex) {
 
              // Only the exception message is logged here; the stack trace is not.
  337             LOGGER.log(Level.WARNING, 
"Word document container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  341         Path outputFolderPath;
 
  342         if (listOfAllPictures.isEmpty()) {
 
  345             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
              // getOutputFolderPath yields null when the folder cannot be created.
  347         if (outputFolderPath == null) {
 
  350         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
              // Counter used in generated file names; its increment is not visible
              // in this listing.
  352         int pictureNumber = 0; 
 
  353         for (Picture picture : listOfAllPictures) {
 
  354             if (checkForIngestCancellation(af)) {
 
  357             String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber + 
"." + picture.suggestFileExtension();
 
  359                 data = picture.getContent();
 
  360             } 
catch (Exception ex) {
 
  363             writeExtractedImage(Paths.get(outputFolderPath.toString(), fileName).toString(), data);
 
              // NOTE(review): writeExtractedImage returns void, so the entry is added
              // whether or not the write succeeded - confirm this is intended.
  365             listOfExtractedImages.add(
new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
 
  369         return listOfExtractedImages;
 
  380     private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
              // Opens the file as an HSLFSlideShow (POI), takes its picture data, writes
              // each picture to the output folder and returns an ExtractedFile entry per
              // picture.
 
  381         List<HSLFPictureData> listOfAllPictures = null;
 
              // NOTE(review): the ReadContentInputStream is created inline and no
              // close is visible in this listing.
  384             HSLFSlideShow ppt = 
new HSLFSlideShow(
new ReadContentInputStream(af));
 
  385             listOfAllPictures = ppt.getPictureData();
 
  386         } 
catch (Exception ex) {
 
              // Only the exception message is logged here; the stack trace is not.
  398             LOGGER.log(Level.WARNING, 
"PPT container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  404         Path outputFolderPath;
 
  405         if (listOfAllPictures.isEmpty()) {
 
  408             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
              // getOutputFolderPath yields null when the folder cannot be created.
  410         if (outputFolderPath == null) {
 
  417         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
  419         for (HSLFPictureData pictureData : listOfAllPictures) {
 
  420             if (checkForIngestCancellation(af)) {
 
              // How the picture type maps to the `ext` used below is not visible in
              // this listing.
  425             PictureType type = pictureData.getType();
 
              // `i` and `ext` are declared on lines not visible in this listing.
  446             String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; 
 
  448                 data = pictureData.getData();
 
  449             } 
catch (Exception ex) {
 
  452             writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
 
              // NOTE(review): getData() is called a second time here, after the
              // guarded fetch above; `data.length` looks equivalent and would avoid
              // the repeat call - confirm.
  453             listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
 
  456         return listOfExtractedImages;
 
  467     private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
              // Opens the file as an HSSFWorkbook (POI), takes all of its pictures,
              // writes each one to the output folder and returns an ExtractedFile entry
              // per picture.
 
  468         List<? extends 
org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
 
              // NOTE(review): the ReadContentInputStream is created inline and no
              // close is visible in this listing.
  471             Workbook xls = 
new HSSFWorkbook(
new ReadContentInputStream(af));
 
  472             listOfAllPictures = xls.getAllPictures();
 
  473         } 
catch (Exception ex) {
 
              // Only the exception message is logged here; the stack trace is not.
  492             LOGGER.log(Level.WARNING, 
"Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage()); 
 
  498         Path outputFolderPath;
 
  499         if (listOfAllPictures.isEmpty()) {
 
  502             outputFolderPath = getOutputFolderPath(this.parentFileName);
 
              // getOutputFolderPath yields null when the folder cannot be created.
  504         if (outputFolderPath == null) {
 
  509         List<ExtractedFile> listOfExtractedImages = 
new ArrayList<>();
 
  511         for (
org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
 
  512             if (checkForIngestCancellation(af)) {
 
              // `i` is declared on a line not visible in this listing.
  515             String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + 
"." + pictureData.suggestFileExtension(); 
 
  517                 data = pictureData.getData();
 
  518             } 
catch (Exception ex) {
 
  521             writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
 
              // NOTE(review): getData() is called a second time here, after the
              // guarded fetch above; `data.length` looks equivalent and would avoid
              // the repeat call - confirm.
  522             listOfExtractedImages.add(
new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
 
  525         return listOfExtractedImages;
 
  536     private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
              // Delegates to PDFAttachmentExtractor and converts each returned attachment
              // into an ExtractedFile. Returns an empty list when the output folder is
              // unavailable or when extraction throws.
 
  537         Path outputDirectory = getOutputFolderPath(parentFileName);
 
  538         if (outputDirectory == null) {
 
  539             return Collections.emptyList();
 
  541         PDFAttachmentExtractor pdfExtractor = 
new PDFAttachmentExtractor(parser);
 
              // The map is keyed by attachment file name; each value carries the
              // written path and the length. The remaining arguments to extract() are
              // not visible in this listing.
  544             Map<String, PDFAttachmentExtractor.NewResourceData> extractedAttachments = pdfExtractor.extract(
 
  545                     new ReadContentInputStream(abstractFile), abstractFile.getId(),
 
  549             List<ExtractedFile> extractedFiles = 
new ArrayList<>();
 
  550             for (Entry<String, PDFAttachmentExtractor.NewResourceData> pathEntry : extractedAttachments.entrySet()) {
 
  551                 if (checkForIngestCancellation(abstractFile)) {
 
  554                 String fileName = pathEntry.getKey();
 
  555                 Path writeLocation = pathEntry.getValue().getPath();
 
  556                 int fileSize = pathEntry.getValue().getLength();
 
                  // The recorded relative path uses the name of the written file,
                  // not the map key.
  557                 extractedFiles.add(
new ExtractedFile(fileName,
 
  558                         getFileRelativePath(writeLocation.getFileName().toString()),
 
  561             return extractedFiles;
 
  562         } 
catch (IOException | SAXException | TikaException | InvalidPathException ex) {
 
  563             LOGGER.log(Level.WARNING, 
"Error attempting to extract attachments from PDFs for file Name: " + abstractFile.getName() + 
" ID: " + abstractFile.getId(), ex); 
 
  565         return Collections.emptyList();
 
  575     private void writeExtractedImage(String outputPath, byte[] data) {
              // Opens an XOR1-encoded output stream on outputPath. The write call itself
              // is on a line not visible in this listing. An IOException is logged and
              // not propagated, so callers get no signal that the write failed.
              // NOTE(review): the inner FileOutputStream is not a separately declared
              // resource, so it would not be closed if the EncodedFileOutputStream
              // constructor threw.
 
  576         try (EncodedFileOutputStream fos = 
new EncodedFileOutputStream(
new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
 
  578         } 
catch (IOException ex) {
 
  579             LOGGER.log(Level.WARNING, 
"Could not write to the provided location: " + outputPath, ex); 
 
  593     private Path getOutputFolderPath(String parentFileName) {
              // Resolves <moduleDirAbsolute>/<parentFileName> and creates the folder if
              // it does not exist. Returns the path, or null when mkdirs reports failure.
              // The value returned after the catch block is not visible in this listing.
 
  594         Path outputFolderPath = Paths.get(moduleDirAbsolute, parentFileName);
 
  596             File outputFolder = outputFolderPath.toFile();
 
  597             if (!fileTaskExecutor.exists(outputFolder)) {
 
  598                 if (!fileTaskExecutor.mkdirs(outputFolder)) {
 
  599                     outputFolderPath = null;
 
  602             return outputFolderPath;
 
  603         } 
catch (SecurityException | FileTaskFailedException | InterruptedException ex) {
 
              // NOTE(review): InterruptedException is handled here; no re-interrupt
              // of the current thread is visible - confirm whether one is needed.
  604             LOGGER.log(Level.SEVERE, String.format(
"Failed to find or create %s", outputFolderPath), ex);
 
  618     private String getFileRelativePath(String fileName) {
              // Builds <moduleDirRelative>/<parentFileName>/<fileName> as a string, using
              // the instance's current parentFileName.
 
  619         return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
 
  630     private static String utf8SanitizeFileName(String fileName) {
              // Passes the name through escapeFileName, then round-trips the result
              // through UTF-8 (encode, then decode). Charset.encode replaces unmappable
              // characters rather than throwing, so the result is always
              // UTF-8-representable text.
 
  631         Charset charset = StandardCharsets.UTF_8;
 
  632         return charset.decode(charset.encode(escapeFileName(fileName))).toString();
 
  652         ExtractedFile(String fileName, String localPath, 
long size) {
              // Overload without timestamps; how the time fields are filled in this
              // case is not visible in this listing.
 
  656         ExtractedFile(String fileName, String localPath, 
long size, 
long ctime, 
long crtime, 
long atime, 
long mtime) {
              // Overload taking all four timestamps (ctime, crtime, atime, mtime);
              // the body is not visible in this listing.
 
  718                 Metadata metadata, 
boolean outputHtml) 
throws SAXException, IOException {
                  // (The start of this method's signature is not visible in this listing.)
                  // Handles one embedded stream found by Tika: detects its type, picks
                  // a safe, unique file name, writes the bytes to the output folder and
                  // records an ExtractedFile in nameToExtractedFileMap.
 
  721             MediaType contentType = detector.detect(stream, metadata);
 
              // Only image, video, application and audio content is of interest;
              // the action taken for other types is not visible in this listing.
  723             if (!contentType.getType().equalsIgnoreCase(
"image") 
 
  724                     && !contentType.getType().equalsIgnoreCase(
"video") 
 
  725                     && !contentType.getType().equalsIgnoreCase(
"application") 
 
  726                     && !contentType.getType().equalsIgnoreCase(
"audio")) { 
 
  731             String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
 
  736             if (nameToExtractedFileMap.containsKey(name)) {
 
              // Fallback name: prefix plus a counter. The condition selecting this
              // branch is not visible in this listing.
  742                 name = UNKNOWN_IMAGE_NAME_PREFIX + 
fileCount;
 
              // getName drops any directory part of the embedded resource name,
              // so the file stays inside the output folder.
              // NOTE(review): FilenameUtils.normalize can return null for names it
              // considers invalid (e.g. ".."); the result is passed straight to
              // utf8SanitizeFileName - confirm a null cannot reach it.
  747                 name = FilenameUtils.normalize(FilenameUtils.getName(name));
 
  749                 name = utf8SanitizeFileName(name);
 
              // No '.' in the name: append the extension Tika associates with the
              // detected MIME type.
  753             if (name.indexOf(
'.') == -1) {
 
  755                     name += config.getMimeRepository().forName(contentType.toString()).getExtension();
 
  756                 } 
catch (MimeTypeException ex) {
 
  757                     LOGGER.log(Level.WARNING, 
"Failed to get suggested extension for the following type: " + contentType.toString(), ex); 
 
  761             Path outputFolderPath = getOutputFolderPath(parentFileName);
 
  762             if (outputFolderPath != null) {
 
  763                 File extractedFile = 
new File(Paths.get(outputFolderPath.toString(), name).toString());
 
                  // The whole embedded stream is read into memory before writing.
  764                 byte[] fileData = IOUtils.toByteArray(stream);
 
  765                 writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
 
  766                 nameToExtractedFileMap.put(name, 
new ExtractedFile(name, getFileRelativePath(name), fileData.length));
 
  776             return new ArrayList<>(nameToExtractedFileMap.values());
 
FileManager getFileManager()
 
String getMIMEType(AbstractFile file)
 
void addFilesToJob(List< AbstractFile > files)
 
boolean fileIngestIsCancelled()
 
void fireModuleContentEvent(ModuleContentEvent moduleContentEvent)
 
static String escapeFileName(String fileName)
 
synchronized static Logger getLogger(String name)
 
static Case getCurrentCaseThrows()
 
DerivedFile addDerivedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime, boolean isFile, Content parentObj, String rederiveDetails, String toolName, String toolVersion, String otherDetails, TskData.EncodingType encodingType)
 
static synchronized IngestServices getInstance()