Autopsy  4.21.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
DocumentEmbeddedContentExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2015-2021 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
20 
21 import java.io.File;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.charset.Charset;
26 import java.nio.charset.StandardCharsets;
27 import java.nio.file.InvalidPathException;
28 import java.nio.file.Path;
29 import java.nio.file.Paths;
30 import java.util.ArrayList;
31 import java.util.Collections;
32 import java.util.HashMap;
33 import java.util.List;
34 import java.util.Map;
35 import java.util.Map.Entry;
36 import java.util.logging.Level;
37 import org.apache.commons.io.FilenameUtils;
38 import org.apache.commons.io.IOUtils;
39 import org.apache.poi.hwpf.usermodel.Picture;
40 import org.apache.poi.hslf.usermodel.HSLFPictureData;
41 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
42 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
43 import org.apache.poi.hwpf.HWPFDocument;
44 import org.apache.poi.hwpf.model.PicturesTable;
45 import org.apache.poi.sl.usermodel.PictureData.PictureType;
46 import org.apache.poi.ss.usermodel.Workbook;
47 import org.apache.tika.config.TikaConfig;
48 import org.apache.tika.detect.Detector;
49 import org.apache.tika.exception.TikaException;
50 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
51 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
52 import org.apache.tika.metadata.Metadata;
53 import org.apache.tika.mime.MediaType;
54 import org.apache.tika.mime.MimeTypeException;
55 import org.apache.tika.parser.AutoDetectParser;
56 import org.apache.tika.parser.ParseContext;
57 import org.apache.tika.parser.Parser;
58 import org.apache.tika.parser.microsoft.OfficeParserConfig;
59 import org.apache.tika.sax.BodyContentHandler;
60 import org.openide.util.NbBundle;
71 import org.sleuthkit.datamodel.AbstractFile;
72 import org.sleuthkit.datamodel.EncodedFileOutputStream;
73 import org.sleuthkit.datamodel.ReadContentInputStream;
74 import org.sleuthkit.datamodel.TskCoreException;
75 import org.sleuthkit.datamodel.TskData;
76 import org.xml.sax.ContentHandler;
77 import org.xml.sax.SAXException;
78 
83 class DocumentEmbeddedContentExtractor {
84 
85  private final FileManager fileManager;
86  private final IngestServices services;
87  private static final Logger LOGGER = Logger.getLogger(DocumentEmbeddedContentExtractor.class.getName());
88  private final IngestJobContext context;
89  private String parentFileName;
90  private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
91  private final FileTypeDetector fileTypeDetector;
92  private final FileTaskExecutor fileTaskExecutor;
93 
94  private String moduleDirRelative;
95  private String moduleDirAbsolute;
96 
97  private AutoDetectParser parser = new AutoDetectParser();
98  private Detector detector = parser.getDetector();
99  private TikaConfig config = TikaConfig.getDefaultConfig();
100 
/**
 * The document formats from which embedded content can be extracted. Each
 * constant carries the MIME type of its format, and that MIME type is the
 * string form of the constant.
 */
enum SupportedExtractionFormats {

    DOC("application/msword"), //NON-NLS
    DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), //NON-NLS
    PPT("application/vnd.ms-powerpoint"), //NON-NLS
    PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
    XLS("application/vnd.ms-excel"), //NON-NLS
    XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
    PDF("application/pdf"); //NON-NLS

    // MIME type of the format; returned by toString().
    private final String mediaTypeName;

    /**
     * Associates a format with its MIME type.
     *
     * @param mediaTypeName The MIME type of the format.
     */
    SupportedExtractionFormats(final String mediaTypeName) {
        this.mediaTypeName = mediaTypeName;
    }

    /**
     * Gets the MIME type of this format.
     *
     * @return The MIME type string.
     */
    @Override
    public String toString() {
        return mediaTypeName;
    }
}
// Format matched by the most recent successful call to
// isContentExtractionSupported(); extractEmbeddedContent() switches on it
// to choose the extraction routine.
private SupportedExtractionFormats abstractFileExtractionFormat;
126 
127  DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute, FileTaskExecutor fileTaskExecutor) throws NoCurrentCaseException {
128 
129  this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
130  this.services = IngestServices.getInstance();
131  this.context = context;
132  this.fileTypeDetector = fileTypeDetector;
133  this.moduleDirRelative = moduleDirRelative;
134  this.moduleDirAbsolute = moduleDirAbsolute;
135  this.fileTaskExecutor = fileTaskExecutor;
136  }
137 
147  boolean isContentExtractionSupported(AbstractFile abstractFile) {
148  String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
149  for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
150  if (checkForIngestCancellation(abstractFile)) {
151  break;
152  }
153  if (s.toString().equals(abstractFileMimeType)) {
154  abstractFileExtractionFormat = s;
155  return true;
156  }
157  }
158  return false;
159  }
160 
172  private boolean checkForIngestCancellation(AbstractFile file) {
173  if (fileTaskExecutor != null && context != null && context.fileIngestIsCancelled()) {
174  LOGGER.log(Level.INFO, "Ingest was cancelled. Results extracted from the following document file may be incomplete. Name: {0}Object ID: {1}", new Object[]{file.getName(), file.getId()});
175  return true;
176  }
177  return false;
178  }
179 
189  void extractEmbeddedContent(AbstractFile abstractFile) {
190  List<ExtractedFile> listOfExtractedImages = null;
191  List<AbstractFile> listOfExtractedImageAbstractFiles = null;
192  //save the parent file name with out illegal windows characters
193  this.parentFileName = utf8SanitizeFileName(EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile));
194 
195  // Skip files that already have been unpacked.
196  /*
197  * TODO (Jira-7145): Is the logic of this check correct? Also note that
198  * this suspect code used to have a bug in that makeOutputFolder() was
199  * called, so the directory was always created here if it did not exist,
200  * making this check only a call to AbstractFile.hasChildren() in
201  * practice.
202  */
203  try {
204  if (abstractFile.hasChildren()) {
205  //check if local unpacked dir exists
206  File outputFolder = Paths.get(moduleDirAbsolute, parentFileName).toFile();
207  if (fileTaskExecutor.exists(outputFolder)) {
208  return;
209  }
210  }
211  } catch (TskCoreException | FileTaskExecutor.FileTaskFailedException | InterruptedException e) {
212  LOGGER.log(Level.SEVERE, String.format("Error checking if %s (objID = %d) has already has been processed, skipping", abstractFile.getName(), abstractFile.getId()), e); //NON-NLS
213  return;
214  }
215  if (checkForIngestCancellation(abstractFile)) {
216  return;
217  }
218  // Call the appropriate extraction method based on mime type
219  switch (abstractFileExtractionFormat) {
220  case DOCX:
221  case PPTX:
222  case XLSX:
223  listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
224  break;
225  case DOC:
226  listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
227  break;
228  case PPT:
229  listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
230  break;
231  case XLS:
232  listOfExtractedImages = extractImagesFromXls(abstractFile);
233  break;
234  case PDF:
235  listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
236  break;
237  default:
238  break;
239  }
240 
241  if (listOfExtractedImages == null) {
242  return;
243  }
244  // the common task of adding abstractFile to derivedfiles is performed.
245  listOfExtractedImageAbstractFiles = new ArrayList<>();
246  for (ExtractedFile extractedImage : listOfExtractedImages) {
247  if (checkForIngestCancellation(abstractFile)) {
248  return;
249  }
250  try {
251  listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
252  extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
253  true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
254  } catch (TskCoreException ex) {
255  LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); //NON-NLS
256  }
257  }
258  if (!listOfExtractedImages.isEmpty()) {
259  services.fireModuleContentEvent(new ModuleContentEvent(abstractFile));
260  context.addFilesToJob(listOfExtractedImageAbstractFiles);
261  }
262  }
263 
273  private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
274  Metadata metadata = new Metadata();
275 
276  ParseContext parseContext = new ParseContext();
277  parseContext.set(Parser.class, parser);
278 
279  // Passing -1 to the BodyContentHandler constructor disables the Tika
280  // write limit (which defaults to 100,000 characters.
281  ContentHandler contentHandler = new BodyContentHandler(-1);
282 
283  // Use the more memory efficient Tika SAX parsers for DOCX and
284  // PPTX files (it already uses SAX for XLSX).
285  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
286  officeParserConfig.setUseSAXPptxExtractor(true);
287  officeParserConfig.setUseSAXDocxExtractor(true);
288  parseContext.set(OfficeParserConfig.class, officeParserConfig);
289  EmbeddedDocumentExtractor extractor = new EmbeddedContentExtractor(parseContext);
290  parseContext.set(EmbeddedDocumentExtractor.class, extractor);
291  ReadContentInputStream stream = new ReadContentInputStream(abstractFile);
292  if (checkForIngestCancellation(abstractFile)) {
293  return null; //null will cause the calling method to return.
294  }
295  try {
296  parser.parse(stream, contentHandler, metadata, parseContext);
297  } catch (IOException | SAXException | TikaException ex) {
298  LOGGER.log(Level.WARNING, "Error while parsing file, skipping: " + abstractFile.getName(), ex); //NON-NLS
299  return null;
300  }
301 
302  return ((EmbeddedContentExtractor) extractor).getExtractedImages();
303  }
304 
313  private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
314  List<Picture> listOfAllPictures;
315 
316  try {
317  HWPFDocument doc = new HWPFDocument(new ReadContentInputStream(af));
318  PicturesTable pictureTable = doc.getPicturesTable();
319  listOfAllPictures = pictureTable.getAllPictures();
320  } catch (Exception ex) {
321  // IOException:
322  // Thrown when the document has issues being read.
323 
324  // IllegalArgumentException:
325  // This will catch OldFileFormatException, which is thrown when the
326  // document's format is Word 95 or older. Alternatively, this is
327  // thrown when attempting to load an RTF file as a DOC file.
328  // However, our code verifies the file format before ever running it
329  // through the EmbeddedContentExtractor. This exception gets thrown in the
330  // "IN10-0137.E01" image regardless. The reason is unknown.
331  // IndexOutOfBoundsException:
332  // NullPointerException:
333  // These get thrown in certain images. The reason is unknown. It is
334  // likely due to problems with the file formats that POI is poorly
335  // handling.
336  //Any runtime exception escaping
337  LOGGER.log(Level.WARNING, "Word document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
338  return null;
339  }
340 
341  Path outputFolderPath;
342  if (listOfAllPictures.isEmpty()) {
343  return null;
344  } else {
345  outputFolderPath = getOutputFolderPath(this.parentFileName);
346  }
347  if (outputFolderPath == null) {
348  return null;
349  }
350  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
351  byte[] data = null;
352  int pictureNumber = 0; //added to ensure uniqueness in cases where suggestFullFileName returns duplicates
353  for (Picture picture : listOfAllPictures) {
354  if (checkForIngestCancellation(af)) {
355  return null; //null will cause the calling method to return.
356  }
357  String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber + "." + picture.suggestFileExtension();
358  try {
359  data = picture.getContent();
360  } catch (Exception ex) {
361  return null;
362  }
363  writeExtractedImage(Paths.get(outputFolderPath.toString(), fileName).toString(), data);
364  // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime
365  listOfExtractedImages.add(new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
366  pictureNumber++;
367  }
368 
369  return listOfExtractedImages;
370  }
371 
380  private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
381  List<HSLFPictureData> listOfAllPictures = null;
382 
383  try {
384  HSLFSlideShow ppt = new HSLFSlideShow(new ReadContentInputStream(af));
385  listOfAllPictures = ppt.getPictureData();
386  } catch (Exception ex) {
387  // IllegalArgumentException:
388  // This will catch OldFileFormatException, which is thrown when the
389  // document version is unsupported. The IllegalArgumentException may
390  // also get thrown for unknown reasons.
391 
392  // IOException:
393  // Thrown when the document has issues being read.
394  // IndexOutOfBoundsException:
395  // This gets thrown in certain images. The reason is unknown. It is
396  // likely due to problems with the file formats that POI is poorly
397  // handling.
398  LOGGER.log(Level.WARNING, "PPT container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
399  return null;
400  }
401 
402  // if no images are extracted from the PPT, return null, else initialize
403  // the output folder for image extraction.
404  Path outputFolderPath;
405  if (listOfAllPictures.isEmpty()) {
406  return null;
407  } else {
408  outputFolderPath = getOutputFolderPath(this.parentFileName);
409  }
410  if (outputFolderPath == null) {
411  return null;
412  }
413 
414  // extract the content to the above initialized outputFolder.
415  // extraction path - outputFolder/image_number.ext
416  int i = 0;
417  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
418  byte[] data = null;
419  for (HSLFPictureData pictureData : listOfAllPictures) {
420  if (checkForIngestCancellation(af)) {
421  return null; //null will cause the calling method to return.
422  }
423  // Get image extension, generate image name, write image to the module
424  // output folder, add it to the listOfExtractedImageAbstractFiles
425  PictureType type = pictureData.getType();
426  String ext;
427  switch (type) {
428  case JPEG:
429  ext = ".jpg"; //NON-NLS
430  break;
431  case PNG:
432  ext = ".png"; //NON-NLS
433  break;
434  case WMF:
435  ext = ".wmf"; //NON-NLS
436  break;
437  case EMF:
438  ext = ".emf"; //NON-NLS
439  break;
440  case PICT:
441  ext = ".pict"; //NON-NLS
442  break;
443  default:
444  continue;
445  }
446  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; //NON-NLS
447  try {
448  data = pictureData.getData();
449  } catch (Exception ex) {
450  return null;
451  }
452  writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
453  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
454  i++;
455  }
456  return listOfExtractedImages;
457  }
458 
467  private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
468  List<? extends org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
469 
470  try {
471  Workbook xls = new HSSFWorkbook(new ReadContentInputStream(af));
472  listOfAllPictures = xls.getAllPictures();
473  } catch (Exception ex) {
474  // IllegalArgumentException:
475  // This will catch OldFileFormatException, which is thrown when the
476  // document version is unsupported. The IllegalArgumentException may
477  // also get thrown for unknown reasons.
478 
479  // IOException:
480  // Thrown when the document has issues being read.
481  // LeftoverDataException:
482  // This is thrown for poorly formatted files that have more data
483  // than expected.
484  // RecordFormatException:
485  // This is thrown for poorly formatted files that have less data
486  // that expected.
487  // IllegalArgumentException:
488  // IndexOutOfBoundsException:
489  // These get thrown in certain images. The reason is unknown. It is
490  // likely due to problems with the file formats that POI is poorly
491  // handling.
492  LOGGER.log(Level.WARNING, "Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
493  return null;
494  }
495 
496  // if no images are extracted from the PPT, return null, else initialize
497  // the output folder for image extraction.
498  Path outputFolderPath;
499  if (listOfAllPictures.isEmpty()) {
500  return null;
501  } else {
502  outputFolderPath = getOutputFolderPath(this.parentFileName);
503  }
504  if (outputFolderPath == null) {
505  return null;
506  }
507 
508  int i = 0;
509  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
510  byte[] data = null;
511  for (org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
512  if (checkForIngestCancellation(af)) {
513  return null; //null will cause the calling method to return.
514  }
515  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + "." + pictureData.suggestFileExtension(); //NON-NLS
516  try {
517  data = pictureData.getData();
518  } catch (Exception ex) {
519  return null;
520  }
521  writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
522  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
523  i++;
524  }
525  return listOfExtractedImages;
526 
527  }
528 
536  private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
537  Path outputDirectory = getOutputFolderPath(parentFileName);
538  if (outputDirectory == null) {
539  return Collections.emptyList();
540  }
541  PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser);
542  try {
543  //Get map of attachment name -> location disk.
544  Map<String, PDFAttachmentExtractor.NewResourceData> extractedAttachments = pdfExtractor.extract(
545  new ReadContentInputStream(abstractFile), abstractFile.getId(),
546  outputDirectory);
547 
548  //Convert output to hook into the existing logic for creating derived files
549  List<ExtractedFile> extractedFiles = new ArrayList<>();
550  for (Entry<String, PDFAttachmentExtractor.NewResourceData> pathEntry : extractedAttachments.entrySet()) {
551  if (checkForIngestCancellation(abstractFile)) {
552  return null; //null will cause the calling method to return.
553  }
554  String fileName = pathEntry.getKey();
555  Path writeLocation = pathEntry.getValue().getPath();
556  int fileSize = pathEntry.getValue().getLength();
557  extractedFiles.add(new ExtractedFile(fileName,
558  getFileRelativePath(writeLocation.getFileName().toString()),
559  fileSize));
560  }
561  return extractedFiles;
562  } catch (IOException | SAXException | TikaException | InvalidPathException ex) {
563  LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs for file Name: " + abstractFile.getName() + " ID: " + abstractFile.getId(), ex); //NON-NLS
564  }
565  return Collections.emptyList();
566  }
567 
575  private void writeExtractedImage(String outputPath, byte[] data) {
576  try (EncodedFileOutputStream fos = new EncodedFileOutputStream(new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
577  fos.write(data);
578  } catch (IOException ex) {
579  LOGGER.log(Level.WARNING, "Could not write to the provided location: " + outputPath, ex); //NON-NLS
580  }
581  }
582 
593  private Path getOutputFolderPath(String parentFileName) {
594  Path outputFolderPath = Paths.get(moduleDirAbsolute, parentFileName);
595  try {
596  File outputFolder = outputFolderPath.toFile();
597  if (!fileTaskExecutor.exists(outputFolder)) {
598  if (!fileTaskExecutor.mkdirs(outputFolder)) {
599  outputFolderPath = null;
600  }
601  }
602  return outputFolderPath;
603  } catch (SecurityException | FileTaskFailedException | InterruptedException ex) {
604  LOGGER.log(Level.SEVERE, String.format("Failed to find or create %s", outputFolderPath), ex);
605  return null;
606  }
607  }
608 
618  private String getFileRelativePath(String fileName) {
619  return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
620  }
621 
630  private static String utf8SanitizeFileName(String fileName) {
631  Charset charset = StandardCharsets.UTF_8;
632  return charset.decode(charset.encode(escapeFileName(fileName))).toString();
633  }
634 
640  private static class ExtractedFile {
641  //String fileName, String localPath, long size, long ctime, long crtime,
642  //long atime, long mtime, boolean isFile, AbstractFile parentFile, String rederiveDetails, String toolName, String toolVersion, String otherDetails
643 
644  private final String fileName;
645  private final String localPath;
646  private final long size;
647  private final long ctime;
648  private final long crtime;
649  private final long atime;
650  private final long mtime;
651 
652  ExtractedFile(String fileName, String localPath, long size) {
653  this(fileName, localPath, size, 0, 0, 0, 0);
654  }
655 
656  ExtractedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime) {
657  this.fileName = fileName;
658  this.localPath = localPath;
659  this.size = size;
660  this.ctime = ctime;
661  this.crtime = crtime;
662  this.atime = atime;
663  this.mtime = mtime;
664  }
665 
666  public String getFileName() {
667  return fileName;
668  }
669 
670  public String getLocalPath() {
671  return localPath;
672  }
673 
674  public long getSize() {
675  return size;
676  }
677 
678  public long getCtime() {
679  return ctime;
680  }
681 
682  public long getCrtime() {
683  return crtime;
684  }
685 
686  public long getAtime() {
687  return atime;
688  }
689 
690  public long getMtime() {
691  return mtime;
692  }
693  }
694 
700  private class EmbeddedContentExtractor extends ParsingEmbeddedDocumentExtractor {
701 
702  private int fileCount = 0;
703  // Map of file name to ExtractedFile instance. This can revert to a
704  // plain old list after we upgrade to Tika 1.16 or above.
705  private final Map<String, ExtractedFile> nameToExtractedFileMap = new HashMap<>();
706 
707  private EmbeddedContentExtractor(ParseContext context) {
708  super(context);
709  }
710 
711  @Override
712  public boolean shouldParseEmbedded(Metadata metadata) {
713  return true;
714  }
715 
716  @Override
717  public void parseEmbedded(InputStream stream, ContentHandler handler,
718  Metadata metadata, boolean outputHtml) throws SAXException, IOException {
719 
720  // Get the mime type for the embedded document
721  MediaType contentType = detector.detect(stream, metadata);
722 
723  if (!contentType.getType().equalsIgnoreCase("image") //NON-NLS
724  && !contentType.getType().equalsIgnoreCase("video") //NON-NLS
725  && !contentType.getType().equalsIgnoreCase("application") //NON-NLS
726  && !contentType.getType().equalsIgnoreCase("audio")) { //NON-NLS
727  return;
728  }
729 
730  // try to get the name of the embedded file from the metadata
731  String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
732 
733  // TODO: This can be removed after we upgrade to Tika 1.16 or
734  // above. The 1.16 version of Tika keeps track of files that
735  // have been seen before.
736  if (nameToExtractedFileMap.containsKey(name)) {
737  return;
738  }
739 
740  if (name == null) {
741  fileCount++;
742  name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount;
743  } else {
744  //make sure to select only the file name (not any directory paths
745  //that might be included in the name) and make sure
746  //to normalize the name
747  name = FilenameUtils.normalize(FilenameUtils.getName(name));
748  //remove any illegal characters from name
749  name = utf8SanitizeFileName(name);
750  }
751 
752  // Get the suggested extension based on mime type.
753  if (name.indexOf('.') == -1) {
754  try {
755  name += config.getMimeRepository().forName(contentType.toString()).getExtension();
756  } catch (MimeTypeException ex) {
757  LOGGER.log(Level.WARNING, "Failed to get suggested extension for the following type: " + contentType.toString(), ex); //NON-NLS
758  }
759  }
760 
761  Path outputFolderPath = getOutputFolderPath(parentFileName);
762  if (outputFolderPath != null) {
763  File extractedFile = new File(Paths.get(outputFolderPath.toString(), name).toString());
764  byte[] fileData = IOUtils.toByteArray(stream);
765  writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
766  nameToExtractedFileMap.put(name, new ExtractedFile(name, getFileRelativePath(name), fileData.length));
767  }
768  }
769 
775  public List<ExtractedFile> getExtractedImages() {
776  return new ArrayList<>(nameToExtractedFileMap.values());
777  }
778  }
779 }
void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
void addFilesToJob(List< AbstractFile > files)
void fireModuleContentEvent(ModuleContentEvent moduleContentEvent)
static String escapeFileName(String fileName)
Definition: FileUtil.java:169
synchronized static Logger getLogger(String name)
Definition: Logger.java:124
DerivedFile addDerivedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime, boolean isFile, Content parentObj, String rederiveDetails, String toolName, String toolVersion, String otherDetails, TskData.EncodingType encodingType)
static synchronized IngestServices getInstance()

Copyright © 2012-2022 Basis Technology. Generated on: Tue Feb 6 2024
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.