Autopsy  4.21.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
DocumentEmbeddedContentExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2015-2021 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.modules.embeddedfileextractor;
20 
21 import java.io.File;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.charset.Charset;
26 import java.nio.charset.StandardCharsets;
27 import java.nio.file.InvalidPathException;
28 import java.nio.file.Path;
29 import java.nio.file.Paths;
30 import java.util.ArrayList;
31 import java.util.Collections;
32 import java.util.HashMap;
33 import java.util.List;
34 import java.util.Map;
35 import java.util.Map.Entry;
36 import java.util.logging.Level;
37 import org.apache.commons.io.FilenameUtils;
38 import org.apache.commons.io.IOUtils;
39 import org.apache.poi.hwpf.usermodel.Picture;
40 import org.apache.poi.hslf.usermodel.HSLFPictureData;
41 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
42 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
43 import org.apache.poi.hwpf.HWPFDocument;
44 import org.apache.poi.hwpf.model.PicturesTable;
45 import org.apache.poi.sl.usermodel.PictureData.PictureType;
46 import org.apache.poi.ss.usermodel.Workbook;
47 import org.apache.tika.config.TikaConfig;
48 import org.apache.tika.detect.Detector;
49 import org.apache.tika.exception.TikaException;
50 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
51 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
52 import org.apache.tika.metadata.Metadata;
53 import org.apache.tika.metadata.TikaCoreProperties;
54 import org.apache.tika.mime.MediaType;
55 import org.apache.tika.mime.MimeTypeException;
56 import org.apache.tika.parser.AutoDetectParser;
57 import org.apache.tika.parser.ParseContext;
58 import org.apache.tika.parser.Parser;
59 import org.apache.tika.parser.microsoft.OfficeParserConfig;
60 import org.apache.tika.sax.BodyContentHandler;
61 import org.openide.util.NbBundle;
77 import org.xml.sax.ContentHandler;
78 import org.xml.sax.SAXException;
79 
84 class DocumentEmbeddedContentExtractor {
85 
86  private final FileManager fileManager;
87  private final IngestServices services;
88  private static final Logger LOGGER = Logger.getLogger(DocumentEmbeddedContentExtractor.class.getName());
89  private final IngestJobContext context;
90  private String parentFileName;
91  private final String UNKNOWN_IMAGE_NAME_PREFIX = "image_"; //NON-NLS
92  private final FileTypeDetector fileTypeDetector;
93  private final FileTaskExecutor fileTaskExecutor;
94 
95  private String moduleDirRelative;
96  private String moduleDirAbsolute;
97 
98  private AutoDetectParser parser = new AutoDetectParser();
99  private Detector detector = parser.getDetector();
100  private TikaConfig config = TikaConfig.getDefaultConfig();
101 
/**
 * Document formats from which embedded content can be extracted. Each
 * constant carries the MIME type it stands for, and toString() returns that
 * MIME type so a constant can be compared against a detected type.
 */
enum SupportedExtractionFormats {

    DOC("application/msword"), //NON-NLS
    DOCX("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), //NON-NLS
    PPT("application/vnd.ms-powerpoint"), //NON-NLS
    PPTX("application/vnd.openxmlformats-officedocument.presentationml.presentation"), //NON-NLS
    XLS("application/vnd.ms-excel"), //NON-NLS
    XLSX("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), //NON-NLS
    PDF("application/pdf"); //NON-NLS

    // MIME type string associated with this format.
    private final String mimeTypeName;

    /**
     * @param mimeTypeName The MIME type string this format represents.
     */
    SupportedExtractionFormats(final String mimeTypeName) {
        this.mimeTypeName = mimeTypeName;
    }

    /**
     * @return The MIME type string of this format.
     */
    @Override
    public String toString() {
        return mimeTypeName;
    }
}
126  private SupportedExtractionFormats abstractFileExtractionFormat;
127 
128  DocumentEmbeddedContentExtractor(IngestJobContext context, FileTypeDetector fileTypeDetector, String moduleDirRelative, String moduleDirAbsolute, FileTaskExecutor fileTaskExecutor) throws NoCurrentCaseException {
129 
130  this.fileManager = Case.getCurrentCaseThrows().getServices().getFileManager();
131  this.services = IngestServices.getInstance();
132  this.context = context;
133  this.fileTypeDetector = fileTypeDetector;
134  this.moduleDirRelative = moduleDirRelative;
135  this.moduleDirAbsolute = moduleDirAbsolute;
136  this.fileTaskExecutor = fileTaskExecutor;
137  }
138 
148  boolean isContentExtractionSupported(AbstractFile abstractFile) {
149  String abstractFileMimeType = fileTypeDetector.getMIMEType(abstractFile);
150  for (SupportedExtractionFormats s : SupportedExtractionFormats.values()) {
151  if (checkForIngestCancellation(abstractFile)) {
152  break;
153  }
154  if (s.toString().equals(abstractFileMimeType)) {
155  abstractFileExtractionFormat = s;
156  return true;
157  }
158  }
159  return false;
160  }
161 
173  private boolean checkForIngestCancellation(AbstractFile file) {
174  if (fileTaskExecutor != null && context != null && context.fileIngestIsCancelled()) {
175  LOGGER.log(Level.INFO, "Ingest was cancelled. Results extracted from the following document file may be incomplete. Name: {0}Object ID: {1}", new Object[]{file.getName(), file.getId()});
176  return true;
177  }
178  return false;
179  }
180 
190  void extractEmbeddedContent(AbstractFile abstractFile) {
191  List<ExtractedFile> listOfExtractedImages = null;
192  List<AbstractFile> listOfExtractedImageAbstractFiles = null;
193  //save the parent file name with out illegal windows characters
194  this.parentFileName = utf8SanitizeFileName(EmbeddedFileExtractorIngestModule.getUniqueName(abstractFile));
195 
196  // Skip files that already have been unpacked.
197  /*
198  * TODO (Jira-7145): Is the logic of this check correct? Also note that
199  * this suspect code used to have a bug in that makeOutputFolder() was
200  * called, so the directory was always created here if it did not exist,
201  * making this check only a call to AbstractFile.hasChildren() in
202  * practice.
203  */
204  try {
205  if (abstractFile.hasChildren()) {
206  //check if local unpacked dir exists
207  File outputFolder = Paths.get(moduleDirAbsolute, parentFileName).toFile();
208  if (fileTaskExecutor.exists(outputFolder)) {
209  return;
210  }
211  }
212  } catch (TskCoreException | FileTaskExecutor.FileTaskFailedException | InterruptedException e) {
213  LOGGER.log(Level.SEVERE, String.format("Error checking if %s (objID = %d) has already has been processed, skipping", abstractFile.getName(), abstractFile.getId()), e); //NON-NLS
214  return;
215  }
216  if (checkForIngestCancellation(abstractFile)) {
217  return;
218  }
219  // Call the appropriate extraction method based on mime type
220  switch (abstractFileExtractionFormat) {
221  case DOCX:
222  case PPTX:
223  case XLSX:
224  listOfExtractedImages = extractEmbeddedContentFromOOXML(abstractFile);
225  break;
226  case DOC:
227  listOfExtractedImages = extractEmbeddedImagesFromDoc(abstractFile);
228  break;
229  case PPT:
230  listOfExtractedImages = extractEmbeddedImagesFromPpt(abstractFile);
231  break;
232  case XLS:
233  listOfExtractedImages = extractImagesFromXls(abstractFile);
234  break;
235  case PDF:
236  listOfExtractedImages = extractEmbeddedContentFromPDF(abstractFile);
237  break;
238  default:
239  break;
240  }
241 
242  if (listOfExtractedImages == null) {
243  return;
244  }
245  // the common task of adding abstractFile to derivedfiles is performed.
246  listOfExtractedImageAbstractFiles = new ArrayList<>();
247  for (ExtractedFile extractedImage : listOfExtractedImages) {
248  if (checkForIngestCancellation(abstractFile)) {
249  return;
250  }
251  try {
252  listOfExtractedImageAbstractFiles.add(fileManager.addDerivedFile(extractedImage.getFileName(), extractedImage.getLocalPath(), extractedImage.getSize(),
253  extractedImage.getCtime(), extractedImage.getCrtime(), extractedImage.getAtime(), extractedImage.getAtime(),
254  true, abstractFile, null, EmbeddedFileExtractorModuleFactory.getModuleName(), null, null, TskData.EncodingType.XOR1));
255  } catch (TskCoreException ex) {
256  LOGGER.log(Level.SEVERE, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.extractImage.addToDB.exception.msg"), ex); //NON-NLS
257  }
258  }
259  if (!listOfExtractedImages.isEmpty()) {
260  services.fireModuleContentEvent(new ModuleContentEvent(abstractFile));
261  context.addFilesToJob(listOfExtractedImageAbstractFiles);
262  }
263  }
264 
274  private List<ExtractedFile> extractEmbeddedContentFromOOXML(AbstractFile abstractFile) {
275  Metadata metadata = new Metadata();
276 
277  ParseContext parseContext = new ParseContext();
278  parseContext.set(Parser.class, parser);
279 
280  // Passing -1 to the BodyContentHandler constructor disables the Tika
281  // write limit (which defaults to 100,000 characters.
282  ContentHandler contentHandler = new BodyContentHandler(-1);
283 
284  // Use the more memory efficient Tika SAX parsers for DOCX and
285  // PPTX files (it already uses SAX for XLSX).
286  OfficeParserConfig officeParserConfig = new OfficeParserConfig();
287  officeParserConfig.setUseSAXPptxExtractor(true);
288  officeParserConfig.setUseSAXDocxExtractor(true);
289  parseContext.set(OfficeParserConfig.class, officeParserConfig);
290  EmbeddedDocumentExtractor extractor = new EmbeddedContentExtractor(parseContext);
291  parseContext.set(EmbeddedDocumentExtractor.class, extractor);
292  ReadContentInputStream stream = new ReadContentInputStream(abstractFile);
293  if (checkForIngestCancellation(abstractFile)) {
294  return null; //null will cause the calling method to return.
295  }
296  try {
297  parser.parse(stream, contentHandler, metadata, parseContext);
298  } catch (IOException | SAXException | TikaException ex) {
299  LOGGER.log(Level.WARNING, "Error while parsing file, skipping: " + abstractFile.getName(), ex); //NON-NLS
300  return null;
301  }
302 
303  return ((EmbeddedContentExtractor) extractor).getExtractedImages();
304  }
305 
314  private List<ExtractedFile> extractEmbeddedImagesFromDoc(AbstractFile af) {
315  List<Picture> listOfAllPictures;
316 
317  try {
318  HWPFDocument doc = new HWPFDocument(new ReadContentInputStream(af));
319  PicturesTable pictureTable = doc.getPicturesTable();
320  listOfAllPictures = pictureTable.getAllPictures();
321  } catch (Exception ex) {
322  // IOException:
323  // Thrown when the document has issues being read.
324 
325  // IllegalArgumentException:
326  // This will catch OldFileFormatException, which is thrown when the
327  // document's format is Word 95 or older. Alternatively, this is
328  // thrown when attempting to load an RTF file as a DOC file.
329  // However, our code verifies the file format before ever running it
330  // through the EmbeddedContentExtractor. This exception gets thrown in the
331  // "IN10-0137.E01" image regardless. The reason is unknown.
332  // IndexOutOfBoundsException:
333  // NullPointerException:
334  // These get thrown in certain images. The reason is unknown. It is
335  // likely due to problems with the file formats that POI is poorly
336  // handling.
337  //Any runtime exception escaping
338  LOGGER.log(Level.WARNING, "Word document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
339  return null;
340  }
341 
342  Path outputFolderPath;
343  if (listOfAllPictures.isEmpty()) {
344  return null;
345  } else {
346  outputFolderPath = getOutputFolderPath(this.parentFileName);
347  }
348  if (outputFolderPath == null) {
349  return null;
350  }
351  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
352  byte[] data = null;
353  int pictureNumber = 0; //added to ensure uniqueness in cases where suggestFullFileName returns duplicates
354  for (Picture picture : listOfAllPictures) {
355  if (checkForIngestCancellation(af)) {
356  return null; //null will cause the calling method to return.
357  }
358  String fileName = UNKNOWN_IMAGE_NAME_PREFIX + pictureNumber + "." + picture.suggestFileExtension();
359  try {
360  data = picture.getContent();
361  } catch (Exception ex) {
362  return null;
363  }
364  writeExtractedImage(Paths.get(outputFolderPath.toString(), fileName).toString(), data);
365  // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime
366  listOfExtractedImages.add(new ExtractedFile(fileName, getFileRelativePath(fileName), picture.getSize()));
367  pictureNumber++;
368  }
369 
370  return listOfExtractedImages;
371  }
372 
381  private List<ExtractedFile> extractEmbeddedImagesFromPpt(AbstractFile af) {
382  List<HSLFPictureData> listOfAllPictures = null;
383 
384  try {
385  HSLFSlideShow ppt = new HSLFSlideShow(new ReadContentInputStream(af));
386  listOfAllPictures = ppt.getPictureData();
387  } catch (Exception ex) {
388  // IllegalArgumentException:
389  // This will catch OldFileFormatException, which is thrown when the
390  // document version is unsupported. The IllegalArgumentException may
391  // also get thrown for unknown reasons.
392 
393  // IOException:
394  // Thrown when the document has issues being read.
395  // IndexOutOfBoundsException:
396  // This gets thrown in certain images. The reason is unknown. It is
397  // likely due to problems with the file formats that POI is poorly
398  // handling.
399  LOGGER.log(Level.WARNING, "PPT container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
400  return null;
401  }
402 
403  // if no images are extracted from the PPT, return null, else initialize
404  // the output folder for image extraction.
405  Path outputFolderPath;
406  if (listOfAllPictures.isEmpty()) {
407  return null;
408  } else {
409  outputFolderPath = getOutputFolderPath(this.parentFileName);
410  }
411  if (outputFolderPath == null) {
412  return null;
413  }
414 
415  // extract the content to the above initialized outputFolder.
416  // extraction path - outputFolder/image_number.ext
417  int i = 0;
418  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
419  byte[] data = null;
420  for (HSLFPictureData pictureData : listOfAllPictures) {
421  if (checkForIngestCancellation(af)) {
422  return null; //null will cause the calling method to return.
423  }
424  // Get image extension, generate image name, write image to the module
425  // output folder, add it to the listOfExtractedImageAbstractFiles
426  PictureType type = pictureData.getType();
427  String ext;
428  switch (type) {
429  case JPEG:
430  ext = ".jpg"; //NON-NLS
431  break;
432  case PNG:
433  ext = ".png"; //NON-NLS
434  break;
435  case WMF:
436  ext = ".wmf"; //NON-NLS
437  break;
438  case EMF:
439  ext = ".emf"; //NON-NLS
440  break;
441  case PICT:
442  ext = ".pict"; //NON-NLS
443  break;
444  default:
445  continue;
446  }
447  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + ext; //NON-NLS
448  try {
449  data = pictureData.getData();
450  } catch (Exception ex) {
451  return null;
452  }
453  writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
454  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
455  i++;
456  }
457  return listOfExtractedImages;
458  }
459 
468  private List<ExtractedFile> extractImagesFromXls(AbstractFile af) {
469  List<? extends org.apache.poi.ss.usermodel.PictureData> listOfAllPictures = null;
470 
471  try {
472  Workbook xls = new HSSFWorkbook(new ReadContentInputStream(af));
473  listOfAllPictures = xls.getAllPictures();
474  } catch (Exception ex) {
475  // IllegalArgumentException:
476  // This will catch OldFileFormatException, which is thrown when the
477  // document version is unsupported. The IllegalArgumentException may
478  // also get thrown for unknown reasons.
479 
480  // IOException:
481  // Thrown when the document has issues being read.
482  // LeftoverDataException:
483  // This is thrown for poorly formatted files that have more data
484  // than expected.
485  // RecordFormatException:
486  // This is thrown for poorly formatted files that have less data
487  // that expected.
488  // IllegalArgumentException:
489  // IndexOutOfBoundsException:
490  // These get thrown in certain images. The reason is unknown. It is
491  // likely due to problems with the file formats that POI is poorly
492  // handling.
493  LOGGER.log(Level.WARNING, "Excel (.xls) document container could not be initialized. Reason: {0}", ex.getMessage()); //NON-NLS
494  return null;
495  }
496 
497  // if no images are extracted from the PPT, return null, else initialize
498  // the output folder for image extraction.
499  Path outputFolderPath;
500  if (listOfAllPictures.isEmpty()) {
501  return null;
502  } else {
503  outputFolderPath = getOutputFolderPath(this.parentFileName);
504  }
505  if (outputFolderPath == null) {
506  return null;
507  }
508 
509  int i = 0;
510  List<ExtractedFile> listOfExtractedImages = new ArrayList<>();
511  byte[] data = null;
512  for (org.apache.poi.ss.usermodel.PictureData pictureData : listOfAllPictures) {
513  if (checkForIngestCancellation(af)) {
514  return null; //null will cause the calling method to return.
515  }
516  String imageName = UNKNOWN_IMAGE_NAME_PREFIX + i + "." + pictureData.suggestFileExtension(); //NON-NLS
517  try {
518  data = pictureData.getData();
519  } catch (Exception ex) {
520  return null;
521  }
522  writeExtractedImage(Paths.get(outputFolderPath.toString(), imageName).toString(), data);
523  listOfExtractedImages.add(new ExtractedFile(imageName, getFileRelativePath(imageName), pictureData.getData().length));
524  i++;
525  }
526  return listOfExtractedImages;
527 
528  }
529 
537  private List<ExtractedFile> extractEmbeddedContentFromPDF(AbstractFile abstractFile) {
538  Path outputDirectory = getOutputFolderPath(parentFileName);
539  if (outputDirectory == null) {
540  return Collections.emptyList();
541  }
542  PDFAttachmentExtractor pdfExtractor = new PDFAttachmentExtractor(parser);
543  try {
544  //Get map of attachment name -> location disk.
545  Map<String, PDFAttachmentExtractor.NewResourceData> extractedAttachments = pdfExtractor.extract(
546  new ReadContentInputStream(abstractFile), abstractFile.getId(),
547  outputDirectory);
548 
549  //Convert output to hook into the existing logic for creating derived files
550  List<ExtractedFile> extractedFiles = new ArrayList<>();
551  for (Entry<String, PDFAttachmentExtractor.NewResourceData> pathEntry : extractedAttachments.entrySet()) {
552  if (checkForIngestCancellation(abstractFile)) {
553  return null; //null will cause the calling method to return.
554  }
555  String fileName = pathEntry.getKey();
556  Path writeLocation = pathEntry.getValue().getPath();
557  int fileSize = pathEntry.getValue().getLength();
558  extractedFiles.add(new ExtractedFile(fileName,
559  getFileRelativePath(writeLocation.getFileName().toString()),
560  fileSize));
561  }
562  return extractedFiles;
563  } catch (IOException | SAXException | TikaException | InvalidPathException ex) {
564  LOGGER.log(Level.WARNING, "Error attempting to extract attachments from PDFs for file Name: " + abstractFile.getName() + " ID: " + abstractFile.getId(), ex); //NON-NLS
565  }
566  return Collections.emptyList();
567  }
568 
576  private void writeExtractedImage(String outputPath, byte[] data) {
577  try (EncodedFileOutputStream fos = new EncodedFileOutputStream(new FileOutputStream(outputPath), TskData.EncodingType.XOR1)) {
578  fos.write(data);
579  } catch (IOException ex) {
580  LOGGER.log(Level.WARNING, "Could not write to the provided location: " + outputPath, ex); //NON-NLS
581  }
582  }
583 
594  private Path getOutputFolderPath(String parentFileName) {
595  Path outputFolderPath = Paths.get(moduleDirAbsolute, parentFileName);
596  try {
597  File outputFolder = outputFolderPath.toFile();
598  if (!fileTaskExecutor.exists(outputFolder)) {
599  if (!fileTaskExecutor.mkdirs(outputFolder)) {
600  outputFolderPath = null;
601  }
602  }
603  return outputFolderPath;
604  } catch (SecurityException | FileTaskFailedException | InterruptedException ex) {
605  LOGGER.log(Level.SEVERE, String.format("Failed to find or create %s", outputFolderPath), ex);
606  return null;
607  }
608  }
609 
619  private String getFileRelativePath(String fileName) {
620  return Paths.get(moduleDirRelative, this.parentFileName, fileName).toString();
621  }
622 
631  private static String utf8SanitizeFileName(String fileName) {
632  Charset charset = StandardCharsets.UTF_8;
633  return charset.decode(charset.encode(escapeFileName(fileName))).toString();
634  }
635 
641  private static class ExtractedFile {
642  //String fileName, String localPath, long size, long ctime, long crtime,
643  //long atime, long mtime, boolean isFile, AbstractFile parentFile, String rederiveDetails, String toolName, String toolVersion, String otherDetails
644 
645  private final String fileName;
646  private final String localPath;
647  private final long size;
648  private final long ctime;
649  private final long crtime;
650  private final long atime;
651  private final long mtime;
652 
653  ExtractedFile(String fileName, String localPath, long size) {
654  this(fileName, localPath, size, 0, 0, 0, 0);
655  }
656 
657  ExtractedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime) {
658  this.fileName = fileName;
659  this.localPath = localPath;
660  this.size = size;
661  this.ctime = ctime;
662  this.crtime = crtime;
663  this.atime = atime;
664  this.mtime = mtime;
665  }
666 
667  public String getFileName() {
668  return fileName;
669  }
670 
671  public String getLocalPath() {
672  return localPath;
673  }
674 
675  public long getSize() {
676  return size;
677  }
678 
679  public long getCtime() {
680  return ctime;
681  }
682 
683  public long getCrtime() {
684  return crtime;
685  }
686 
687  public long getAtime() {
688  return atime;
689  }
690 
691  public long getMtime() {
692  return mtime;
693  }
694  }
695 
701  private class EmbeddedContentExtractor extends ParsingEmbeddedDocumentExtractor {
702 
703  private int fileCount = 0;
704  // Map of file name to ExtractedFile instance. This can revert to a
705  // plain old list after we upgrade to Tika 1.16 or above.
706  private final Map<String, ExtractedFile> nameToExtractedFileMap = new HashMap<>();
707 
708  private EmbeddedContentExtractor(ParseContext context) {
709  super(context);
710  }
711 
712  @Override
713  public boolean shouldParseEmbedded(Metadata metadata) {
714  return true;
715  }
716 
717  @Override
718  public void parseEmbedded(InputStream stream, ContentHandler handler,
719  Metadata metadata, boolean outputHtml) throws SAXException, IOException {
720 
721  // Get the mime type for the embedded document
722  MediaType contentType = detector.detect(stream, metadata);
723 
724  if (!contentType.getType().equalsIgnoreCase("image") //NON-NLS
725  && !contentType.getType().equalsIgnoreCase("video") //NON-NLS
726  && !contentType.getType().equalsIgnoreCase("application") //NON-NLS
727  && !contentType.getType().equalsIgnoreCase("audio")) { //NON-NLS
728  return;
729  }
730 
731  // try to get the name of the embedded file from the metadata
732  String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
733 
734  // TODO: This can be removed after we upgrade to Tika 1.16 or
735  // above. The 1.16 version of Tika keeps track of files that
736  // have been seen before.
737  if (nameToExtractedFileMap.containsKey(name)) {
738  return;
739  }
740 
741  if (name == null) {
742  fileCount++;
743  name = UNKNOWN_IMAGE_NAME_PREFIX + fileCount;
744  } else {
745  //make sure to select only the file name (not any directory paths
746  //that might be included in the name) and make sure
747  //to normalize the name
748  name = FilenameUtils.normalize(FilenameUtils.getName(name));
749  //remove any illegal characters from name
750  name = utf8SanitizeFileName(name);
751  }
752 
753  // Get the suggested extension based on mime type.
754  if (name.indexOf('.') == -1) {
755  try {
756  name += config.getMimeRepository().forName(contentType.toString()).getExtension();
757  } catch (MimeTypeException ex) {
758  LOGGER.log(Level.WARNING, "Failed to get suggested extension for the following type: " + contentType.toString(), ex); //NON-NLS
759  }
760  }
761 
762  Path outputFolderPath = getOutputFolderPath(parentFileName);
763  if (outputFolderPath != null) {
764  File extractedFile = new File(Paths.get(outputFolderPath.toString(), name).toString());
765  byte[] fileData = IOUtils.toByteArray(stream);
766  writeExtractedImage(extractedFile.getAbsolutePath(), fileData);
767  nameToExtractedFileMap.put(name, new ExtractedFile(name, getFileRelativePath(name), fileData.length));
768  }
769  }
770 
776  public List<ExtractedFile> getExtractedImages() {
777  return new ArrayList<>(nameToExtractedFileMap.values());
778  }
779  }
780 }
void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
void addFilesToJob(List< AbstractFile > files)
void fireModuleContentEvent(ModuleContentEvent moduleContentEvent)
static String escapeFileName(String fileName)
Definition: FileUtil.java:169
synchronized static Logger getLogger(String name)
Definition: Logger.java:124
DerivedFile addDerivedFile(String fileName, String localPath, long size, long ctime, long crtime, long atime, long mtime, boolean isFile, Content parentObj, String rederiveDetails, String toolName, String toolVersion, String otherDetails, TskData.EncodingType encodingType)
static synchronized IngestServices getInstance()

Copyright © 2012-2024 Sleuth Kit Labs. Generated on: Mon Feb 17 2025
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.