Autopsy  4.19.3
Graphical digital forensics platform for The Sleuth Kit and other tools.
FileTypeDetector.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2020 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.modules.filetypeid;
20 
21 import java.nio.charset.Charset;
22 import java.util.ArrayList;
23 import java.util.Collections;
24 import java.util.List;
25 import java.util.SortedSet;
26 import java.util.TreeSet;
27 import java.util.logging.Level;
28 import java.util.stream.Collectors;
29 import org.apache.tika.Tika;
30 import org.apache.tika.io.TikaInputStream;
31 import org.apache.tika.mime.MimeTypes;
34 import org.sleuthkit.datamodel.AbstractFile;
35 import org.sleuthkit.datamodel.ReadContentInputStream;
36 import org.sleuthkit.datamodel.TskCoreException;
37 import org.sleuthkit.datamodel.TskData;
38 
46 public class FileTypeDetector {
47 
// Logger used to report problems encountered during type detection.
48  private static final Logger logger = Logger.getLogger(FileTypeDetector.class.getName());
// Shared Tika facade used for content-based (and file-name-assisted) detection.
49  private static final Tika tika = new Tika();
// Slack files whose size is below this value are marked as octet-stream
// without running any detection (see getMIMEType()).
50  private static final int SLACK_FILE_THRESHOLD = 4096;
// Custom file types defined by the user; consulted first during detection.
51  private final List<FileType> userDefinedFileTypes;
// Custom file types defined by Autopsy; consulted after the user-defined ones.
52  private final List<FileType> autopsyDefinedFileTypes;
// Lazily populated cache of the MIME type names known to Tika (see
// getTikaDetectedTypes()); null until first requested.
53  private static SortedSet<String> tikaDetectedTypes;
54  private final int defaultBufferSize = 600; // Number of bytes to initially read from the file. Should cover most signatures.
55 
66  public static synchronized SortedSet<String> getDetectedTypes() throws FileTypeDetectorInitException {
67  TreeSet<String> detectedTypes = new TreeSet<>((String string1, String string2) -> {
68  int result = String.CASE_INSENSITIVE_ORDER.compare(string1, string2);
69  if (result == 0) {
70  result = string1.compareTo(string2);
71  }
72  return result;
73  });
74  detectedTypes.addAll(FileTypeDetector.getTikaDetectedTypes());
75  try {
76  for (FileType fileType : CustomFileTypesManager.getInstance().getAutopsyDefinedFileTypes()) {
77  detectedTypes.add(fileType.getMimeType());
78  }
79  } catch (CustomFileTypesManager.CustomFileTypesException ex) {
80  throw new FileTypeDetectorInitException("Error loading Autopsy custom file types", ex);
81  }
82  try {
83  for (FileType fileType : CustomFileTypesManager.getInstance().getUserDefinedFileTypes()) {
84  detectedTypes.add(fileType.getMimeType());
85  }
86  } catch (CustomFileTypesManager.CustomFileTypesException ex) {
87  throw new FileTypeDetectorInitException("Error loading user custom file types", ex);
88  }
89  return detectedTypes;
90  }
91 
99  private static SortedSet<String> getTikaDetectedTypes() {
100  if (null == tikaDetectedTypes) {
101  tikaDetectedTypes = org.apache.tika.mime.MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry().getTypes()
102  .stream().filter(t -> !t.hasParameters()).map(s -> s.toString().replace("tika-", "")).collect(Collectors.toCollection(TreeSet::new));
103  }
104  return Collections.unmodifiableSortedSet(tikaDetectedTypes);
105  }
106 
120  try {
121  userDefinedFileTypes = CustomFileTypesManager.getInstance().getUserDefinedFileTypes();
122  autopsyDefinedFileTypes = CustomFileTypesManager.getInstance().getAutopsyDefinedFileTypes();
123  } catch (CustomFileTypesManager.CustomFileTypesException ex) {
124  throw new FileTypeDetectorInitException("Error loading custom file types", ex); //NON-NLS
125  }
126  }
127 
136  public boolean isDetectable(String mimeType) {
137  return isDetectableAsCustomType(userDefinedFileTypes, mimeType)
138  || isDetectableAsCustomType(autopsyDefinedFileTypes, mimeType)
139  || isDetectableByTika(mimeType);
140  }
141 
151  private boolean isDetectableAsCustomType(List<FileType> customTypes, String mimeType) {
152  for (FileType fileType : customTypes) {
153  if (fileType.getMimeType().equals(mimeType)) {
154  return true;
155  }
156  }
157  return false;
158  }
159 
167  private boolean isDetectableByTika(String mimeType) {
169  }
170 
182  public String getMIMEType(AbstractFile file) {
183  /*
184  * Check to see if the file has already been typed.
185  */
186  String mimeType = file.getMIMEType();
187  if (null != mimeType) {
188  // We remove the optional parameter to allow this method to work
189  // with legacy databases that may contain MIME types with the
190  // optional parameter attached.
191  return removeOptionalParameter(mimeType);
192  }
193  /*
194  * Mark non-regular files (refer to TskData.TSK_FS_META_TYPE_ENUM),
195  * zero-sized files, unallocated space, and unused blocks (refer to
196  * TskData.TSK_DB_FILES_TYPE_ENUM) as octet-stream.
197  */
198  if (!file.isFile() || file.getSize() <= 0
199  || (file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS)
200  || (file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS)
201  || (file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)
202  || ((file.getType() == TskData.TSK_DB_FILES_TYPE_ENUM.SLACK) && file.getSize() < SLACK_FILE_THRESHOLD)) {
203  mimeType = MimeTypes.OCTET_STREAM;
204  }
205 
206  /*
207  * Read in the beginning of the file and store it.
208  */
209  byte[] buf = new byte[defaultBufferSize];
210  int bufLen;
211  try {
212  bufLen = file.read(buf, 0, defaultBufferSize);
213  } catch (TskCoreException ex) {
214  // Proceed for now - the error will likely get logged next time the file is read.
215  bufLen = 0;
216  }
217 
218  /*
219  * If the file is a regular file, give precedence to user-defined custom
220  * file types.
221  */
222  if (null == mimeType) {
223  mimeType = detectUserDefinedType(file, buf, bufLen);
224  }
225 
226  /*
227  * If the file does not match a user-defined type, give precedence to
228  * custom file types defined by Autopsy.
229  */
230  if (null == mimeType) {
231  mimeType = detectAutopsyDefinedType(file, buf, bufLen);
232  }
233 
234  /*
235  * If the file does not match a user-defined type, send the initial
236  * bytes to Tika.
237  */
238  if (null == mimeType) {
239  ReadContentInputStream stream = new ReadContentInputStream(file);
240 
241  try (TikaInputStream tikaInputStream = TikaInputStream.get(stream)) {
242  String tikaType = tika.detect(tikaInputStream);
243 
244  /*
245  * Remove the Tika suffix from the MIME type name.
246  */
247  mimeType = tikaType.replace("tika-", ""); //NON-NLS
248  /*
249  * Remove the optional parameter from the MIME type.
250  */
251  mimeType = removeOptionalParameter(mimeType);
252 
253  /*
254  * If Tika recognizes the file signature, then use the file
255  * name to refine the type. In short, this is to exclude the
256  * mime types that are determined solely by file extension.
257  * More details in JIRA-4871.
258  */
259  if (!mimeType.equals(MimeTypes.OCTET_STREAM)) {
260  ReadContentInputStream secondPassStream = new ReadContentInputStream(file);
261  try (TikaInputStream secondPassTikaStream = TikaInputStream.get(secondPassStream)) {
262  tikaType = tika.detect(secondPassTikaStream, file.getName());
263  mimeType = tikaType.replace("tika-", ""); //NON-NLS
264  mimeType = removeOptionalParameter(mimeType);
265  }
266  } else {
267  /*
268  * If the file was marked as an octet stream and the extension is .txt, try to detect a text
269  * encoding
270  */
271  if (file.getNameExtension().equals("txt")) {
272  Charset detectedCharset = EncodingUtils.getEncoding(file);
273  if (detectedCharset != EncodingUtils.UNKNOWN_CHARSET) {
274  mimeType = MimeTypes.PLAIN_TEXT;
275  }
276  }
277  }
278 
284  if (mimeType.contains("audio/mpeg")) {
285  try {
286  byte[] header = getNBytes(file, 0, 2);
287  if (byteIs0xFF(header[0]) && byteIs0xFF(header[1])) {
288  mimeType = MimeTypes.OCTET_STREAM;
289  }
290  } catch (TskCoreException ex) {
291  //Oh well, the mimetype is what it is.
292  logger.log(Level.WARNING, String.format("Could not verify audio/mpeg mimetype for file %s with id=%d", file.getName(), file.getId()), ex);
293  }
294  }
295  } catch (Exception ignored) {
296  /*
297  * This exception is swallowed and not logged rather than
298  * propagated because files in data sources are not always
299  * consistent with their file system metadata, making for read
300  * errors. Also, Tika can be a bit flaky at times, making this a
301  * best effort endeavor. Default to octet-stream.
302  */
303  mimeType = MimeTypes.OCTET_STREAM;
304  }
305  }
306 
307  /*
308  * Documented side effect: write the result to the AbstractFile object.
309  */
310  file.setMIMEType(mimeType);
311 
312  return mimeType;
313  }
314 
323  private boolean byteIs0xFF(byte x) {
324  return (x & 0x0F) == 0x0F && (x & 0xF0) == 0xF0;
325  }
326 
338  private byte[] getNBytes(AbstractFile file, int offset, int n) throws TskCoreException {
339  byte[] headerCache = new byte[n];
340  file.read(headerCache, offset, n);
341  return headerCache;
342  }
343 
351  private String removeOptionalParameter(String mimeType) {
352  int indexOfSemicolon = mimeType.indexOf(';');
353  if (indexOfSemicolon != -1) {
354  return mimeType.substring(0, indexOfSemicolon).trim();
355  } else {
356  return mimeType;
357  }
358  }
359 
369  private String detectUserDefinedType(AbstractFile file, byte[] startOfFileBuffer, int bufLen) {
370  String retValue = null;
371 
372  for (FileType fileType : userDefinedFileTypes) {
373  if (fileType.matches(file, startOfFileBuffer, bufLen)) {
374  retValue = fileType.getMimeType();
375  break;
376  }
377  }
378  return retValue;
379  }
380 
391  private String detectAutopsyDefinedType(AbstractFile file, byte[] startOfFileBuffer, int bufLen) {
392  for (FileType fileType : autopsyDefinedFileTypes) {
393  if (fileType.matches(file, startOfFileBuffer, bufLen)) {
394  return fileType.getMimeType();
395  }
396  }
397  return null;
398  }
399 
400  /*
401  * Exception thrown if an initialization error occurs, e.g., user-defined
402  * file type definitions exist but cannot be loaded.
403  */
404  public static class FileTypeDetectorInitException extends Exception {
405 
406  private static final long serialVersionUID = 1L;
407 
414  FileTypeDetectorInitException(String message) {
415  super(message);
416  }
417 
425  FileTypeDetectorInitException(String message, Throwable throwable) {
426  super(message, throwable);
427  }
428 
429  }
430 
439  @Deprecated
440  public List<String> getUserDefinedTypes() {
441  List<String> customFileTypes = new ArrayList<>();
442  userDefinedFileTypes.forEach((fileType) -> {
443  customFileTypes.add(fileType.getMimeType());
444  });
445  autopsyDefinedFileTypes.forEach((fileType) -> {
446  customFileTypes.add(fileType.getMimeType());
447  });
448  return customFileTypes;
449  }
450 
465  @Deprecated
466  public String detectAndPostToBlackboard(AbstractFile file) throws TskCoreException {
467  String fileType = getMIMEType(file);
468  file.setMIMEType(fileType);
469  file.save();
470  return fileType;
471  }
472 
489  @Deprecated
490  public String getFileType(AbstractFile file) throws TskCoreException {
491  String fileType = getMIMEType(file);
492  file.setMIMEType(fileType);
493  file.save();
494  return fileType;
495  }
496 
509  @Deprecated
510  public String detect(AbstractFile file) throws TskCoreException {
511  String fileType = getMIMEType(file);
512  return fileType;
513  }
514 
515 }
String detectAutopsyDefinedType(AbstractFile file, byte[] startOfFileBuffer, int bufLen)
byte[] getNBytes(AbstractFile file, int offset, int n)
boolean isDetectableAsCustomType(List< FileType > customTypes, String mimeType)
String detectUserDefinedType(AbstractFile file, byte[] startOfFileBuffer, int bufLen)
synchronized static Logger getLogger(String name)
Definition: Logger.java:124
static synchronized SortedSet< String > getDetectedTypes()

Copyright © 2012-2022 Basis Technology. Generated on: Tue Jun 27 2023
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.