Autopsy  4.17.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
EncodingUtils.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2020 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.coreutils.textutils;
20 
21 import com.ethteck.decodetect.core.Decodetect;
22 import com.ethteck.decodetect.core.DecodetectResult;
23 import java.io.BufferedInputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.nio.charset.Charset;
27 import java.nio.charset.CharsetDecoder;
28 import java.nio.charset.CharsetEncoder;
29 import java.util.List;
30 import org.apache.tika.parser.txt.CharsetDetector;
31 import org.apache.tika.parser.txt.CharsetMatch;
32 import org.sleuthkit.datamodel.AbstractFile;
33 import org.sleuthkit.datamodel.ReadContentInputStream;
34 import org.sleuthkit.datamodel.TskCoreException;
35 
39 public class EncodingUtils {
40 
41  // This value will be used as a threshold for determining which encoding
42  // detection library to use. If CharsetDetector's own confidence is at least
43  // MIN_MATCH_CONFIDENCE, CharsetDetector's result will be used for decoding.
44  // Otherwise, Decodetect will be used.
45  //
46  // Note: We initially used a confidence of 35, but it was causing some
47  // Chrome Cache files to get flagged as UTF-16 with confidence 40.
48  // These files had a small amount of binary data and then ASCII.
49  static final private int MIN_CHARSETDETECT_MATCH_CONFIDENCE = 41;
50 
51  // This value determines whether we will consider Decodetect's top-scoring
52  // result a legitimate match or if we will disregard its findings.
53  //
54  // Possible values are 0 to 1, inclusive.
55  static final private double MIN_DECODETECT_MATCH_CONFIDENCE = 0.4;
56 
57  /*
58  * The char set returned if the algorithm fails to detect the
59  * encoding of the file.
60  */
61  public static final Charset UNKNOWN_CHARSET = new Charset("unknown", null) {
62  @Override
63  public boolean contains(Charset cs) {
64  return false;
65  }
66 
67  @Override
68  public CharsetDecoder newDecoder() {
69  return null;
70  }
71 
72  @Override
73  public CharsetEncoder newEncoder() {
74  return null;
75  }
76  };
77 
83  public static Charset getEncoding(AbstractFile file) throws TskCoreException, IOException {
84  // Encoding detection is hard. We use several libraries since the data passed in is often messy.
85  // First try CharsetDetector (from Tika / ICU4J).
86  // It is a rule-based detection approach.
87  try (InputStream stream = new BufferedInputStream(new ReadContentInputStream(file))) {
88  CharsetDetector detector = new CharsetDetector();
89  detector.setText(stream);
90 
91  CharsetMatch[] tikaResults = detector.detectAll();
92  // Get all guesses by Tika. These matches are ordered
93  // by descending confidence (largest first).
94  if (tikaResults.length > 0) {
95  CharsetMatch topPick = tikaResults[0];
96 
97  if (topPick.getName().equalsIgnoreCase("IBM500") && tikaResults.length > 1) {
98  // Legacy encoding, let's discard this one in favor
99  // of the second pick. Tika has some problems with
100  // mistakenly identifying text as IBM500. See JIRA-6600
101  // and https://issues.apache.org/jira/browse/TIKA-2771 for
102  // more details.
103  topPick = tikaResults[1];
104  }
105 
106  if (!topPick.getName().equalsIgnoreCase("IBM500") &&
107  topPick.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE &&
108  Charset.isSupported(topPick.getName())) {
109  // Choose this charset since it's supported and has high
110  // enough confidence
111  return Charset.forName(topPick.getName());
112  }
113  }
114  }
115 
116  // If that did not work, then use DecoDetect, which is statistical
117  // We needed this for some Japanese text files that were incorrectly detected by CharsetDetector (with low confidence)
118  // This will not always work with messy data that combines some binary and some ASCII.
119  int maxBytes = 100000;
120  int numBytes = maxBytes;
121  if (file.getSize() < maxBytes) {
122  numBytes = (int) file.getSize();
123  }
124 
125  byte[] targetArray = new byte[numBytes];
126  file.read(targetArray, 0, numBytes);
127  List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
128  if (!results.isEmpty()) {
129  DecodetectResult topResult = results.get(0);
130  if (topResult.getConfidence() >= MIN_DECODETECT_MATCH_CONFIDENCE) {
131  return topResult.getEncoding();
132  }
133  }
134 
135  return UNKNOWN_CHARSET;
136  }
137 }

Copyright © 2012-2021 Basis Technology. Generated on: Tue Jan 19 2021
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.