19 package org.sleuthkit.autopsy.coreutils.textutils;
21 import com.ethteck.decodetect.core.Decodetect;
22 import com.ethteck.decodetect.core.DecodetectResult;
23 import java.io.BufferedInputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.nio.charset.Charset;
27 import java.nio.charset.CharsetDecoder;
28 import java.nio.charset.CharsetEncoder;
29 import java.util.List;
30 import org.apache.tika.parser.txt.CharsetDetector;
31 import org.apache.tika.parser.txt.CharsetMatch;
63 public boolean contains(Charset cs) {
68 public CharsetDecoder newDecoder() {
73 public CharsetEncoder newEncoder() {
83 public static Charset
getEncoding(AbstractFile file)
throws TskCoreException, IOException {
87 try (InputStream stream =
new BufferedInputStream(
new ReadContentInputStream(file))) {
88 CharsetDetector detector =
new CharsetDetector();
89 detector.setText(stream);
91 CharsetMatch[] tikaResults = detector.detectAll();
94 if (tikaResults.length > 0) {
95 CharsetMatch topPick = tikaResults[0];
97 if (topPick.getName().equalsIgnoreCase(
"IBM500") && tikaResults.length > 1) {
103 topPick = tikaResults[1];
106 if (!topPick.getName().equalsIgnoreCase(
"IBM500") &&
107 topPick.getConfidence() >= MIN_CHARSETDETECT_MATCH_CONFIDENCE &&
108 Charset.isSupported(topPick.getName())) {
111 return Charset.forName(topPick.getName());
119 int maxBytes = 100000;
120 int numBytes = maxBytes;
121 if (file.getSize() < maxBytes) {
122 numBytes = (int) file.getSize();
125 byte[] targetArray =
new byte[numBytes];
126 file.read(targetArray, 0, numBytes);
127 List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
128 if (!results.isEmpty()) {
129 DecodetectResult topResult = results.get(0);
131 return topResult.getEncoding();
static final int MIN_CHARSETDETECT_MATCH_CONFIDENCE
static final Charset UNKNOWN_CHARSET
static final double MIN_DECODETECT_MATCH_CONFIDENCE
static Charset getEncoding(AbstractFile file)