19 package org.sleuthkit.autopsy.coreutils.textutils;
21 import com.ethteck.decodetect.core.Decodetect;
22 import com.ethteck.decodetect.core.DecodetectResult;
23 import java.io.BufferedInputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.nio.charset.Charset;
27 import java.nio.charset.CharsetDecoder;
28 import java.nio.charset.CharsetEncoder;
29 import java.util.List;
30 import org.apache.tika.parser.txt.CharsetDetector;
31 import org.apache.tika.parser.txt.CharsetMatch;
63 public boolean contains(Charset cs) {
68 public CharsetDecoder newDecoder() {
73 public CharsetEncoder newEncoder() {
/**
 * Detects the character encoding of a file's content using two detectors
 * in sequence.
 *
 * First, a buffered ReadContentInputStream over the file is handed to
 * Tika's CharsetDetector; if the name of the detected match is a charset
 * this JVM supports, that Charset is returned. Otherwise up to 100000
 * bytes of the file (fewer when file.getSize() is smaller) are read into
 * an array and passed to Decodetect, and the encoding of the first entry
 * in the returned list is returned when that list is not empty.
 *
 * NOTE(review): the numbering embedded in this listing jumps (90 to 92,
 * 95 to 103, 106 to 109, 113 to 115), so some original lines are not
 * visible here -- including closing braces, any guards on the detector
 * results, and whatever is returned when neither detector yields a
 * usable answer. Confirm against the full file.
 *
 * @param file The file whose content encoding is to be detected.
 *
 * @return The detected Charset.
 *
 * @throws TskCoreException Declared by the signature; presumably raised
 *                          by the reads of the file content -- confirm.
 * @throws IOException      Declared by the signature; presumably raised
 *                          by the stream handling -- confirm.
 */
83 public static Charset
getEncoding(AbstractFile file)
throws TskCoreException, IOException {
// try-with-resources: the stream is closed when this block exits.
87 try (InputStream stream =
new BufferedInputStream(
new ReadContentInputStream(file))) {
88 CharsetDetector detector =
new CharsetDetector();
89 detector.setText(stream);
// NOTE(review): Tika documents that detect() can return null when nothing
// matches. Original line 91 is not visible in this listing -- confirm that
// a null guard precedes the getName() call below.
90 CharsetMatch tikaResult = detector.detect();
92 String tikaCharSet = tikaResult.getName();
// Only hand back a charset that java.nio can instantiate; Charset.forName
// throws for a name the JVM does not support.
94 if(Charset.isSupported(tikaCharSet)) {
95 return Charset.forName(tikaCharSet);
// Reached when the Tika path above did not return. The sample passed to
// Decodetect is capped at maxBytes.
103 int maxBytes = 100000;
104 int numBytes = maxBytes;
105 if (file.getSize() < maxBytes) {
// The narrowing cast happens only on this branch, where getSize() is
// already known to be below maxBytes (an int).
106 numBytes = (int) file.getSize();
109 byte[] targetArray =
new byte[numBytes];
// NOTE(review): any value returned by read(...) is discarded, so a short
// read would go unnoticed here -- confirm that is acceptable.
110 file.read(targetArray, 0, numBytes);
111 List<DecodetectResult> results = Decodetect.DECODETECT.getResults(targetArray);
112 if (!results.isEmpty()) {
// Takes the first entry; presumably the list is ordered best match
// first -- verify against Decodetect.
113 DecodetectResult topResult = results.get(0);
115 return topResult.getEncoding();
static final int MIN_CHARSETDETECT_MATCH_CONFIDENCE
static final Charset UNKNOWN_CHARSET
static final double MIN_DECODETECT_MATCH_CONFIDENCE
static Charset getEncoding(AbstractFile file)