Autopsy  4.12.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
TextFileExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2018-2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.BufferedInputStream;
23 import java.io.Reader;
24 import org.apache.tika.parser.txt.CharsetDetector;
25 import org.apache.tika.parser.txt.CharsetMatch;
26 import org.sleuthkit.datamodel.AbstractFile;
27 import org.sleuthkit.datamodel.ReadContentInputStream;
28 
32 final class TextFileExtractor {
33 
34  //Set a Minimum confidence value to reject matches that may not have a valid text encoding
35  //Values of valid text encodings were generally 100, xml code sometimes had a value around 50,
36  //and pictures and other files with a .txt extention were showing up with a value of 5 or less in limited testing.
37  //This limited information was used to select the current value as one that would filter out clearly non-text
38  //files while hopefully working on all files with a valid text encoding
39  static final private int MIN_MATCH_CONFIDENCE = 20;
40 
41  public Reader getReader(AbstractFile source) throws TextFileExtractorException {
42  CharsetDetector detector = new CharsetDetector();
43  //wrap stream in a BufferedInputStream so that it supports the mark/reset methods necessary for the CharsetDetector
44  InputStream stream = new BufferedInputStream(new ReadContentInputStream(source));
45  try {
46  detector.setText(stream);
47  } catch (IOException ex) {
48  throw new TextFileExtractorException("Unable to get string from detected text in TextFileExtractor", ex);
49  }
50  CharsetMatch match = detector.detect();
51  if (match == null) {
52  throw new TextFileExtractorException("Unable to detect any matches using TextFileExtractor");
53  } else if (match.getConfidence() < MIN_MATCH_CONFIDENCE) {
54  throw new TextFileExtractorException("Text does not match any character set with a high enough confidence for TextFileExtractor");
55  }
56 
57  return match.getReader();
58  }
59 
60  public class TextFileExtractorException extends Exception {
61  public TextFileExtractorException(String msg, Throwable ex) {
62  super(msg, ex);
63  }
64  public TextFileExtractorException(String msg) {
65  super(msg);
66  }
67  }
68 }

Copyright © 2012-2018 Basis Technology. Generated on: Wed Sep 18 2019
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.