19 package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
/**
 * Extracts raw strings from an AbstractFile's content and sends the text to
 * the keyword-search (Solr) index in fixed-size chunks. Supports optional
 * UTF-8 / UTF-16 extraction and a configurable set of scripts.
 */
class StringsTextExtractor implements TextExtractor {

    // Shared ingester used to push extracted text to the Solr server.
    private static Ingester ingester;
    private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
    // Maximum size of a single text chunk sent to the index (1 MB).
    private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
    // Bytes reserved at the front of each chunk for a byte-order mark;
    // 0 means BOM prepending is disabled.
    private static final int BOM_LEN = 0;
    private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    // Script used when the caller has not configured any explicitly.
    private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
    private KeywordSearchIngestModule module;
    private AbstractFile sourceFile;   // file currently being indexed
    private int numChunks = 0;         // chunks successfully indexed so far
    private final List<SCRIPT> extractScripts = new ArrayList<>();
    private Map<String, String> extractOptions = new HashMap<>();
60 public StringsTextExtractor(KeywordSearchIngestModule module) {
62 ingester = Server.getIngester();
63 extractScripts.add(DEFAULT_SCRIPT);
67 public boolean setScripts(List<SCRIPT> extractScripts) {
68 this.extractScripts.clear();
69 this.extractScripts.addAll(extractScripts);
74 public List<SCRIPT> getScripts() {
75 return new ArrayList<>(extractScripts);
79 public int getNumChunks() {
80 return this.numChunks;
84 public AbstractFile getSourceFile() {
89 public Map<String, String> getOptions() {
90 return extractOptions;
94 public void setOptions(Map<String, String> options) {
95 this.extractOptions = options;
99 public boolean index(AbstractFile sourceFile)
throws IngesterException {
100 this.sourceFile = sourceFile;
102 boolean success =
false;
105 final boolean extractUTF8 =
106 Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
108 final boolean extractUTF16 =
109 Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
111 if (extractUTF8 ==
false && extractUTF16 ==
false) {
116 InputStream stringStream;
118 if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
120 stringStream =
new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
122 stringStream =
new AbstractFileStringIntStream(
123 sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
131 final byte[] stringChunkBuf =
new byte[(int) MAX_STRING_CHUNK_SIZE];
133 while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (
int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
137 AbstractFileChunk chunk =
new AbstractFileChunk(
this, this.numChunks + 1);
140 chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
142 }
catch (IngesterException ingEx) {
144 logger.log(Level.WARNING,
"Ingester had a problem with extracted strings from file '" + sourceFile.getName() +
"' (id: " + sourceFile.getId() +
").", ingEx);
153 ingester.ingest(
this);
155 }
catch (IOException ex) {
156 logger.log(Level.WARNING,
"Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex);
160 stringStream.close();
161 }
catch (IOException ex) {
162 logger.log(Level.WARNING,
"Error closing input stream stream, file: " + sourceFile.getName(), ex);
171 public boolean isContentTypeSpecific() {
176 public boolean isSupported(AbstractFile file, String detectedFormat) {