package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;

import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
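/**
 * Extracts raw strings from the content of an AbstractFile and indexes them
 * in fixed-size chunks. Honors the configured scripts and the
 * EXTRACT_UTF8/EXTRACT_UTF16 options, using a Latin-1 optimized stream when
 * only LATIN_1 is requested and a generic multi-script stream otherwise.
 */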
class StringsTextExtractor implements TextExtractor {
    private static Ingester ingester;
    private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
    private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
    private static final int BOM_LEN = 0; // prepending of a BOM is disabled
    private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
    private AbstractFile sourceFile;
    private int numChunks = 0;
    private final List<SCRIPT> extractScripts = new ArrayList<>();
    private Map<String, String> extractOptions = new HashMap<>();
    public StringsTextExtractor() {
        ingester = Server.getIngester();
        extractScripts.add(DEFAULT_SCRIPT);
    }
    @Override
    public boolean setScripts(List<SCRIPT> extractScripts) {
        this.extractScripts.clear();
        this.extractScripts.addAll(extractScripts);
        return true;
    }
    @Override
    public List<SCRIPT> getScripts() {
        return new ArrayList<>(extractScripts);
    }
    @Override
    public int getNumChunks() {
        return this.numChunks;
    }
    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }
    @Override
    public Map<String, String> getOptions() {
        return extractOptions;
    }
    @Override
    public void setOptions(Map<String, String> options) {
        this.extractOptions = options;
    }
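    /**
     * Extracts strings from the source file and indexes them in chunks of up
     * to MAX_STRING_CHUNK_SIZE bytes.
     *
     * @return true on success (or cancellation), false if indexing failed
     */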
    @Override
    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
        this.sourceFile = sourceFile;
        this.numChunks = 0; // unknown until indexing is done
        boolean success = false;

        final boolean extractUTF8 =
                Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
        final boolean extractUTF16 =
                Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));

        if (extractUTF8 == false && extractUTF16 == false) {
            // nothing to extract, treat as success
            return true;
        }
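        // Pick the extraction stream: a Latin-1 optimized stream when only
        // LATIN_1 is requested, otherwise the generic multi-script stream.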
        InputStream stringStream;
        if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
            stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
        } else {
            stringStream = new AbstractFileStringIntStream(
                    sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
        }
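        // Read the string stream in fixed-size chunks and index each chunk,
        // so that very large files never have to be buffered whole.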
        try {
            success = true;
            final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
            long readSize;
            while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
                if (context.fileIngestIsCancelled()) {
                    ingester.ingest(this);
                    return true;
                }
                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                try {
                    chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
                    ++this.numChunks;
                } catch (IngesterException ingEx) {
                    success = false;
                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '"
                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);
                    throw ingEx; // rethrow to signal the error to the caller
                }
            }
            // after all chunks are indexed, ingest the parent file itself
            // (without content) to record the final chunk count
            ingester.ingest(this);
        } catch (IOException ex) {
            logger.log(Level.WARNING,
                    "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex);
            success = false;
        } finally {
            try {
                stringStream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING,
                        "Error closing input stream, file: " + sourceFile.getName(), ex);
            }
        }

        return success;
    }
    @Override
    public boolean isContentTypeSpecific() {
        // strings extraction is not tied to any particular content type
        return false;
    }
    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        // strings extraction can be attempted on any file regardless of format
        return true;
    }
}