19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.util.ArrayList;
26 import java.util.Arrays;
27 import java.util.Collections;
28 import java.util.List;
31 import java.util.concurrent.ExecutorService;
32 import java.util.concurrent.Executors;
33 import java.util.concurrent.Future;
34 import java.util.concurrent.TimeUnit;
35 import java.util.concurrent.TimeoutException;
36 import java.util.logging.Level;
38 import org.openide.util.NbBundle;
42 import org.apache.tika.Tika;
43 import org.apache.tika.metadata.Metadata;
44 import org.apache.tika.mime.MediaType;
45 import org.apache.tika.parser.ParseContext;
// NOTE(review): this chunk is a lossy extraction — the original source
// line numbers (59, 61, 62, ...) are fused into the text and many lines
// are missing. Comments below describe only what the visible code shows.
// Extractor that obtains indexed text from files by running them through
// Apache Tika, chunking the output for the keyword-search ingester.
59 class TikaTextExtractor
implements TextExtractor {
// Class-wide logger.
61 private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
// NOTE(review): static field, but it is (re)assigned in the instance
// constructor below — last-constructed instance wins. Presumably
// Server.getIngester() returns a singleton so this is benign — TODO confirm.
62 private static Ingester ingester;
// Charset used when encoding extracted text for the index.
63 private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
// Maximum extracted-text characters held per chunk buffer (512 KiB of chars).
64 private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
// Number of characters pulled from the Tika reader per read() call.
65 private static final int SINGLE_READ_CHARS = 1024;
// Slack kept at the end of the buffer so a chunk can be extended to a
// whitespace boundary (see the single-char read loop in index()).
66 private static final int EXTRA_CHARS = 128;
// Reusable per-instance buffer for one chunk of extracted text.
68 private final char[] textChunkBuf =
new char[MAX_EXTR_TEXT_CHARS];
// Owning ingest module; assumed to be assigned in the constructor on a
// line missing from this extraction — TODO confirm against full source.
69 private KeywordSearchIngestModule module;
// File currently being indexed (set at the top of index()).
70 private AbstractFile sourceFile;
// Count of chunks ingested so far for the current file.
71 private int numChunks = 0;
// Single-thread executor used to bound Tika parsing with a timeout.
72 private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
// MIME types Tika's default parser reports as supported, as "type/subtype"
// strings; populated once in the constructor.
// NOTE(review): non-static field with a CONSTANT_CASE name — naming
// convention violation in the original.
73 private final List<String> TIKA_SUPPORTED_TYPES =
new ArrayList<>();
// Builds the extractor and caches the set of MIME types Tika can parse.
// NOTE(review): the `module` parameter is never assigned to the `module`
// field in the visible lines — the assignment (original line ~76) appears
// to be missing from this extraction; verify against the full source.
// NOTE(review): assigns the STATIC `ingester` field from an instance
// constructor — every new instance overwrites it.
75 TikaTextExtractor(KeywordSearchIngestModule module) {
77 ingester = Server.getIngester();
// Ask Tika's composite default parser which media types it supports.
79 Set<MediaType> mediaTypes =
new Tika().getParser().getSupportedTypes(
new ParseContext());
// Flatten each MediaType into a "type/subtype" string for fast
// membership checks in isSupported().
80 for (MediaType mt : mediaTypes) {
81 TIKA_SUPPORTED_TYPES.add(mt.getType() +
"/" + mt.getSubtype());
// --- TextExtractor interface methods ---------------------------------
// NOTE(review): the bodies of all six methods below are missing from this
// extraction; only the signatures survived. Presumably (TODO confirm):
// script/option accessors are no-ops or trivial getters for a Tika-based
// extractor, getNumChunks() returns `numChunks`, and getSourceFile()
// returns `sourceFile`.
// Scripts are a string-extraction concept; likely ignored by Tika.
87 public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
// Likely returns an empty/ignored script list — TODO confirm.
92 public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
// Extractor options accessor — body not visible.
97 public Map<String, String> getOptions() {
// Extractor options mutator — body not visible.
102 public void setOptions(Map<String, String> options) {
// Number of chunks produced for the current file — body not visible.
106 public int getNumChunks() {
// The file most recently passed to index() — body not visible.
111 public AbstractFile getSourceFile() {
// Extracts text from `sourceFile` via Tika and feeds it to the ingester
// in chunks of at most MAX_EXTR_TEXT_CHARS characters.
//
// NOTE(review): many lines of this method are missing from this
// extraction (the `try` keywords matching the visible `catch` blocks,
// loop headers, `finally`, and closing braces). Comments below are
// limited to what the surviving lines show.
//
// @param sourceFile file whose content should be extracted and indexed
// @return success flag (initialized to false; the assignment to true is
//         not visible in this extraction)
// @throws Ingester.IngesterException on parse timeout, parse failure, or
//         chunk-ingest failure
116 public boolean index(AbstractFile sourceFile)
throws Ingester.IngesterException {
117 this.sourceFile = sourceFile;
120 boolean success =
false;
121 Reader reader = null;
// Stream over the file's content as stored in the case database.
122 final InputStream stream =
new ReadContentInputStream(sourceFile);
// Tika fills this with document metadata during parse; it is appended
// to the extracted text further below.
124 Metadata meta =
new Metadata();
127 Tika tika =
new Tika();
// Run the Tika parse on a dedicated thread so it can be abandoned if it
// exceeds a size-proportional timeout (Tika can hang on pathological
// input).
128 ParseRequestTask parseTask =
new ParseRequestTask(tika, stream, meta, sourceFile);
129 final Future<?> future = tikaParseExecutor.submit(parseTask);
// Timeout scales with file size via Ingester.getTimeout().
131 future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
132 }
catch (TimeoutException te) {
133 final String msg = NbBundle.getMessage(this.getClass(),
134 "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
135 sourceFile.getId(), sourceFile.getName());
136 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
137 logger.log(Level.WARNING, msg);
// NOTE(review): rethrows with message only — the TimeoutException cause
// is dropped (it is at least logged above).
138 throw new IngesterException(msg);
139 }
catch (Exception ex) {
140 final String msg = NbBundle.getMessage(this.getClass(),
141 "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
142 sourceFile.getId(), sourceFile.getName());
143 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
144 logger.log(Level.WARNING, msg);
// NOTE(review): cause dropped here as well.
145 throw new IngesterException(msg);
// The parse task exposes the Reader Tika produced (null if parse failed).
149 reader = parseTask.getReader();
150 if (reader == null) {
152 logger.log(Level.WARNING,
"No reader available from Tika parse");
// --- chunk fill loop (loop header not visible in this extraction) ---
// First read of up to SINGLE_READ_CHARS characters into the chunk buffer.
164 readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
165 if (readSize == -1) {
169 totalRead += readSize;
// Keep reading SINGLE_READ_CHARS at a time, leaving SINGLE_READ_CHARS +
// EXTRA_CHARS of headroom so the whitespace-boundary loop below cannot
// overrun the buffer.
172 while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
173 && (readSize = reader.read(textChunkBuf, (
int) totalRead, SINGLE_READ_CHARS)) != -1) {
174 totalRead += readSize;
176 if (readSize == -1) {
// Extend the chunk one character at a time until it ends on whitespace,
// so words are not split across chunk boundaries (keyword hits would be
// lost otherwise).
181 while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
182 && !Character.isWhitespace(textChunkBuf[(
int) totalRead - 1])
183 && (readSize = reader.read(textChunkBuf, (
int) totalRead, 1)) != -1) {
184 totalRead += readSize;
186 if (readSize == -1) {
// Build the chunk string; +1000 headroom presumably for the metadata
// appendix added below — TODO confirm.
197 StringBuilder sb =
new StringBuilder((
int) totalRead + 1000);
200 if (totalRead < MAX_EXTR_TEXT_CHARS) {
201 sb.append(textChunkBuf, 0, (
int) totalRead);
// Buffer completely full: append the whole array.
203 sb.append(textChunkBuf);
// Append sorted document metadata (presumably only on the final chunk;
// the guarding condition is not visible in this extraction).
// NOTE(review): Arrays.asList returns a fixed-size list, but sorting in
// place is permitted, so Collections.sort here is legal.
212 List<String> sortedKeyList = Arrays.asList(meta.names());
213 Collections.sort(sortedKeyList);
214 sb.append(
"\n\n------------------------------METADATA------------------------------\n\n");
215 for (String key : sortedKeyList) {
216 String value = meta.get(key);
217 sb.append(key).append(
": ").append(value).append(
"\n");
221 extracted = sb.toString();
// Encode and hand the chunk to the ingester; chunk ids are 1-based.
225 byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET);
226 AbstractFileChunk chunk =
new AbstractFileChunk(
this, this.numChunks + 1);
228 chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
230 }
catch (Ingester.IngesterException ingEx) {
232 logger.log(Level.WARNING,
"Ingester had a problem with extracted strings from file '"
233 + sourceFile.getName() +
"' (id: " + sourceFile.getId() +
").", ingEx);
237 }
catch (IOException ex) {
238 final String msg =
"Exception: Unable to read Tika content stream from " + sourceFile.getId() +
": " + sourceFile.getName();
239 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
240 logger.log(Level.WARNING, msg);
242 }
catch (Exception ex) {
243 final String msg =
"Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() +
": " + sourceFile.getName();
244 KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
245 logger.log(Level.WARNING, msg);
// --- cleanup (presumably a finally block; header not visible) ---
250 }
catch (IOException ex) {
251 logger.log(Level.WARNING,
"Unable to close Tika content stream from " + sourceFile.getId(), ex);
254 if (reader != null) {
257 }
catch (IOException ex) {
258 logger.log(Level.WARNING,
"Unable to close content reader from " + sourceFile.getId(), ex);
// Commit/flush this file's accumulated chunks with the ingester.
263 ingester.ingest(
this);
// Whether this extractor handles specific content types (body missing
// from this extraction; presumably returns true, since isSupported()
// below filters by detected MIME type — TODO confirm).
269 public boolean isContentTypeSpecific() {
// Decides whether Tika extraction should be attempted for a file, based
// on its detected MIME type.
// NOTE(review): the return statements inside the branches below are
// missing from this extraction; presumably each early branch returns
// false (unknown, generic binary, archives handled elsewhere, video
// except flv, ttf fonts) before the final supported-types lookup —
// TODO confirm against full source.
274 public boolean isSupported(AbstractFile file, String detectedFormat) {
275 if (detectedFormat == null) {
277 }
// Generic binary types: nothing useful for Tika to parse.
else if (detectedFormat.equals(
"application/octet-stream")
278 || detectedFormat.equals(
"application/x-msdownload")) {
281 }
// Archives are unpacked and handled by a different ingest path.
else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
// Skip video, except flash video which Tika can mine for embedded text.
284 else if (detectedFormat.contains(
"video/")
285 && !detectedFormat.equals(
"video/x-flv")) {
287 }
// Font files carry no indexable text.
else if (detectedFormat.contains(
"application/x-font-ttf")) {
// Otherwise defer to the list of types Tika reported at construction.
297 return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
// --- fragment of ParseRequestTask (class/method headers not visible) ---
// Presumably the body of ParseRequestTask.run(): performs the actual
// Tika parse on the executor thread, storing the resulting Reader for
// index() to pick up via getReader() — TODO confirm.
325 reader = tika.parse(stream, meta);
326 }
catch (IOException ex) {
// NOTE(review): log message lacks a separator before the file id
// ("...the content1234"), and this catch body is identical to the
// broader one below — candidates for cleanup in the full source.
327 KeywordSearch.getTikaLogger().log(Level.WARNING,
"Exception: Unable to Tika parse the content" + sourceFile.
getId() +
": " + sourceFile.
getName(), ex);
330 }
catch (Exception ex) {
331 KeywordSearch.getTikaLogger().log(Level.WARNING,
"Exception: Unable to Tika parse the content" + sourceFile.
getId() +
": " + sourceFile.
getName(), ex);