package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import org.openide.util.NbBundle;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
/**
 * Extracts text from AbstractFile content using Tika, breaks the text into
 * chunks, and sends each chunk to the ingester for indexing.
 */
class TikaTextExtractor implements TextExtractor {

    private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
    private static Ingester ingester;
    private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //maximum chars per indexed chunk
    private static final int SINGLE_READ_CHARS = 1024; //chars consumed per read from the Tika reader
    private static final int EXTRA_CHARS = 128; //slack left at the end of a chunk to finish a word
    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
    private final KeywordSearchIngestModule module;
    private AbstractFile sourceFile;
    private int numChunks = 0;
    private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
    private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();
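
    // Note: the single-thread executor above serializes Tika parse calls so that
    // each one can be bounded with Future.get(timeout) in index(); a hung parser
    // then surfaces as a TimeoutException instead of stalling ingest. A minimal
    // sketch of the pattern, using the names from this class:
    //
    //   Future<?> f = tikaParseExecutor.submit(parseTask);
    //   f.get(timeoutSeconds, TimeUnit.SECONDS); // throws TimeoutException on a hang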
    TikaTextExtractor(KeywordSearchIngestModule module) {
        this.module = module;
        ingester = Server.getIngester();

        Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
        for (MediaType mt : mediaTypes) {
            TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
        }
    }
    @Override
    public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
        return false; //script selection does not apply to the Tika extractor
    }

    @Override
    public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
        return null; //script selection does not apply to the Tika extractor
    }

    @Override
    public Map<String, String> getOptions() {
        return null; //no configurable options
    }

    @Override
    public void setOptions(Map<String, String> options) {
        //no configurable options
    }

    @Override
    public int getNumChunks() {
        return numChunks;
    }

    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }
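
    // Layout of a single indexed chunk produced by index() below (a sketch; the
    // sizes are the constants declared at the top of this class):
    //
    //   [ up to MAX_EXTR_TEXT_CHARS chars of extracted text, read
    //     SINGLE_READ_CHARS at a time, with EXTRA_CHARS of slack so the chunk
    //     can end on a whitespace boundary instead of splitting a word ]
    //
    //   ------------------------------METADATA------------------------------
    //
    //   key1: value1
    //   key2: value2
    //   ...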
    @Override
    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
        this.sourceFile = sourceFile;
        numChunks = 0; //unknown until indexing is done

        boolean success = false;
        Reader reader = null;
        final InputStream stream = new ReadContentInputStream(sourceFile);
        try {
            Metadata meta = new Metadata();

            //parse the file in a separate task so the parse can be timed out
            Tika tika = new Tika(); //new Tika instance for every file
            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
            final Future<?> future = tikaParseExecutor.submit(parseTask);
            try {
                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
            } catch (TimeoutException te) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            } catch (Exception ex) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            }
            //get the reader holding the parse results
            reader = parseTask.getReader();
            if (reader == null) {
                //likely due to an exception in the parse task
                logger.log(Level.WARNING, "No reader available from Tika parse");
                return false;
            }

            //break the results into chunks and index each chunk
            success = true;
            long readSize;
            long totalRead = 0;
            boolean eof = false;
            while (!eof) {
                //read an initial block
                readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
                if (readSize == -1) {
                    eof = true;
                } else {
                    totalRead += readSize;
                }

                //consume more to fill the chunk, leaving EXTRA_CHARS of slack to finish a word
                while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                    totalRead += readSize;
                }
                if (readSize == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    //read char by char until whitespace so the chunk does not split a word
                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                        totalRead += readSize;
                    }
                    if (readSize == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }

                //replace characters Solr cannot index with '^'
                for (int i = 0; i < totalRead; ++i) {
                    if (!isValidSolrUTF8(textChunkBuf[i])) {
                        textChunkBuf[i] = '^';
                    }
                }

                StringBuilder sb = new StringBuilder((int) totalRead + 1000);
                sb.append(textChunkBuf, 0, (int) totalRead);
205 List<String> sortedKeyList = Arrays.asList(meta.names());
206 Collections.sort(sortedKeyList);
207 sb.append(
"\n\n------------------------------METADATA------------------------------\n\n");
208 for (String key : sortedKeyList) {
209 String value = meta.get(key);
210 sb.append(key).append(
": ").append(value).append(
"\n");
215 byte[] encodedBytes = sb.toString().getBytes(OUTPUT_CHARSET);
216 AbstractFileChunk chunk =
new AbstractFileChunk(
this, this.numChunks + 1);
218 chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
220 }
catch (Ingester.IngesterException ingEx) {
222 logger.log(Level.WARNING,
"Ingester had a problem with extracted strings from file '"
223 + sourceFile.getName() +
"' (id: " + sourceFile.getId() +
").", ingEx);
227 }
        } catch (IOException ex) {
            final String msg = "Exception: Unable to read Tika content stream from " + sourceFile.getId()
                    + ": " + sourceFile.getName();
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } catch (Exception ex) {
            final String msg = "Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId()
                    + ": " + sourceFile.getName();
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } finally {
            try {
                stream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close Tika content stream from " + sourceFile.getId(), ex);
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex);
            }
        }

        //after all chunks are indexed, ingest the parent file itself
        ingester.ingest(this);

        return success;
    }
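
    // Note: Ingester.getTimeout(sourceFile.getSize()) supplies the parse timeout
    // used above; judging by its signature it presumably scales with file size.
    // Chunk ids passed to AbstractFileChunk are 1-based (this.numChunks + 1).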
    private static boolean isValidSolrUTF8(char ch) {
        return ((ch <= 0xFDD0 || ch >= 0xFDEF)
                && (ch > 0x1F || ch == 0x9 || ch == 0xA || ch == 0xD)
                && (ch != 0xFFFF) && (ch != 0xFFFE));
    }
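
    // The predicate above mirrors the XML character restrictions Solr enforces:
    // control characters other than tab (0x9), LF (0xA), and CR (0xD) are
    // rejected, as are the non-characters U+FFFE/U+FFFF and (approximately) the
    // U+FDD0-U+FDEF non-character block. For example:
    //
    //   isValidSolrUTF8('A');      // true  - ordinary text
    //   isValidSolrUTF8('\t');     // true  - tab is allowed
    //   isValidSolrUTF8('\u0001'); // false - control character
    //   isValidSolrUTF8('\uFFFF'); // false - non-character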
    @Override
    public boolean isContentTypeSpecific() {
        return true;
    }

    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null) {
            return false;
        } else if (detectedFormat.equals("application/octet-stream")
                || detectedFormat.equals("application/x-msdownload")) {
            //generic binary blobs; string extraction will be used instead
            return false;
        } else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
            //archive formats are excluded
            return false;
        } else if (detectedFormat.contains("video/")
                && !detectedFormat.equals("video/x-flv")) {
            //skip video other than flv
            return false;
        } else if (detectedFormat.contains("application/x-font-ttf")) {
            //Tika's ttf parsing can exhaust memory
            return false;
        }
        return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
    }
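
    // Illustrative outcomes for the checks above (example MIME strings only):
    //   isSupported(f, null)                       -> false (no detected type)
    //   isSupported(f, "application/octet-stream") -> false (generic binary)
    //   isSupported(f, "video/mp4")                -> false (video, and not flv)
    //   isSupported(f, "video/x-flv")              -> decided by TIKA_SUPPORTED_TYPES
    //   isSupported(f, "application/pdf")          -> true if Tika reports a pdf parser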
    /**
     * Runnable task that parses the content with Tika; submitted to the
     * executor so the parse can be bounded by a timeout.
     */
    private static class ParseRequestTask implements Runnable {

        //in
        private final Tika tika;
        private final InputStream stream;
        private final Metadata meta;
        private final AbstractFile sourceFile;
        //out
        private Reader reader;

        ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
            this.tika = tika;
            this.stream = stream;
            this.meta = meta;
            this.sourceFile = sourceFile;
        }

        @Override
        public void run() {
            try {
                reader = tika.parse(stream, meta);
            } catch (IOException ex) {
                KeywordSearch.getTikaLogger().log(Level.WARNING,
                        "Exception: Unable to Tika parse the content " + sourceFile.getId()
                        + ": " + sourceFile.getName(), ex);
            } catch (Exception ex) {
                KeywordSearch.getTikaLogger().log(Level.WARNING,
                        "Exception: Unable to Tika parse the content " + sourceFile.getId()
                        + ": " + sourceFile.getName(), ex);
            }
        }

        Reader getReader() {
            return reader;
        }
    }
}