Autopsy 3.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
TikaTextExtractor.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012-2013 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;

import org.openide.util.NbBundle;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

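/**
 * Extracts text from AbstractFile content using Apache Tika, breaks the
 * extracted text into chunks, and indexes each chunk with the keyword search
 * ingester. Tika parsing runs on a dedicated single-thread executor so that a
 * parse of unexpected or corrupt content can be abandoned after a timeout.
 */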
class TikaTextExtractor implements TextExtractor {

    private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
    private static Ingester ingester;
    private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128; //headroom so a chunk can end on a word boundary
    //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
    private KeywordSearchIngestModule module;
    private AbstractFile sourceFile; //file currently being processed
    private int numChunks = 0;
    private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
    private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();

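    /**
     * Builds the extractor for the given ingest module and caches the list of
     * MIME types that the default Tika parser reports as supported.
     */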
    TikaTextExtractor(KeywordSearchIngestModule module) {
        this.module = module;
        ingester = Server.getIngester();

        Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
        for (MediaType mt : mediaTypes) {
            TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
        }
        //logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES); //NON-NLS
    }

    @Override
    public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
        return false;
    }

    @Override
    public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
        return null;
    }

    @Override
    public Map<String, String> getOptions() {
        return null;
    }

    @Override
    public void setOptions(Map<String, String> options) {
    }

    @Override
    public int getNumChunks() {
        return numChunks;
    }

    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }

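    /**
     * Parses the file with Tika on a worker thread, bounded by a size-based
     * timeout, then reads the extracted text in chunks of up to
     * MAX_EXTR_TEXT_CHARS. Each chunk is extended to the next whitespace so
     * words are not split, the Tika metadata is appended to the final chunk,
     * and every chunk is encoded and handed to the ingester.
     */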
    @Override
    public boolean index(AbstractFile sourceFile) throws Ingester.IngesterException {
        this.sourceFile = sourceFile;
        numChunks = 0; //unknown until indexing is done

        boolean success = false;
        Reader reader = null;
        final InputStream stream = new ReadContentInputStream(sourceFile);
        try {
            Metadata meta = new Metadata();

            //parse the file in a separate task so the parse can be timed out
            Tika tika = new Tika(); //new Tika instance for every file, to work around Tika memory issues
            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
            final Future<?> future = tikaParseExecutor.submit(parseTask);
            try {
                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
            } catch (TimeoutException te) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            } catch (Exception ex) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            }

            //get the reader over the parse results
            reader = parseTask.getReader();
            if (reader == null) {
                //likely due to an exception in parse()
                logger.log(Level.WARNING, "No reader available from Tika parse"); //NON-NLS
                return false;
            }

            //break the results into chunks and index them
            success = true;
            long readSize;
            long totalRead = 0;
            boolean eof = false;
            //read at most SINGLE_READ_CHARS chars per call; this seems to be the most this Reader returns at a time
            while (!eof) {
                readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
                if (readSize == -1) {
                    eof = true;
                } else {
                    totalRead += readSize;
                }
                //keep reading to fill the chunk, leaving EXTRA_CHARS of headroom so the last word can be finished
                while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                    totalRead += readSize;
                }
                if (readSize == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    //read char-by-char until whitespace so words are not split across chunks
                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                        totalRead += readSize;
                    }
                    if (readSize == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }

                //logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
                //encode to bytes to index as a byte stream
                String extracted;
                //initial capacity: chars read plus rough room for the metadata block, to avoid resizing
                StringBuilder sb = new StringBuilder((int) totalRead + 1000);
                //sb.append(UTF16BOM); disabled prepending of BOM
                if (totalRead < MAX_EXTR_TEXT_CHARS) {
                    sb.append(textChunkBuf, 0, (int) totalRead);
                } else {
                    sb.append(textChunkBuf);
                }

                //reset for the next chunk
                totalRead = 0;

                //append the Tika metadata if this is the last chunk
                if (eof) {
                    //sort the metadata keys for stable output
                    List<String> sortedKeyList = Arrays.asList(meta.names());
                    Collections.sort(sortedKeyList);
                    sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
                    for (String key : sortedKeyList) {
                        String value = meta.get(key);
                        sb.append(key).append(": ").append(value).append("\n");
                    }
                }

                extracted = sb.toString();

                //a BOM, if present, is converted to the output charset encoding automatically
                byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET);
                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                try {
                    chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
                    ++this.numChunks;
                } catch (Ingester.IngesterException ingEx) {
                    success = false;
                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" //NON-NLS
                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
                    throw ingEx; //rethrow to signal the error and move on
                }
            }
        } catch (IOException ex) {
            final String msg = "Exception: Unable to read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } catch (Exception ex) {
            final String msg = "Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } finally {
            try {
                stream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close Tika content stream from " + sourceFile.getId(), ex); //NON-NLS
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
            }
        }

        //after all chunks, ingest the parent file itself (no content), which stores numChunks
        ingester.ingest(this);

        return success;
    }

    @Override
    public boolean isContentTypeSpecific() {
        return true;
    }

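    /**
     * Decides whether this extractor should handle a file: generic binary
     * blobs, archives, non-FLV video, and TrueType fonts are rejected;
     * anything else is accepted only if Tika reports the type as supported.
     */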
    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null) {
            return false;
        } else if (detectedFormat.equals("application/octet-stream") //NON-NLS
                || detectedFormat.equals("application/x-msdownload")) { //NON-NLS
            //binary unstructured blobs (string extraction will be used instead)
            return false;
        } else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
            return false;
        } else if (detectedFormat.contains("video/") //NON-NLS
                && !detectedFormat.equals("video/x-flv")) { //NON-NLS
            //skip video other than FLV (Tika supports only FLV video)
            return false;
        } else if (detectedFormat.contains("application/x-font-ttf")) { //NON-NLS
            //Tika currently has a bug in the fontbox TTF parser;
            //it can throw an out-of-memory exception
            return false;
        }

        //TODO might need to add more mime-types to ignore

        //otherwise, accept all formats supported by Tika
        return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
    }

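    /**
     * Runnable task that performs the Tika parse, meant to be run on a
     * separate thread so the caller can enforce a timeout and give up on a
     * hung parse.
     */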
    private static class ParseRequestTask implements Runnable {

        //in
        private Tika tika;
        private InputStream stream;
        private Metadata meta;
        private AbstractFile sourceFile;
        //out
        private Reader reader;

        ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
            this.tika = tika;
            this.stream = stream;
            this.meta = meta;
            this.sourceFile = sourceFile;
        }

        @Override
        public void run() {
            try {
                reader = tika.parse(stream, meta);
            } catch (Exception ex) {
                //IOException and runtime parse failures are handled identically
                KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content of file " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
                tika = null;
                reader = null;
            }
        }

        public Reader getReader() {
            return reader;
        }
    }
}
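A minimal usage sketch (hypothetical driver code, not part of this file): assuming a configured KeywordSearchIngestModule, an AbstractFile from the case database, and a MIME type from a file-type detector, the extractor would be driven roughly as follows.

    // Hypothetical sketch; "module", "file", and "mimeType" are assumed inputs.
    TikaTextExtractor extractor = new TikaTextExtractor(module);
    if (extractor.isSupported(file, mimeType)) {
        try {
            boolean ok = extractor.index(file);    // parse, chunk, and index the text
            int chunks = extractor.getNumChunks(); // number of chunks written to the index
        } catch (Ingester.IngesterException ex) {
            // parse timeout or indexing failure; caller may fall back to string extraction
        }
    }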
