Autopsy 4.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringsTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2014 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.nio.charset.Charset;
24 import java.util.ArrayList;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.logging.Level;
32 import org.sleuthkit.datamodel.AbstractFile;
33 
38 class StringsTextExtractor implements TextExtractor {
39 
40  private static Ingester ingester;
41  private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
42  private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
43  //private static final int BOM_LEN = 3;
44  private static final int BOM_LEN = 0; //disabled prepending of BOM
45  private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
46  private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
47  private KeywordSearchIngestModule module;
48  private AbstractFile sourceFile;
49  private int numChunks = 0;
50  private final List<SCRIPT> extractScripts = new ArrayList<>();
51  private Map<String, String> extractOptions = new HashMap<>();
52 
53  //disabled prepending of BOM
54  //static {
55  //prepend UTF-8 BOM to start of the buffer
56  //stringChunkBuf[0] = (byte) 0xEF;
57  //stringChunkBuf[1] = (byte) 0xBB;
58  //stringChunkBuf[2] = (byte) 0xBF;
59  //}
60  public StringsTextExtractor(KeywordSearchIngestModule module) {
61  this.module = module;
62  ingester = Server.getIngester();
63  extractScripts.add(DEFAULT_SCRIPT);
64  }
65 
66  @Override
67  public boolean setScripts(List<SCRIPT> extractScripts) {
68  this.extractScripts.clear();
69  this.extractScripts.addAll(extractScripts);
70  return true;
71  }
72 
73  @Override
74  public List<SCRIPT> getScripts() {
75  return new ArrayList<>(extractScripts);
76  }
77 
78  @Override
79  public int getNumChunks() {
80  return this.numChunks;
81  }
82 
83  @Override
84  public AbstractFile getSourceFile() {
85  return sourceFile;
86  }
87 
88  @Override
89  public Map<String, String> getOptions() {
90  return extractOptions;
91  }
92 
93  @Override
94  public void setOptions(Map<String, String> options) {
95  this.extractOptions = options;
96  }
97 
98  @Override
99  public boolean index(AbstractFile sourceFile) throws IngesterException {
100  this.sourceFile = sourceFile;
101  this.numChunks = 0; //unknown until indexing is done
102  boolean success = false;
103 
104  final boolean extractUTF8
105  = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
106 
107  final boolean extractUTF16
108  = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
109 
110  if (extractUTF8 == false && extractUTF16 == false) {
111  //nothing to do
112  return true;
113  }
114 
115  InputStream stringStream;
116  //check which extract stream to use
117  if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
118  //optimal for english, english only
119  stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
120  } else {
121  stringStream = new AbstractFileStringIntStream(
122  sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
123  }
124 
125  try {
126  success = true;
127  //break input stream into chunks
128 
129  final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
130  long readSize;
131  while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
132  //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks+1));
133  //debug.write(stringChunkBuf, 0, (int)readSize);
134 
135  AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
136 
137  try {
138  chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
139  ++this.numChunks;
140  } catch (IngesterException ingEx) {
141  success = false;
142  logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
143  throw ingEx; //need to rethrow/return to signal error and move on
144  }
145 
146  //debug.close();
147  }
148 
149  //after all chunks, ingest the parent file without content itself, and store numChunks
150  ingester.ingest(this);
151 
152  } catch (IOException ex) {
153  logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex); //NON-NLS
154  success = false;
155  } finally {
156  try {
157  stringStream.close();
158  } catch (IOException ex) {
159  logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex); //NON-NLS
160  }
161  }
162 
163  return success;
164  }
165 
166  @Override
167  public boolean isContentTypeSpecific() {
168  return true;
169  }
170 
171  @Override
172  public boolean isSupported(AbstractFile file, String detectedFormat) {
173  // strings can be run on anything.
174  return true;
175  }
176 }

Copyright © 2012-2015 Basis Technology. Generated on: Wed Apr 6 2016
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.