Autopsy  3.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
HtmlTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012-2013 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.util.Arrays;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.logging.Level;
34 
40  class HtmlTextExtractor implements TextExtractor {
41 
42  private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
43  private static Ingester ingester;
44  static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
45  static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
46  private static final int SINGLE_READ_CHARS = 1024;
47  private static final int EXTRA_CHARS = 128; //for whitespace
48  private static final int MAX_SIZE = 50000000;
49  //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
50  private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
51  private KeywordSearchIngestModule module;
52  private AbstractFile sourceFile;
53  private int numChunks = 0;
54 
55  static final List<String> WEB_MIME_TYPES = Arrays.asList(
56  "application/javascript", //NON-NLS
57  "application/xhtml+xml", //NON-NLS
58  "application/json", //NON-NLS
59  "text/css", //NON-NLS
60  "text/html", //NON-NLS NON-NLS
61  "text/javascript" //NON-NLS
62  //"application/xml",
63  //"application/xml-dtd",
64  );
65 
66  HtmlTextExtractor(KeywordSearchIngestModule module) {
67  this.module = module;
68  ingester = Server.getIngester();
69  }
70 
71  @Override
72  public boolean setScripts(List<SCRIPT> extractScripts) {
73  return false;
74  }
75 
76  @Override
77  public List<SCRIPT> getScripts() {
78  return null;
79  }
80 
81  @Override
82  public Map<String, String> getOptions() {
83  return null;
84  }
85 
86  @Override
87  public void setOptions(Map<String, String> options) {
88  }
89 
90  @Override
91  public int getNumChunks() {
92  return numChunks;
93  }
94 
95  @Override
96  public AbstractFile getSourceFile() {
97  return sourceFile;
98  }
99 
100  @Override
101  public boolean index(AbstractFile sourceFile) throws IngesterException {
102  this.sourceFile = sourceFile;
103  numChunks = 0; //unknown until indexing is done
104 
105  boolean success = false;
106  Reader reader = null;
107 
108  final InputStream stream = new ReadContentInputStream(sourceFile);
109 
110  try {
111  // Parse the stream with Jericho
112  JerichoParserWrapper jpw = new JerichoParserWrapper(stream);
113  jpw.parse();
114  reader = jpw.getReader();
115 
116  // In case there is an exception or parse() isn't called
117  if (reader == null) {
118  logger.log(Level.WARNING, "No reader available from HTML parser"); //NON-NLS
119  return false;
120  }
121 
122  success = true;
123  long readSize;
124  long totalRead = 0;
125  boolean eof = false;
126  //we read max 1024 chars at time, this seems to max what this Reader would return
127  while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
128  totalRead += readSize;
129 
130  //consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
131  while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
132  && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
133  totalRead += readSize;
134  }
135  if (readSize == -1) {
136  //this is the last chunk
137  eof = true;
138  } else {
139  //try to read until whitespace to not break words
140  while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
141  && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
142  && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
143  totalRead += readSize;
144  }
145  if (readSize == -1) {
146  //this is the last chunk
147  eof = true;
148  }
149  }
150 
151  //logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
152  //encode to bytes to index as byte stream
153  String extracted;
154 
155  //add BOM and trim the 0 bytes
156  //set initial size to chars read + bom - try to prevent from resizing
157  StringBuilder sb = new StringBuilder((int) totalRead + 1000);
158  //inject BOM here (saves byte buffer realloc later), will be converted to specific encoding BOM
159  //sb.append(UTF16BOM); disabled BOM, not needing as bypassing Tika
160  if (totalRead < MAX_EXTR_TEXT_CHARS) {
161  sb.append(textChunkBuf, 0, (int) totalRead);
162  } else {
163  sb.append(textChunkBuf);
164  }
165 
166  //reset for next chunk
167  totalRead = 0;
168  extracted = sb.toString();
169 
170  //converts BOM automatically to charSet encoding
171  byte[] encodedBytes = extracted.getBytes(outCharset);
172  AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
173  try {
174  chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
175  ++this.numChunks;
176  } catch (Ingester.IngesterException ingEx) {
177  success = false;
178  logger.log(Level.WARNING, "Ingester had a problem with extracted HTML from file '" //NON-NLS
179  + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
180  throw ingEx; //need to rethrow/return to signal error and move on
181  }
182  }
183  } catch (IOException ex) {
184  logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
185  success = false;
186  } catch (Exception ex) {
187  logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
188  success = false;
189  } finally {
190  try {
191  stream.close();
192  } catch (IOException ex) {
193  logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex); //NON-NLS
194  }
195  try {
196  if (reader != null) {
197  reader.close();
198  }
199  } catch (IOException ex) {
200  logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
201  }
202  }
203 
204  //after all chunks, ingest the parent file without content itself, and store numChunks
205  ingester.ingest(this);
206 
207  return success;
208  }
209 
210  @Override
211  public boolean isContentTypeSpecific() {
212  return true;
213  }
214 
215  @Override
216  public boolean isSupported(AbstractFile file, String detectedFormat) {
217  if (detectedFormat == null) {
218  return false;
219  } else if (WEB_MIME_TYPES.contains(detectedFormat) && file.getSize() <= MAX_SIZE) {
220  return true;
221  } else {
222  return false;
223  }
224 
225  }
226 }

Copyright © 2012-2015 Basis Technology. Generated on: Mon Oct 19 2015
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.