package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
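
/**
 * Extractor of text from HTML-type AbstractFile content. Extracted text is
 * divided into chunks of at most MAX_EXTR_TEXT_CHARS characters and indexed
 * with Solr via the Ingester.
 */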
class HtmlTextExtractor implements TextExtractor {
    private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
    private static Ingester ingester;
    static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    static final int MAX_EXTR_TEXT_CHARS = 512 * 1024; //max chars per indexed text chunk
    private static final int SINGLE_READ_CHARS = 1024; //chars consumed per reader.read() call
    private static final int EXTRA_CHARS = 128; //slack left at the end of a chunk to finish a word
    private static final int MAX_SIZE = 50000000; //max file size in bytes accepted by this extractor
    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
    private KeywordSearchIngestModule module;
    private AbstractFile sourceFile;
    private int numChunks = 0;
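
    //web MIME types, as reported by file type detection, that this extractor handles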
    static final List<String> WEB_MIME_TYPES = Arrays.asList(
            "application/javascript",
            "application/xhtml+xml",
            "application/json",
            "text/css",
            "text/html",
            "text/javascript"
    );
    HtmlTextExtractor(KeywordSearchIngestModule module) {
        this.module = module;
        ingester = Server.getIngester();
    }
    @Override
    public boolean setScripts(List<SCRIPT> extractScripts) {
        //script selection is not applicable to HTML extraction
        return false;
    }

    @Override
    public List<SCRIPT> getScripts() {
        return null;
    }

    @Override
    public Map<String, String> getOptions() {
        return null;
    }

    @Override
    public void setOptions(Map<String, String> options) {
    }

    @Override
    public int getNumChunks() {
        return numChunks;
    }

    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }
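
    /**
     * Parses the file with Jericho, reads the extracted text in chunks of at
     * most MAX_EXTR_TEXT_CHARS, and indexes each chunk. Returns true only if
     * the whole file was read and indexed without error.
     */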
    @Override
    public boolean index(AbstractFile sourceFile) throws IngesterException {
        this.sourceFile = sourceFile;
        this.numChunks = 0; //unknown until indexing is done

        boolean success = false;
        Reader reader = null;
        final InputStream stream = new ReadContentInputStream(sourceFile);
        try {
            //parse the stream with Jericho and get a reader over the extracted text
            JerichoParserWrapper jpw = new JerichoParserWrapper(stream);
            jpw.parse();
            reader = jpw.getReader();
            //in case parsing failed or threw an exception
            if (reader == null) {
                logger.log(Level.WARNING, "No reader available from HTML parser");
                return false;
            }

            success = true;
            long readSize;
            long totalRead = 0;
            boolean eof = false;
            while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
                totalRead += readSize;

                //consume more chars to fill the chunk, leaving EXTRA_CHARS of slack to end the word
                while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                    totalRead += readSize;
                }
                if (readSize == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    //read until whitespace so the chunk does not end mid-word
                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                        totalRead += readSize;
                    }
                    if (readSize == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }
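
                //copy the chunk into a String; size the builder up front to avoid resizing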
                String extracted;
                StringBuilder sb = new StringBuilder((int) totalRead + 1000);
                if (totalRead < MAX_EXTR_TEXT_CHARS) {
                    sb.append(textChunkBuf, 0, (int) totalRead);
                } else {
                    sb.append(textChunkBuf);
                }

                //reset the counter for the next chunk
                totalRead = 0;
                extracted = sb.toString();
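
                //encode to bytes so the chunk can be indexed as a byte stream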
                byte[] encodedBytes = extracted.getBytes(outCharset);
                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                try {
                    chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
                    ++this.numChunks;
                } catch (Ingester.IngesterException ingEx) {
                    success = false;
                    logger.log(Level.WARNING, "Ingester had a problem with extracted HTML from file '"
                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx);
                    throw ingEx; //rethrow to signal the error to the caller
                }
            }
        } catch (IOException ex) {
            logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId()
                    + ": " + sourceFile.getName(), ex);
            success = false;
        } catch (Exception ex) {
            logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceFile.getId()
                    + ": " + sourceFile.getName(), ex);
            success = false;
        } finally {
            try {
                stream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex);
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex);
            }
        }
        //after all chunks, ingest the parent file itself so the chunk count is stored
        ingester.ingest(this);

        return success;
    }
    @Override
    public boolean isContentTypeSpecific() {
        return true;
    }
    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null) {
            return false;
        } else if (WEB_MIME_TYPES.contains(detectedFormat) && file.getSize() <= MAX_SIZE) {
            return true;
        } else {
            return false;
        }
    }
}