Autopsy  4.10.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
HtmlTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.textextractors;
20 
21 import java.io.IOException;
22 import java.io.Reader;
23 import java.io.StringReader;
24 import java.util.Arrays;
25 import java.util.List;
26 import java.util.logging.Level;
27 import net.htmlparser.jericho.Attributes;
28 import net.htmlparser.jericho.Config;
29 import net.htmlparser.jericho.LoggerProvider;
30 import net.htmlparser.jericho.Renderer;
31 import net.htmlparser.jericho.Source;
32 import net.htmlparser.jericho.StartTag;
33 import net.htmlparser.jericho.StartTagType;
35 import org.sleuthkit.datamodel.AbstractFile;
36 import org.sleuthkit.datamodel.ReadContentInputStream;
37 
41 final class HtmlTextExtractor implements TextExtractor {
42 
43  static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
44  private final int MAX_SIZE;
45  private final AbstractFile file;
46 
47  static final List<String> WEB_MIME_TYPES = Arrays.asList(
48  "application/javascript", //NON-NLS
49  "application/xhtml+xml", //NON-NLS
50  "application/json", //NON-NLS
51  "text/css", //NON-NLS
52  "text/html", //NON-NLS NON-NLS
53  "text/javascript" //NON-NLS
54  );
55 
56  static {
57  // Disable Jericho HTML Parser log messages.
58  Config.LoggerProvider = LoggerProvider.DISABLED;
59  }
60 
65  public HtmlTextExtractor(AbstractFile file) {
66  //Set default to be 50 MB.
67  MAX_SIZE = 50_000_000;
68  this.file = file;
69  }
70 
79  @Override
80  public boolean isSupported() {
81  return file.getMIMEType() != null
82  && WEB_MIME_TYPES.contains(file.getMIMEType())
83  && file.getSize() <= MAX_SIZE;
84  }
85 
95  @Override
96  public Reader getReader() throws InitReaderException {
97  //TODO JIRA-4467, there is only harm in excluding HTML documents greater
98  //than 50MB due to our troubled approach of extraction.
99  ReadContentInputStream stream = new ReadContentInputStream(file);
100 
101  //Parse the stream with Jericho and put the results in a Reader
102  try {
103  StringBuilder scripts = new StringBuilder();
104  StringBuilder links = new StringBuilder();
105  StringBuilder images = new StringBuilder();
106  StringBuilder comments = new StringBuilder();
107  StringBuilder others = new StringBuilder();
108  int numScripts = 0;
109  int numLinks = 0;
110  int numImages = 0;
111  int numComments = 0;
112  int numOthers = 0;
113 
114  Source source = new Source(stream);
115  source.fullSequentialParse();
116  Renderer renderer = source.getRenderer();
117  renderer.setNewLine("\n");
118  renderer.setIncludeHyperlinkURLs(false);
119  renderer.setDecorateFontStyles(false);
120  renderer.setIncludeAlternateText(false);
121 
122  String text = renderer.toString();
123  // Get all the tags in the source
124  List<StartTag> tags = source.getAllStartTags();
125 
126  StringBuilder stringBuilder = new StringBuilder();
127  for (StartTag tag : tags) {
128  if (tag.getName().equals("script")) { //NON-NLS
129  // If the <script> tag has attributes
130  numScripts++;
131  scripts.append(numScripts).append(") ");
132  if (tag.getTagContent().length() > 0) {
133  scripts.append(tag.getTagContent()).append(" ");
134  }
135  // Get whats between the <script> .. </script> tags
136  scripts.append(tag.getElement().getContent()).append("\n");
137 
138  } else if (tag.getName().equals("a")) {
139  //NON-NLS
140  numLinks++;
141  links.append(numLinks).append(") ");
142  links.append(tag.getTagContent()).append("\n");
143 
144  } else if (tag.getName().equals("img")) {
145  //NON-NLS
146  numImages++;
147  images.append(numImages).append(") ");
148  images.append(tag.getTagContent()).append("\n");
149 
150  } else if (tag.getTagType().equals(StartTagType.COMMENT)) {
151  numComments++;
152  comments.append(numComments).append(") ");
153  comments.append(tag.getTagContent()).append("\n");
154 
155  } else {
156  // Make sure it has an attribute
157  Attributes atts = tag.getAttributes();
158  if (atts != null && atts.length() > 0) {
159  numOthers++;
160  others.append(numOthers).append(") ");
161  others.append(tag.getName()).append(":");
162  others.append(tag.getTagContent()).append("\n");
163 
164  }
165  }
166  }
167  stringBuilder.append(text).append("\n\n");
168  stringBuilder.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS
169  if (numScripts > 0) {
170  stringBuilder.append("---Scripts---\n"); //NON-NLS
171  stringBuilder.append(scripts).append("\n");
172  }
173  if (numLinks > 0) {
174  stringBuilder.append("---Links---\n"); //NON-NLS
175  stringBuilder.append(links).append("\n");
176  }
177  if (numImages > 0) {
178  stringBuilder.append("---Images---\n"); //NON-NLS
179  stringBuilder.append(images).append("\n");
180  }
181  if (numComments > 0) {
182  stringBuilder.append("---Comments---\n"); //NON-NLS
183  stringBuilder.append(comments).append("\n");
184  }
185  if (numOthers > 0) {
186  stringBuilder.append("---Others---\n"); //NON-NLS
187  stringBuilder.append(others).append("\n");
188  }
189  // All done, now make it a reader
190  return new StringReader(stringBuilder.toString());
191  } catch (IOException ex) {
192  logger.log(Level.WARNING, "Error extracting HTML from content.", ex);
193  throw new InitReaderException("Error extracting HTML from content.", ex);
194  }
195  }
196 }

Copyright © 2012-2018 Basis Technology. Generated on: Fri Mar 22 2019
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.