19 package org.sleuthkit.autopsy.textreaders;
21 import java.io.IOException;
22 import java.io.Reader;
23 import java.io.StringReader;
24 import java.util.Arrays;
25 import java.util.List;
26 import java.util.logging.Level;
27 import net.htmlparser.jericho.Attributes;
28 import net.htmlparser.jericho.Config;
29 import net.htmlparser.jericho.LoggerProvider;
30 import net.htmlparser.jericho.Renderer;
31 import net.htmlparser.jericho.Source;
32 import net.htmlparser.jericho.StartTag;
33 import net.htmlparser.jericho.StartTagType;
41 final class HtmlTextExtractor
extends TextExtractor {
// Class-wide logger, named after HtmlTextExtractor; used by getReader() to
// report extraction failures.
43 static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
// Largest content.getSize() value that isSupported() accepts; assigned once
// in the constructor.
44 private final int MAX_SIZE;
// Content that getReader() wraps in a ReadContentInputStream and parses.
45 private final Content file;
// Detected formats (MIME type strings) that isSupported() accepts.
47 static final List<String> WEB_MIME_TYPES = Arrays.asList(
48 "application/javascript",
49 "application/xhtml+xml",
// Points Jericho's global Config at the DISABLED logger provider
// (presumably to silence the HTML parser's own log output; confirm).
58 Config.LoggerProvider = LoggerProvider.DISABLED;
/**
 * Creates an HTML text extractor and fixes the size limit checked by
 * isSupported() at 50,000,000.
 *
 * @param file content handed to this extractor
 *             (NOTE(review): presumably stored in the {@code file} field
 *             that getReader() reads; confirm)
 */
65 public HtmlTextExtractor(Content file) {
// Upper bound compared against content.getSize() in isSupported().
67 MAX_SIZE = 50_000_000;
/**
 * Reports whether this extractor accepts the given content.
 *
 * @param content        content whose getSize() is compared against MAX_SIZE
 * @param detectedFormat detected format string looked up in WEB_MIME_TYPES;
 *                       may be null, in which case the result is false
 *
 * @return true only if detectedFormat is non-null, is listed in
 *         WEB_MIME_TYPES, and content.getSize() does not exceed MAX_SIZE
 */
80 public boolean isSupported(Content content, String detectedFormat) {
// The null check comes first so that a missing format short-circuits to false.
81 return detectedFormat != null
82 && WEB_MIME_TYPES.contains(detectedFormat)
83 && content.getSize() <= MAX_SIZE;
96 public Reader getReader() throws ExtractionException {
99 ReadContentInputStream stream =
new ReadContentInputStream(file);
103 StringBuilder scripts =
new StringBuilder();
104 StringBuilder links =
new StringBuilder();
105 StringBuilder images =
new StringBuilder();
106 StringBuilder comments =
new StringBuilder();
107 StringBuilder others =
new StringBuilder();
114 Source source =
new Source(stream);
115 source.fullSequentialParse();
116 Renderer renderer = source.getRenderer();
117 renderer.setNewLine(
"\n");
118 renderer.setIncludeHyperlinkURLs(
false);
119 renderer.setDecorateFontStyles(
false);
120 renderer.setIncludeAlternateText(
false);
122 String text = renderer.toString();
124 List<StartTag> tags = source.getAllStartTags();
126 StringBuilder stringBuilder =
new StringBuilder();
127 for (StartTag tag : tags) {
128 if (tag.getName().equals(
"script")) {
131 scripts.append(numScripts).append(
") ");
132 if (tag.getTagContent().length() > 0) {
133 scripts.append(tag.getTagContent()).append(
" ");
136 scripts.append(tag.getElement().getContent()).append(
"\n");
138 }
else if (tag.getName().equals(
"a")) {
141 links.append(numLinks).append(
") ");
142 links.append(tag.getTagContent()).append(
"\n");
144 }
else if (tag.getName().equals(
"img")) {
147 images.append(numImages).append(
") ");
148 images.append(tag.getTagContent()).append(
"\n");
150 }
else if (tag.getTagType().equals(StartTagType.COMMENT)) {
152 comments.append(numComments).append(
") ");
153 comments.append(tag.getTagContent()).append(
"\n");
157 Attributes atts = tag.getAttributes();
158 if (atts != null && atts.length() > 0) {
160 others.append(numOthers).append(
") ");
161 others.append(tag.getName()).append(
":");
162 others.append(tag.getTagContent()).append(
"\n");
167 stringBuilder.append(text).append(
"\n\n");
168 stringBuilder.append(
"----------NONVISIBLE TEXT----------\n\n");
169 if (numScripts > 0) {
170 stringBuilder.append(
"---Scripts---\n");
171 stringBuilder.append(scripts).append(
"\n");
174 stringBuilder.append(
"---Links---\n");
175 stringBuilder.append(links).append(
"\n");
178 stringBuilder.append(
"---Images---\n");
179 stringBuilder.append(images).append(
"\n");
181 if (numComments > 0) {
182 stringBuilder.append(
"---Comments---\n");
183 stringBuilder.append(comments).append(
"\n");
186 stringBuilder.append(
"---Others---\n");
187 stringBuilder.append(others).append(
"\n");
190 return new StringReader(stringBuilder.toString());
191 }
catch (IOException ex) {
192 logger.log(Level.WARNING,
"Error extracting HTML from content.", ex);
193 throw new ExtractionException(
"Error extracting HTML from content.", ex);