19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.Reader;
23 import java.io.StringReader;
24 import java.util.Arrays;
25 import java.util.List;
26 import java.util.logging.Level;
27 import net.htmlparser.jericho.Attributes;
28 import net.htmlparser.jericho.Config;
29 import net.htmlparser.jericho.LoggerProvider;
30 import net.htmlparser.jericho.Renderer;
31 import net.htmlparser.jericho.Source;
32 import net.htmlparser.jericho.StartTag;
33 import net.htmlparser.jericho.StartTagType;
/**
 * Text extractor for web content in the keyword search module. Parsing is done
 * with the Jericho HTML parser (net.htmlparser.jericho).
 */
41 class HtmlTextExtractor
extends ContentTextExtractor {
// Logger named after this class; logWarning() writes through it.
43 static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
// Upper bound that isSupported() compares against Content.getSize()
// (presumably a byte count -- TODO confirm the unit of getSize()).
44 private static final int MAX_SIZE = 50_000_000;
// MIME types that isSupported() accepts as web content.
46 static final List<String> WEB_MIME_TYPES = Arrays.asList(
47 "application/javascript",
48 "application/xhtml+xml",
// Install Jericho's DISABLED logger provider so the parser library produces
// no log output of its own.
57 Config.LoggerProvider = LoggerProvider.DISABLED;
61 boolean isContentTypeSpecific() {
/**
 * Decides whether this extractor accepts the given content.
 *
 * @param content        the content item; only its size is examined here
 * @param detectedFormat the detected format (MIME type) of the content; may
 *                       be null, in which case the content is rejected
 *
 * @return true only when detectedFormat is non-null, is listed in
 *         WEB_MIME_TYPES, and the content size does not exceed MAX_SIZE
 */
66 boolean isSupported(Content content, String detectedFormat) {
67 return detectedFormat != null
68 && WEB_MIME_TYPES.contains(detectedFormat)
// Content larger than MAX_SIZE is rejected.
69 && content.getSize() <= MAX_SIZE;
/**
 * Builds a Reader over the text extracted from the given content.
 *
 * The content is parsed with Jericho. The output starts with the text produced
 * by Jericho's Renderer, followed by a "NONVISIBLE TEXT" part that lists
 * material collected from the document's start tags: scripts, links, images,
 * comments, and other tags that carry attributes.
 *
 * @param content the content to read and parse
 *
 * @return a StringReader over the assembled text
 *
 * @throws TextExtractorException if an IOException is caught during
 *                                extraction; the IOException is kept as the
 *                                cause
 */
73 public Reader getReader(Content content)
throws TextExtractorException {
// Stream view of the content; it is handed to the Jericho Source below.
74 ReadContentInputStream stream =
new ReadContentInputStream(content);
// One buffer per category of non-rendered material.
78 StringBuilder scripts =
new StringBuilder();
79 StringBuilder links =
new StringBuilder();
80 StringBuilder images =
new StringBuilder();
81 StringBuilder comments =
new StringBuilder();
82 StringBuilder others =
new StringBuilder();
// Parse the whole document up front in a single sequential pass.
89 Source source =
new Source(stream);
90 source.fullSequentialParse();
// Renderer settings: "\n" line breaks, and no hyperlink URLs, font-style
// decoration or image alternate text in the rendered output.
91 Renderer renderer = source.getRenderer();
92 renderer.setNewLine(
"\n");
93 renderer.setIncludeHyperlinkURLs(
false);
94 renderer.setDecorateFontStyles(
false);
95 renderer.setIncludeAlternateText(
false);
// The rendered text of the document.
97 String text = renderer.toString();
// Every start tag in the document; comments are recognised below through
// StartTagType.COMMENT.
99 List<StartTag> tags = source.getAllStartTags();
// Accumulates the final output returned to the caller.
101 StringBuilder stringBuilder =
new StringBuilder();
// Sort each tag into one of the category buffers.
102 for (StartTag tag : tags) {
103 if (tag.getName().equals(
"script")) {
// Each entry is prefixed with the current counter value and ") ".
106 scripts.append(numScripts).append(
") ");
// Content of the <script ...> start tag itself, when there is any.
107 if (tag.getTagContent().length() > 0) {
108 scripts.append(tag.getTagContent()).append(
" ");
// Content of the element that this start tag opens.
111 scripts.append(tag.getElement().getContent()).append(
"\n");
113 }
else if (tag.getName().equals(
"a")) {
// Links: counter prefix followed by the content of the start tag.
116 links.append(numLinks).append(
") ");
117 links.append(tag.getTagContent()).append(
"\n");
119 }
else if (tag.getName().equals(
"img")) {
// Images: counter prefix followed by the content of the start tag.
122 images.append(numImages).append(
") ");
123 images.append(tag.getTagContent()).append(
"\n");
125 }
else if (tag.getTagType().equals(StartTagType.COMMENT)) {
// Comments: counter prefix followed by the tag content.
127 comments.append(numComments).append(
") ");
128 comments.append(tag.getTagContent()).append(
"\n");
// NOTE(review): presumably the fallback for tags matched by none of the
// branches above -- confirm. Only tags that have at least one attribute are
// recorded, as "<counter>) <name>:<tag content>".
132 Attributes atts = tag.getAttributes();
133 if (atts != null && atts.length() > 0) {
135 others.append(numOthers).append(
") ");
136 others.append(tag.getName()).append(
":");
137 others.append(tag.getTagContent()).append(
"\n");
// Output layout: rendered text, a blank line, the marker line, then the
// per-category sections.
142 stringBuilder.append(text).append(
"\n\n");
143 stringBuilder.append(
"----------NONVISIBLE TEXT----------\n\n");
// The Scripts section is written only when numScripts is positive.
144 if (numScripts > 0) {
145 stringBuilder.append(
"---Scripts---\n");
146 stringBuilder.append(scripts).append(
"\n");
149 stringBuilder.append(
"---Links---\n");
150 stringBuilder.append(links).append(
"\n");
153 stringBuilder.append(
"---Images---\n");
154 stringBuilder.append(images).append(
"\n");
// The Comments section is written only when numComments is positive.
156 if (numComments > 0) {
157 stringBuilder.append(
"---Comments---\n");
158 stringBuilder.append(comments).append(
"\n");
161 stringBuilder.append(
"---Others---\n");
162 stringBuilder.append(others).append(
"\n");
// Hand the assembled text back as an in-memory reader.
165 return new StringReader(stringBuilder.toString());
166 }
catch (IOException ex) {
// Wrap the I/O failure in the extractor's own exception type, keeping the cause.
167 throw new TextExtractorException(
"Error extracting HTML from content.", ex);
172 public boolean isDisabled() {
/**
 * Logs the given message and exception at WARNING level through this class's
 * logger.
 *
 * @param msg the message to log
 * @param ex  the exception to log together with the message
 */
177 public void logWarning(
final String msg, Exception ex) {
178 logger.log(Level.WARNING, msg, ex);