19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.Reader;
23 import java.io.StringReader;
24 import java.util.Arrays;
25 import java.util.List;
26 import java.util.logging.Level;
27 import net.htmlparser.jericho.Attributes;
28 import net.htmlparser.jericho.Renderer;
29 import net.htmlparser.jericho.Source;
30 import net.htmlparser.jericho.StartTag;
31 import net.htmlparser.jericho.StartTagType;
39 class HtmlTextExtractor
extends FileTextExtractor {
41 static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
42 private static final int MAX_SIZE = 50_000_000;
44 static final List<String> WEB_MIME_TYPES = Arrays.asList(
45 "application/javascript",
46 "application/xhtml+xml",
54 boolean isContentTypeSpecific() {
59 boolean isSupported(AbstractFile file, String detectedFormat) {
60 return detectedFormat != null
61 && WEB_MIME_TYPES.contains(detectedFormat)
62 && file.getSize() <= MAX_SIZE;
/**
 * Builds a Reader over the text extracted from the given file's HTML content.
 *
 * The content is parsed with Jericho. The output starts with the renderer's
 * text, followed by a "NONVISIBLE TEXT" marker and numbered listings of
 * script, link, image, comment and other attribute-bearing tags.
 *
 * NOTE(review): the counters numScripts, numLinks, numImages, numComments and
 * numOthers, the opening of the try block, and some branch headers are not
 * visible in this view; confirm them against the complete file.
 *
 * @param sourceFile the file whose content is read through a ReadContentInputStream
 * @return a StringReader over the assembled text
 * @throws TextExtractorException if an IOException occurs during extraction
 */
66 public Reader getReader(AbstractFile sourceFile)
throws TextExtractorException {
67 ReadContentInputStream stream =
new ReadContentInputStream(sourceFile);
// One buffer per category of markup that is listed after the rendered text.
71 StringBuilder scripts =
new StringBuilder();
72 StringBuilder links =
new StringBuilder();
73 StringBuilder images =
new StringBuilder();
74 StringBuilder comments =
new StringBuilder();
75 StringBuilder others =
new StringBuilder();
// Run Jericho's full sequential parse before rendering or querying tags.
82 Source source =
new Source(stream);
83 source.fullSequentialParse();
// Render with "\n" line breaks; hyperlink URLs, font-style decoration and
// alternate text are all switched off.
84 Renderer renderer = source.getRenderer();
85 renderer.setNewLine(
"\n");
86 renderer.setIncludeHyperlinkURLs(
false);
87 renderer.setDecorateFontStyles(
false);
88 renderer.setIncludeAlternateText(
false);
90 String text = renderer.toString();
// Walk every start tag and file it into the matching category buffer.
92 List<StartTag> tags = source.getAllStartTags();
94 StringBuilder stringBuilder =
new StringBuilder();
95 for (StartTag tag : tags) {
// script tags: entry number, the tag content when non-empty, then the element content.
96 if (tag.getName().equals(
"script")) {
99 scripts.append(numScripts).append(
") ");
100 if (tag.getTagContent().length() > 0) {
101 scripts.append(tag.getTagContent()).append(
" ");
104 scripts.append(tag.getElement().getContent()).append(
"\n");
106 }
// a tags: entry number followed by the tag content.
else if (tag.getName().equals(
"a")) {
109 links.append(numLinks).append(
") ");
110 links.append(tag.getTagContent()).append(
"\n");
112 }
// img tags: entry number followed by the tag content.
else if (tag.getName().equals(
"img")) {
115 images.append(numImages).append(
") ");
116 images.append(tag.getTagContent()).append(
"\n");
118 }
// Tags whose type is COMMENT: entry number followed by the tag content.
else if (tag.getTagType().equals(StartTagType.COMMENT)) {
120 comments.append(numComments).append(
") ");
121 comments.append(tag.getTagContent()).append(
"\n");
// NOTE(review): the branch header for this case is not visible here; presumably
// the final else of the chain, to be confirmed. A tag is recorded as "name:content"
// only when it has at least one attribute.
125 Attributes atts = tag.getAttributes();
126 if (atts != null && atts.length() > 0) {
128 others.append(numOthers).append(
") ");
129 others.append(tag.getName()).append(
":");
130 others.append(tag.getTagContent()).append(
"\n");
// Assemble the output: rendered text first, then the marker that introduces
// the per-category listings.
135 stringBuilder.append(text).append(
"\n\n");
136 stringBuilder.append(
"----------NONVISIBLE TEXT----------\n\n");
// Scripts section, written when the script counter is positive.
137 if (numScripts > 0) {
138 stringBuilder.append(
"---Scripts---\n");
139 stringBuilder.append(scripts).append(
"\n");
// Links section. NOTE(review): its guard is not visible in this view.
142 stringBuilder.append(
"---Links---\n");
143 stringBuilder.append(links).append(
"\n");
// Images section. NOTE(review): its guard is not visible in this view.
146 stringBuilder.append(
"---Images---\n");
147 stringBuilder.append(images).append(
"\n");
// Comments section, written when the comment counter is positive.
149 if (numComments > 0) {
150 stringBuilder.append(
"---Comments---\n");
151 stringBuilder.append(comments).append(
"\n");
// Others section. NOTE(review): its guard is not visible in this view.
154 stringBuilder.append(
"---Others---\n");
155 stringBuilder.append(others).append(
"\n");
// Hand the assembled text back as a Reader.
158 return new StringReader(stringBuilder.toString());
159 }
// I/O failures are rethrown as TextExtractorException with the original cause attached.
catch (IOException ex) {
160 throw new TextExtractorException(
"Error extracting HTML from content.", ex);
165 public boolean isDisabled() {
169 public void logWarning(
final String msg, Exception ex) {
170 logger.log(Level.WARNING, msg, ex);