19 package org.sleuthkit.autopsy.textextractors;
21 import java.io.IOException;
22 import java.io.Reader;
23 import java.io.StringReader;
24 import java.util.Arrays;
25 import java.util.HashMap;
26 import java.util.List;
28 import java.util.logging.Level;
29 import net.htmlparser.jericho.Attributes;
30 import net.htmlparser.jericho.Config;
31 import net.htmlparser.jericho.LoggerProvider;
32 import net.htmlparser.jericho.Renderer;
33 import net.htmlparser.jericho.Source;
34 import net.htmlparser.jericho.StartTag;
35 import net.htmlparser.jericho.StartTagType;
43 final class HtmlTextExtractor
implements TextExtractor {
// Class-wide logger, named after this class.
45 static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
// Upper bound that isSupported() compares against file.getSize(); the constructor sets it to 50_000_000.
// NOTE(review): per-instance final field with a constant-style name; units are presumably bytes — confirm.
46 private final int MAX_SIZE;
// The file this extractor works on; getMetadata() and getReader() wrap it in a ReadContentInputStream.
47 private final AbstractFile file;
// MIME types this extractor accepts; isSupported() checks the file's MIME type for membership in this list.
// NOTE(review): only the first two entries are visible in this listing — the remaining entries and the
// closing of the Arrays.asList(...) call are elided.
49 static final List<String> WEB_MIME_TYPES = Arrays.asList(
50 "application/javascript",
51 "application/xhtml+xml",
// Installs Jericho's DISABLED logger provider as the library-wide logger provider.
// NOTE(review): presumably this sits inside a static initializer whose braces are elided here — confirm.
60 Config.LoggerProvider = LoggerProvider.DISABLED;
/**
 * Creates an extractor for the given file and fixes the size limit at 50,000,000.
 *
 * @param file the file to extract from. NOTE(review): the assignment to the
 *             {@code file} field is not visible in this listing (presumably on
 *             an elided line) — confirm.
 */
67 public HtmlTextExtractor(AbstractFile file) {
// Size ceiling checked by isSupported(); presumably bytes, since it is compared with file.getSize() — TODO confirm units.
69 MAX_SIZE = 50_000_000;
/**
 * Reports whether this extractor can handle the file.
 *
 * @return {@code true} only when the file has a non-null MIME type, that type
 *         is listed in {@code WEB_MIME_TYPES}, and the file's size does not
 *         exceed {@code MAX_SIZE}
 */
82 public boolean isSupported() {
// The null check comes first, so the membership test is only reached with a non-null MIME type.
83 return file.getMIMEType() != null
84 && WEB_MIME_TYPES.contains(file.getMIMEType())
85 && file.getSize() <= MAX_SIZE;
/**
 * Collects a text summary of the start tags found in the file's HTML.
 *
 * The content is parsed with Jericho and every start tag is sorted into one of
 * five numbered listings, stored in the map under the keys "Scripts", "Links",
 * "Images", "Comments" and "Others".
 *
 * NOTE(review): this listing elides several lines of the method — the
 * declarations and increments of numScripts, numLinks, numImages, numComments
 * and numOthers, the opening of the try block, some closing braces and the
 * return statement are not visible. Comments below describe only what is shown.
 *
 * @return presumably {@code metadataMap} (the return statement is not visible
 *         here — confirm)
 */
95 public Map<String, String> getMetadata() {
96 Map<String, String> metadataMap =
new HashMap<>();
// NOTE(review): no close() of this stream appears in the visible lines — check whether it is released.
98 ReadContentInputStream stream =
new ReadContentInputStream(file);
// One accumulator per category; each starts with a newline so the first numbered entry begins on its own line.
99 StringBuilder scripts =
new StringBuilder(
"\n");
100 StringBuilder links =
new StringBuilder(
"\n");
101 StringBuilder images =
new StringBuilder(
"\n");
102 StringBuilder comments =
new StringBuilder(
"\n");
103 StringBuilder others =
new StringBuilder(
"\n");
// Parse the whole document up front, then walk every start tag it contains.
110 Source source =
new Source(stream);
111 source.fullSequentialParse();
113 List<StartTag> tags = source.getAllStartTags();
114 for (StartTag tag : tags) {
// <script>: entry is "<n>) ", then the tag content plus a space when it is non-empty,
// then the content of the script element itself.
115 if (tag.getName().equals(
"script")) {
118 scripts.append(numScripts).append(
") ");
119 if (tag.getTagContent().length() > 0) {
120 scripts.append(tag.getTagContent()).append(
" ");
123 scripts.append(tag.getElement().getContent()).append(
"\n");
125 }
// <a>: entry is "<n>) " followed by the tag content.
else if (tag.getName().equals(
"a")) {
128 links.append(numLinks).append(
") ");
129 links.append(tag.getTagContent()).append(
"\n");
131 }
// <img>: entry is "<n>) " followed by the tag content.
else if (tag.getName().equals(
"img")) {
134 images.append(numImages).append(
") ");
135 images.append(tag.getTagContent()).append(
"\n");
137 }
// Comment tags are recognised by tag type rather than by name.
else if (tag.getTagType().equals(StartTagType.COMMENT)) {
139 comments.append(numComments).append(
") ");
140 comments.append(tag.getTagContent()).append(
"\n");
// NOTE(review): presumably the start of a final else branch (its opening line is elided).
// A tag is listed under "Others" only when it carries at least one attribute,
// formatted as "<n>) <name>:<tag content>".
144 Attributes atts = tag.getAttributes();
145 if (atts != null && atts.length() > 0) {
147 others.append(numOthers).append(
") ");
148 others.append(tag.getName()).append(
":");
149 others.append(tag.getTagContent()).append(
"\n");
// Publish the listings. "Scripts" and "Comments" are visibly guarded by a count > 0 check.
// NOTE(review): the guards around "Links", "Images" and "Others" are not visible here —
// presumably analogous count checks on elided lines; confirm.
155 if (numScripts > 0) {
156 metadataMap.put(
"Scripts", scripts.toString());
159 metadataMap.put(
"Links", links.toString());
162 metadataMap.put(
"Images", images.toString());
164 if (numComments > 0) {
165 metadataMap.put(
"Comments", comments.toString());
168 metadataMap.put(
"Others", others.toString());
170 }
// An IOException is logged at WARNING level; no rethrow appears in the visible lines.
catch (IOException ex) {
171 logger.log(Level.WARNING,
"Error extracting HTML metadata from content.", ex);
/**
 * Renders the file's HTML as plain text and returns a Reader over the result.
 *
 * The content is parsed with Jericho and rendered with "\n" as the newline,
 * with hyperlink URLs, font-style decoration and alternate text all switched
 * off, and with the maximum line length set to 0.
 *
 * NOTE(review): the opening of the try block and the closing braces of this
 * method are not visible in this listing.
 *
 * @return a StringReader over the rendered text
 * @throws InitReaderException when the visible catch block is reached; the
 *         caught Throwable is attached as the cause
 */
187 public Reader getReader() throws InitReaderException {
// NOTE(review): no close() of this stream appears in the visible lines — check whether it is released.
190 ReadContentInputStream stream =
new ReadContentInputStream(file);
194 Source source =
new Source(stream);
195 source.fullSequentialParse();
196 Renderer renderer = source.getRenderer();
197 renderer.setNewLine(
"\n");
198 renderer.setIncludeHyperlinkURLs(
false);
199 renderer.setDecorateFontStyles(
false);
200 renderer.setIncludeAlternateText(
false);
// NOTE(review): 0 presumably turns off line wrapping — confirm against the Jericho Renderer documentation.
201 renderer.setMaxLineLength(0);
// The rendered text is materialised as one String before being wrapped in a Reader.
202 return new StringReader(renderer.toString());
203 }
// Catches Throwable, so Errors as well as Exceptions are logged at WARNING level and wrapped.
// NOTE(review): the breadth looks deliberate (parser failures beyond Exception?) — confirm before narrowing.
catch (Throwable ex) {
206 logger.log(Level.WARNING,
"Error extracting HTML from content.", ex);
207 throw new InitReaderException(
"Error extracting HTML from content.", ex);