19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.io.StringReader;
25 import java.util.List;
26 import java.util.logging.Level;
28 import net.htmlparser.jericho.Attributes;
29 import net.htmlparser.jericho.Renderer;
30 import net.htmlparser.jericho.Source;
31 import net.htmlparser.jericho.StartTag;
32 import net.htmlparser.jericho.StartTagType;
/**
 * Wrapper around the Jericho HTML parser. It reads HTML from an InputStream,
 * builds one text document consisting of the rendered plain text followed by
 * selected tag contents (scripts, links, images, comments and other tags that
 * carry attributes), and exposes that document through a Reader.
 *
 * NOTE(review): this listing has gaps (the embedded original line numbers
 * skip), so some declarations, method headers and closing braces are not
 * visible here. Comments below describe only the statements that are shown.
 */
39 class JerichoParserWrapper {
// Logger used below to report a failed parse at WARNING level.
40 private static final Logger logger = Logger.
getLogger(JerichoParserWrapper.class.getName());
// HTML input; handed to the Jericho Source when parsing.
41 private InputStream in;
// Accumulates the complete text output (rendered text plus the tag sections).
42 private StringBuilder out;
// Reader over the contents of 'out'; assigned once the output has been assembled.
43 private Reader reader;
// Constructor taking the HTML input stream. The body is not visible in this
// listing; presumably it stores the argument in the 'in' field -- TODO confirm.
45 JerichoParserWrapper(InputStream in) {
// Accessor for the Reader. The body is not visible in this listing;
// presumably it returns the 'reader' field -- TODO confirm.
54 public Reader getReader() {
// NOTE(review): the header of the method that owns the statements below, and
// the opening of the try block matched by the catch further down, are not
// visible in this listing.
// Start from an empty output buffer.
63 out =
new StringBuilder();
// Build a Jericho Source from the input stream and parse all of its tags.
66 Source source =
new Source(in);
67 source.fullSequentialParse();
// One buffer per category of tag data collected in the loop below.
70 StringBuilder scripts =
new StringBuilder();
71 StringBuilder links =
new StringBuilder();
72 StringBuilder images =
new StringBuilder();
73 StringBuilder comments =
new StringBuilder();
74 StringBuilder others =
new StringBuilder();
// Plain-text rendering of the document (the declaration of 'text' is not
// visible in this listing).
81 text = renderHTMLAsPlainText(source);
// Walk every start tag and file its data under the matching category.
// NOTE(review): the declarations and updates of numScripts, numLinks,
// numImages, numComments and numOthers are not visible in this listing.
84 List<StartTag> tags = source.getAllStartTags();
85 for(StartTag tag : tags) {
86 if(tag.getName().equals(
"script")) {
// Script entry: "<index>) ", then the tag content when it is non-empty,
// then the content of the script element, then a newline.
88 scripts.append(numScripts).append(
") ");
89 if(tag.getTagContent().length()>0) {
90 scripts.append(tag.getTagContent()).append(
" ");
93 scripts.append(tag.getElement().getContent()).append(
"\n");
95 }
else if(tag.getName().equals(
"a")) {
// Link entry: "<index>) " followed by the tag content.
96 links.append(numLinks).append(
") ");
97 links.append(tag.getTagContent()).append(
"\n");
99 }
else if(tag.getName().equals(
"img")) {
// Image entry: "<index>) " followed by the tag content.
100 images.append(numImages).append(
") ");
101 images.append(tag.getTagContent()).append(
"\n");
103 }
else if(tag.getTagType().equals(StartTagType.COMMENT)) {
// Comment entry: "<index>) " followed by the tag content.
104 comments.append(numComments).append(
") ");
105 comments.append(tag.getTagContent()).append(
"\n");
// Presumably the fall-through branch for every other tag (its header is not
// visible in this listing): a tag is recorded as "<index>) <name>:<content>"
// only when it has at least one attribute.
109 Attributes atts = tag.getAttributes();
110 if (atts!=null && atts.length()>0) {
111 others.append(numOthers).append(
") ");
112 others.append(tag.getName()).append(
":");
113 others.append(tag.getTagContent()).append(
"\n");
// Assemble the output: rendered text first, then a banner, then the sections.
119 out.append(text).append(
"\n\n");
121 out.append(
"----------NONVISIBLE TEXT----------\n\n");
// NOTE(review): the conditions guarding the Scripts, Links, Images and Others
// sections are not visible in this listing; only the Comments guard is shown.
123 out.append(
"---Scripts---\n");
124 out.append(scripts.toString()).append(
"\n");
126 out.append(
"---Links---\n");
127 out.append(links.toString()).append(
"\n");
129 out.append(
"---Images---\n");
130 out.append(images.toString()).append(
"\n");
131 }
if(numComments>1) {
132 out.append(
"---Comments---\n");
133 out.append(comments.toString()).append(
"\n");
135 out.append(
"---Others---\n");
136 out.append(others.toString()).append(
"\n");
// Expose the assembled text through the 'reader' field.
139 reader =
new StringReader(out.toString());
140 }
catch (IOException ex) {
// The visible part of this handler only logs the failure; it does not assign 'reader'.
141 logger.log(Level.WARNING,
"Unable to parse the HTML file", ex);
/**
 * Renders the given Jericho Source as plain text.
 *
 * The renderer is configured to use "\n" as its newline and to leave out
 * hyperlink URLs, font-style decoration and image alternate text.
 *
 * @param source the Jericho Source to render
 * @return the rendered plain text
 */
147 private String renderHTMLAsPlainText(Source source) {
148 Renderer renderer = source.getRenderer();
149 renderer.setNewLine(
"\n");
150 renderer.setIncludeHyperlinkURLs(
false);
151 renderer.setDecorateFontStyles(
false);
152 renderer.setIncludeAlternateText(
false);
153 return renderer.toString();
static Logger getLogger(String name)