19 package org.sleuthkit.autopsy.keywordsearch;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.io.StringReader;
25 import java.util.List;
26 import java.util.logging.Level;
28 import net.htmlparser.jericho.Attributes;
29 import net.htmlparser.jericho.Renderer;
30 import net.htmlparser.jericho.Source;
31 import net.htmlparser.jericho.StartTag;
32 import net.htmlparser.jericho.StartTagType;
/**
 * Package-private helper that feeds an HTML input stream to the Jericho
 * parser and collects the extracted text into a string buffer, which is then
 * exposed through a {@link Reader}.
 *
 * NOTE(review): the listing numbers embedded at the start of each line skip
 * values, so some original lines (method bodies, closing braces, local
 * declarations) appear to be missing from this view.
 */
39 class JerichoParserWrapper {
// Class-wide logger named after this class.
// NOTE(review): no Logger import is visible among the imports above; confirm
// which Logger type this resolves to.
41 private static final Logger logger = Logger.
getLogger(JerichoParserWrapper.class.getName());
// HTML input; later handed to the Jericho Source constructor.
42 private InputStream in;
// Accumulates the extracted text; (re)created when parsing starts.
43 private StringBuilder out;
// Set to a StringReader over the contents of `out` once parsing has succeeded.
44 private Reader reader;
/**
 * Creates a wrapper for the given input stream.
 * NOTE(review): the constructor body is not visible here; presumably it
 * stores the argument in the `in` field — confirm.
 *
 * @param in stream containing the HTML to parse
 */
46 JerichoParserWrapper(InputStream in) {
/**
 * Accessor for the reader over the extracted text.
 * NOTE(review): the body of this method is not visible in this view;
 * presumably it returns the `reader` field — confirm.
 *
 * @return a Reader (declared return type)
 */
56 public Reader getReader() {
// NOTE(review): the listing numbers jump from 56 to 65 here, so the end of
// getReader() and the signature of the method that owns the statements below
// are not visible. The statements that follow parse the HTML and fill `out`.
// Start from an empty output buffer.
65 out =
new StringBuilder();
// Build a Jericho Source from the input stream and parse it fully up front.
68 Source source =
new Source(in);
69 source.fullSequentialParse();
// One buffer per category of tag-derived text that is reported separately
// from the rendered page text.
72 StringBuilder scripts =
new StringBuilder();
73 StringBuilder links =
new StringBuilder();
74 StringBuilder images =
new StringBuilder();
75 StringBuilder comments =
new StringBuilder();
76 StringBuilder others =
new StringBuilder();
// Plain-text rendering of the document (see renderHTMLAsPlainText).
// NOTE(review): the declarations of `text` and of the num* counters used
// below are not visible in this view.
83 text = renderHTMLAsPlainText(source);
// Walk every start tag in the document and sort it into one of the buffers.
86 List<StartTag> tags = source.getAllStartTags();
87 for (StartTag tag : tags) {
// <script>: numbered entry with the tag content (if non-empty) followed by
// the element's content.
88 if (tag.getName().equals(
"script")) {
90 scripts.append(numScripts).append(
") ");
// Only emit the tag content when there is some.
91 if (tag.getTagContent().length() > 0) {
92 scripts.append(tag.getTagContent()).append(
" ");
// The content of the script element itself, one entry per line.
95 scripts.append(tag.getElement().getContent()).append(
"\n");
97 }
// <a>: numbered entry holding the tag content.
else if (tag.getName().equals(
"a")) {
98 links.append(numLinks).append(
") ");
99 links.append(tag.getTagContent()).append(
"\n");
101 }
// <img>: numbered entry holding the tag content.
else if (tag.getName().equals(
"img")) {
102 images.append(numImages).append(
") ");
103 images.append(tag.getTagContent()).append(
"\n");
105 }
// Comment tags are recognised by tag type rather than by name.
else if (tag.getTagType().equals(StartTagType.COMMENT)) {
106 comments.append(numComments).append(
") ");
107 comments.append(tag.getTagContent()).append(
"\n");
// NOTE(review): presumably the fall-through branch for every other tag (the
// `else` line is not visible). Such tags are recorded only when they carry
// at least one attribute, as "<name>:<tag content>".
111 Attributes atts = tag.getAttributes();
112 if (atts != null && atts.length() > 0) {
113 others.append(numOthers).append(
") ");
114 others.append(tag.getName()).append(
":");
115 others.append(tag.getTagContent()).append(
"\n");
// Assemble the output: rendered text first, then a marker line, then the
// per-category sections.
121 out.append(text).append(
"\n\n");
123 out.append(
"----------NONVISIBLE TEXT----------\n\n");
// NOTE(review): the section is emitted only when the counter exceeds 1;
// whether that threshold is right depends on the counter's initial value and
// where it is incremented, neither of which is visible here — confirm.
124 if (numScripts > 1) {
125 out.append(
"---Scripts---\n");
126 out.append(scripts.toString()).append(
"\n");
// NOTE(review): the guard conditions for the Links, Images and Others
// sections are not visible in this view.
129 out.append(
"---Links---\n");
130 out.append(links.toString()).append(
"\n");
133 out.append(
"---Images---\n");
134 out.append(images.toString()).append(
"\n");
// Same threshold pattern as the Scripts section above.
136 if (numComments > 1) {
137 out.append(
"---Comments---\n");
138 out.append(comments.toString()).append(
"\n");
141 out.append(
"---Others---\n");
142 out.append(others.toString()).append(
"\n");
// Expose the assembled text through the `reader` field.
145 reader =
new StringReader(out.toString());
146 }
// An I/O failure is logged at WARNING level with the exception attached; no
// rethrow is visible in this view.
catch (IOException ex) {
147 logger.log(Level.WARNING,
"Unable to parse the HTML file", ex);
/**
 * Renders the parsed document as plain text using Jericho's Renderer.
 * The renderer is configured to use "\n" as the line separator and to leave
 * out hyperlink URLs, font-style decoration and image alternate text.
 *
 * @param source the Jericho Source to render
 * @return the renderer's string output
 */
153 private String renderHTMLAsPlainText(Source source) {
154 Renderer renderer = source.getRenderer();
// Use a bare line feed as the newline sequence in the rendered text.
155 renderer.setNewLine(
"\n");
// Do not include link targets in the rendered text.
156 renderer.setIncludeHyperlinkURLs(
false);
// Do not decorate font styles in the rendered text.
157 renderer.setDecorateFontStyles(
false);
// Do not include alternate text in the rendered text.
158 renderer.setIncludeAlternateText(
false);
159 return renderer.toString();
synchronized static Logger getLogger(String name)