Autopsy  4.12.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
JerichoParserWrapper.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.io.StringReader;
25 import java.util.List;
26 import java.util.logging.Level;
28 import net.htmlparser.jericho.Attributes;
29 import net.htmlparser.jericho.Renderer;
30 import net.htmlparser.jericho.Source;
31 import net.htmlparser.jericho.StartTag;
32 import net.htmlparser.jericho.StartTagType;
33 
39 class JerichoParserWrapper {
40 
41  private static final Logger logger = Logger.getLogger(JerichoParserWrapper.class.getName());
42  private InputStream in;
43  private StringBuilder out;
44  private Reader reader;
45 
46  JerichoParserWrapper(InputStream in) {
47  this.in = in;
48  }
49 
56  public Reader getReader() {
57  return reader;
58  }
59 
64  public void parse() {
65  out = new StringBuilder();
66 
67  try {
68  Source source = new Source(in);
69  source.fullSequentialParse();
70 
71  String text;
72  StringBuilder scripts = new StringBuilder();
73  StringBuilder links = new StringBuilder();
74  StringBuilder images = new StringBuilder();
75  StringBuilder comments = new StringBuilder();
76  StringBuilder others = new StringBuilder();
77  int numScripts = 1;
78  int numLinks = 1;
79  int numImages = 1;
80  int numComments = 1;
81  int numOthers = 1;
82 
83  text = renderHTMLAsPlainText(source);
84 
85  // Get all the tags in the source
86  List<StartTag> tags = source.getAllStartTags();
87  for (StartTag tag : tags) {
88  if (tag.getName().equals("script")) { //NON-NLS
89  // If the <script> tag has attributes
90  scripts.append(numScripts).append(") ");
91  if (tag.getTagContent().length() > 0) {
92  scripts.append(tag.getTagContent()).append(" ");
93  }
94  // Get whats between the <script> .. </script> tags
95  scripts.append(tag.getElement().getContent()).append("\n");
96  numScripts++;
97  } else if (tag.getName().equals("a")) { //NON-NLS
98  links.append(numLinks).append(") ");
99  links.append(tag.getTagContent()).append("\n");
100  numLinks++;
101  } else if (tag.getName().equals("img")) { //NON-NLS
102  images.append(numImages).append(") ");
103  images.append(tag.getTagContent()).append("\n");
104  numImages++;
105  } else if (tag.getTagType().equals(StartTagType.COMMENT)) {
106  comments.append(numComments).append(") ");
107  comments.append(tag.getTagContent()).append("\n");
108  numComments++;
109  } else {
110  // Make sure it has an attribute
111  Attributes atts = tag.getAttributes();
112  if (atts != null && atts.length() > 0) {
113  others.append(numOthers).append(") ");
114  others.append(tag.getName()).append(":");
115  others.append(tag.getTagContent()).append("\n");
116  numOthers++;
117  }
118  }
119  }
120 
121  out.append(text).append("\n\n");
122 
123  out.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS
124  if (numScripts > 1) {
125  out.append("---Scripts---\n"); //NON-NLS
126  out.append(scripts.toString()).append("\n");
127  }
128  if (numLinks > 1) {
129  out.append("---Links---\n"); //NON-NLS
130  out.append(links.toString()).append("\n");
131  }
132  if (numImages > 1) {
133  out.append("---Images---\n"); //NON-NLS
134  out.append(images.toString()).append("\n");
135  }
136  if (numComments > 1) {
137  out.append("---Comments---\n"); //NON-NLS
138  out.append(comments.toString()).append("\n");
139  }
140  if (numOthers > 1) {
141  out.append("---Others---\n"); //NON-NLS
142  out.append(others.toString()).append("\n");
143  }
144  // All done, now make it a reader
145  reader = new StringReader(out.toString());
146  } catch (IOException ex) {
147  logger.log(Level.WARNING, "Unable to parse the HTML file", ex); //NON-NLS
148  }
149  }
150 
151  // Extract text from the source, nicely formatted with whitespace and
152  // newlines where appropriate.
153  private String renderHTMLAsPlainText(Source source) {
154  Renderer renderer = source.getRenderer();
155  renderer.setNewLine("\n");
156  renderer.setIncludeHyperlinkURLs(false);
157  renderer.setDecorateFontStyles(false);
158  renderer.setIncludeAlternateText(false);
159  return renderer.toString();
160  }
161 }
synchronized static Logger getLogger(String name)
Definition: Logger.java:124

Copyright © 2012-2018 Basis Technology. Generated on: Wed Sep 18 2019
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.