Autopsy  3.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
JerichoParserWrapper.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.io.StringReader;
25 import java.util.List;
26 import java.util.logging.Level;
28 import net.htmlparser.jericho.Attributes;
29 import net.htmlparser.jericho.Renderer;
30 import net.htmlparser.jericho.Source;
31 import net.htmlparser.jericho.StartTag;
32 import net.htmlparser.jericho.StartTagType;
33 
39 class JerichoParserWrapper {
40  private static final Logger logger = Logger.getLogger(JerichoParserWrapper.class.getName());
41  private InputStream in;
42  private StringBuilder out;
43  private Reader reader;
44 
45  JerichoParserWrapper(InputStream in) {
46  this.in = in;
47  }
48 
54  public Reader getReader() {
55  return reader;
56  }
57 
62  public void parse() {
63  out = new StringBuilder();
64 
65  try {
66  Source source = new Source(in);
67  source.fullSequentialParse();
68 
69  String text;
70  StringBuilder scripts = new StringBuilder();
71  StringBuilder links = new StringBuilder();
72  StringBuilder images = new StringBuilder();
73  StringBuilder comments = new StringBuilder();
74  StringBuilder others = new StringBuilder();
75  int numScripts = 1;
76  int numLinks = 1;
77  int numImages = 1;
78  int numComments = 1;
79  int numOthers = 1;
80 
81  text = renderHTMLAsPlainText(source);
82 
83  // Get all the tags in the source
84  List<StartTag> tags = source.getAllStartTags();
85  for(StartTag tag : tags) {
86  if(tag.getName().equals("script")) { //NON-NLS
87  // If the <script> tag has attributes
88  scripts.append(numScripts).append(") ");
89  if(tag.getTagContent().length()>0) {
90  scripts.append(tag.getTagContent()).append(" ");
91  }
92  // Get whats between the <script> .. </script> tags
93  scripts.append(tag.getElement().getContent()).append("\n");
94  numScripts++;
95  } else if(tag.getName().equals("a")) { //NON-NLS
96  links.append(numLinks).append(") ");
97  links.append(tag.getTagContent()).append("\n");
98  numLinks++;
99  } else if(tag.getName().equals("img")) { //NON-NLS
100  images.append(numImages).append(") ");
101  images.append(tag.getTagContent()).append("\n");
102  numImages++;
103  } else if(tag.getTagType().equals(StartTagType.COMMENT)) {
104  comments.append(numComments).append(") ");
105  comments.append(tag.getTagContent()).append("\n");
106  numComments++;
107  } else {
108  // Make sure it has an attribute
109  Attributes atts = tag.getAttributes();
110  if (atts!=null && atts.length()>0) {
111  others.append(numOthers).append(") ");
112  others.append(tag.getName()).append(":");
113  others.append(tag.getTagContent()).append("\n");
114  numOthers++;
115  }
116  }
117  }
118 
119  out.append(text).append("\n\n");
120 
121  out.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS
122  if(numScripts>1) {
123  out.append("---Scripts---\n"); //NON-NLS
124  out.append(scripts.toString()).append("\n");
125  } if(numLinks>1) {
126  out.append("---Links---\n"); //NON-NLS
127  out.append(links.toString()).append("\n");
128  } if(numImages>1) {
129  out.append("---Images---\n"); //NON-NLS
130  out.append(images.toString()).append("\n");
131  } if(numComments>1) {
132  out.append("---Comments---\n"); //NON-NLS
133  out.append(comments.toString()).append("\n");
134  } if(numOthers>1) {
135  out.append("---Others---\n"); //NON-NLS
136  out.append(others.toString()).append("\n");
137  }
138  // All done, now make it a reader
139  reader = new StringReader(out.toString());
140  } catch (IOException ex) {
141  logger.log(Level.WARNING, "Unable to parse the HTML file", ex); //NON-NLS
142  }
143  }
144 
145  // Extract text from the source, nicely formatted with whitespace and
146  // newlines where appropriate.
147  private String renderHTMLAsPlainText(Source source) {
148  Renderer renderer = source.getRenderer();
149  renderer.setNewLine("\n");
150  renderer.setIncludeHyperlinkURLs(false);
151  renderer.setDecorateFontStyles(false);
152  renderer.setIncludeAlternateText(false);
153  return renderer.toString();
154  }
155 }
static Logger getLogger(String name)
Definition: Logger.java:131

Copyright © 2012-2015 Basis Technology. Generated on: Mon Oct 19 2015
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.