Autopsy  4.6.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
HtmlTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.Reader;
23 import java.io.StringReader;
24 import java.util.Arrays;
25 import java.util.List;
26 import java.util.logging.Level;
27 import net.htmlparser.jericho.Attributes;
28 import net.htmlparser.jericho.Config;
29 import net.htmlparser.jericho.LoggerProvider;
30 import net.htmlparser.jericho.Renderer;
31 import net.htmlparser.jericho.Source;
32 import net.htmlparser.jericho.StartTag;
33 import net.htmlparser.jericho.StartTagType;
35 import org.sleuthkit.datamodel.Content;
36 import org.sleuthkit.datamodel.ReadContentInputStream;
37 
41 class HtmlTextExtractor extends ContentTextExtractor {
42 
43  static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
44  private static final int MAX_SIZE = 50_000_000; //50MB
45 
46  static final List<String> WEB_MIME_TYPES = Arrays.asList(
47  "application/javascript", //NON-NLS
48  "application/xhtml+xml", //NON-NLS
49  "application/json", //NON-NLS
50  "text/css", //NON-NLS
51  "text/html", //NON-NLS NON-NLS
52  "text/javascript" //NON-NLS
53  );
54 
55  static {
56  // Disable Jericho HTML Parser log messages.
57  Config.LoggerProvider = LoggerProvider.DISABLED;
58  }
59 
60  @Override
61  boolean isContentTypeSpecific() {
62  return true;
63  }
64 
65  @Override
66  boolean isSupported(Content content, String detectedFormat) {
67  return detectedFormat != null
68  && WEB_MIME_TYPES.contains(detectedFormat)
69  && content.getSize() <= MAX_SIZE;
70  }
71 
72  @Override
73  public Reader getReader(Content content) throws TextExtractorException {
74  ReadContentInputStream stream = new ReadContentInputStream(content);
75 
76  //Parse the stream with Jericho and put the results in a Reader
77  try {
78  StringBuilder scripts = new StringBuilder();
79  StringBuilder links = new StringBuilder();
80  StringBuilder images = new StringBuilder();
81  StringBuilder comments = new StringBuilder();
82  StringBuilder others = new StringBuilder();
83  int numScripts = 0;
84  int numLinks = 0;
85  int numImages = 0;
86  int numComments = 0;
87  int numOthers = 0;
88 
89  Source source = new Source(stream);
90  source.fullSequentialParse();
91  Renderer renderer = source.getRenderer();
92  renderer.setNewLine("\n");
93  renderer.setIncludeHyperlinkURLs(false);
94  renderer.setDecorateFontStyles(false);
95  renderer.setIncludeAlternateText(false);
96 
97  String text = renderer.toString();
98  // Get all the tags in the source
99  List<StartTag> tags = source.getAllStartTags();
100 
101  StringBuilder stringBuilder = new StringBuilder();
102  for (StartTag tag : tags) {
103  if (tag.getName().equals("script")) { //NON-NLS
104  // If the <script> tag has attributes
105  numScripts++;
106  scripts.append(numScripts).append(") ");
107  if (tag.getTagContent().length() > 0) {
108  scripts.append(tag.getTagContent()).append(" ");
109  }
110  // Get whats between the <script> .. </script> tags
111  scripts.append(tag.getElement().getContent()).append("\n");
112 
113  } else if (tag.getName().equals("a")) {
114  //NON-NLS
115  numLinks++;
116  links.append(numLinks).append(") ");
117  links.append(tag.getTagContent()).append("\n");
118 
119  } else if (tag.getName().equals("img")) {
120  //NON-NLS
121  numImages++;
122  images.append(numImages).append(") ");
123  images.append(tag.getTagContent()).append("\n");
124 
125  } else if (tag.getTagType().equals(StartTagType.COMMENT)) {
126  numComments++;
127  comments.append(numComments).append(") ");
128  comments.append(tag.getTagContent()).append("\n");
129 
130  } else {
131  // Make sure it has an attribute
132  Attributes atts = tag.getAttributes();
133  if (atts != null && atts.length() > 0) {
134  numOthers++;
135  others.append(numOthers).append(") ");
136  others.append(tag.getName()).append(":");
137  others.append(tag.getTagContent()).append("\n");
138 
139  }
140  }
141  }
142  stringBuilder.append(text).append("\n\n");
143  stringBuilder.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS
144  if (numScripts > 0) {
145  stringBuilder.append("---Scripts---\n"); //NON-NLS
146  stringBuilder.append(scripts).append("\n");
147  }
148  if (numLinks > 0) {
149  stringBuilder.append("---Links---\n"); //NON-NLS
150  stringBuilder.append(links).append("\n");
151  }
152  if (numImages > 0) {
153  stringBuilder.append("---Images---\n"); //NON-NLS
154  stringBuilder.append(images).append("\n");
155  }
156  if (numComments > 0) {
157  stringBuilder.append("---Comments---\n"); //NON-NLS
158  stringBuilder.append(comments).append("\n");
159  }
160  if (numOthers > 0) {
161  stringBuilder.append("---Others---\n"); //NON-NLS
162  stringBuilder.append(others).append("\n");
163  }
164  // All done, now make it a reader
165  return new StringReader(stringBuilder.toString());
166  } catch (IOException ex) {
167  throw new TextExtractorException("Error extracting HTML from content.", ex);
168  }
169  }
170 
171  @Override
172  public boolean isDisabled() {
173  return false;
174  }
175 
176  @Override
177  public void logWarning(final String msg, Exception ex) {
178  logger.log(Level.WARNING, msg, ex); //NON-NLS }
179  }
180 }

Copyright © 2012-2016 Basis Technology. Generated on: Mon May 7 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.