Autopsy  4.17.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
HtmlTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2019 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.textextractors;
20 
21 import java.io.IOException;
22 import java.io.Reader;
23 import java.io.StringReader;
24 import java.util.Arrays;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.logging.Level;
29 import net.htmlparser.jericho.Attributes;
30 import net.htmlparser.jericho.Config;
31 import net.htmlparser.jericho.LoggerProvider;
32 import net.htmlparser.jericho.Renderer;
33 import net.htmlparser.jericho.Source;
34 import net.htmlparser.jericho.StartTag;
35 import net.htmlparser.jericho.StartTagType;
37 import org.sleuthkit.datamodel.AbstractFile;
38 import org.sleuthkit.datamodel.ReadContentInputStream;
39 
43 final class HtmlTextExtractor implements TextExtractor {
44 
45  static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
46  private final int MAX_SIZE;
47  private final AbstractFile file;
48 
49  static final List<String> WEB_MIME_TYPES = Arrays.asList(
50  "application/javascript", //NON-NLS
51  "application/xhtml+xml", //NON-NLS
52  "application/json", //NON-NLS
53  "text/css", //NON-NLS
54  "text/html", //NON-NLS NON-NLS
55  "text/javascript" //NON-NLS
56  );
57 
58  static {
59  // Disable Jericho HTML Parser log messages.
60  Config.LoggerProvider = LoggerProvider.DISABLED;
61  }
62 
67  public HtmlTextExtractor(AbstractFile file) {
68  //Set default to be 50 MB.
69  MAX_SIZE = 50_000_000;
70  this.file = file;
71  }
72 
81  @Override
82  public boolean isSupported() {
83  return file.getMIMEType() != null
84  && WEB_MIME_TYPES.contains(file.getMIMEType())
85  && file.getSize() <= MAX_SIZE;
86  }
87 
94  @Override
95  public Map<String, String> getMetadata() {
96  Map<String, String> metadataMap = new HashMap<>();
97  try {
98  ReadContentInputStream stream = new ReadContentInputStream(file);
99  StringBuilder scripts = new StringBuilder("\n");
100  StringBuilder links = new StringBuilder("\n");
101  StringBuilder images = new StringBuilder("\n");
102  StringBuilder comments = new StringBuilder("\n");
103  StringBuilder others = new StringBuilder("\n");
104  int numScripts = 0;
105  int numLinks = 0;
106  int numImages = 0;
107  int numComments = 0;
108  int numOthers = 0;
109 
110  Source source = new Source(stream);
111  source.fullSequentialParse();
112 
113  List<StartTag> tags = source.getAllStartTags();
114  for (StartTag tag : tags) {
115  if (tag.getName().equals("script")) { //NON-NLS
116  // If the <script> tag has attributes
117  numScripts++;
118  scripts.append(numScripts).append(") ");
119  if (tag.getTagContent().length() > 0) {
120  scripts.append(tag.getTagContent()).append(" ");
121  }
122  // Get whats between the <script> .. </script> tags
123  scripts.append(tag.getElement().getContent()).append("\n");
124 
125  } else if (tag.getName().equals("a")) {
126  //NON-NLS
127  numLinks++;
128  links.append(numLinks).append(") ");
129  links.append(tag.getTagContent()).append("\n");
130 
131  } else if (tag.getName().equals("img")) {
132  //NON-NLS
133  numImages++;
134  images.append(numImages).append(") ");
135  images.append(tag.getTagContent()).append("\n");
136 
137  } else if (tag.getTagType().equals(StartTagType.COMMENT)) {
138  numComments++;
139  comments.append(numComments).append(") ");
140  comments.append(tag.getTagContent()).append("\n");
141 
142  } else {
143  // Make sure it has an attribute
144  Attributes atts = tag.getAttributes();
145  if (atts != null && atts.length() > 0) {
146  numOthers++;
147  others.append(numOthers).append(") ");
148  others.append(tag.getName()).append(":");
149  others.append(tag.getTagContent()).append("\n");
150 
151  }
152  }
153  }
154 
155  if (numScripts > 0) {
156  metadataMap.put("Scripts", scripts.toString());
157  }
158  if (numLinks > 0) {
159  metadataMap.put("Links", links.toString());
160  }
161  if (numImages > 0) {
162  metadataMap.put("Images", images.toString());
163  }
164  if (numComments > 0) {
165  metadataMap.put("Comments", comments.toString());
166  }
167  if (numOthers > 0) {
168  metadataMap.put("Others", others.toString());
169  }
170  } catch (IOException ex) {
171  logger.log(Level.WARNING, "Error extracting HTML metadata from content.", ex);
172  }
173 
174  return metadataMap;
175  }
176 
186  @Override
187  public Reader getReader() throws InitReaderException {
188  //TODO JIRA-4467, there is only harm in excluding HTML documents greater
189  //than 50MB due to our troubled approach of extraction.
190  ReadContentInputStream stream = new ReadContentInputStream(file);
191 
192  //Parse the stream with Jericho and put the results in a Reader
193  try {
194  Source source = new Source(stream);
195  source.fullSequentialParse();
196  Renderer renderer = source.getRenderer();
197  renderer.setNewLine("\n");
198  renderer.setIncludeHyperlinkURLs(false);
199  renderer.setDecorateFontStyles(false);
200  renderer.setIncludeAlternateText(false);
201  renderer.setMaxLineLength(0); // don't force wrapping
202  return new StringReader(renderer.toString());
203  } catch (IOException ex) {
204  logger.log(Level.WARNING, "Error extracting HTML from content.", ex);
205  throw new InitReaderException("Error extracting HTML from content.", ex);
206  }
207  }
208 }

Copyright © 2012-2021 Basis Technology. Generated on: Tue Jan 19 2021
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.