Autopsy  4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
HtmlTextExtractor.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2016 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.io.IOException;
22 import java.io.Reader;
23 import java.io.StringReader;
24 import java.util.Arrays;
25 import java.util.List;
26 import java.util.logging.Level;
27 import net.htmlparser.jericho.Attributes;
28 import net.htmlparser.jericho.Renderer;
29 import net.htmlparser.jericho.Source;
30 import net.htmlparser.jericho.StartTag;
31 import net.htmlparser.jericho.StartTagType;
35 
39 class HtmlTextExtractor extends FileTextExtractor {
40 
41  static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
42  private static final int MAX_SIZE = 50_000_000; //50MB
43 
44  static final List<String> WEB_MIME_TYPES = Arrays.asList(
45  "application/javascript", //NON-NLS
46  "application/xhtml+xml", //NON-NLS
47  "application/json", //NON-NLS
48  "text/css", //NON-NLS
49  "text/html", //NON-NLS NON-NLS
50  "text/javascript" //NON-NLS
51  );
52 
53  @Override
54  boolean isContentTypeSpecific() {
55  return true;
56  }
57 
58  @Override
59  boolean isSupported(AbstractFile file, String detectedFormat) {
60  return detectedFormat != null
61  && WEB_MIME_TYPES.contains(detectedFormat)
62  && file.getSize() <= MAX_SIZE;
63  }
64 
65  @Override
66  public Reader getReader(AbstractFile sourceFile) throws TextExtractorException {
67  ReadContentInputStream stream = new ReadContentInputStream(sourceFile);
68 
69  //Parse the stream with Jericho and put the results in a Reader
70  try {
71  StringBuilder scripts = new StringBuilder();
72  StringBuilder links = new StringBuilder();
73  StringBuilder images = new StringBuilder();
74  StringBuilder comments = new StringBuilder();
75  StringBuilder others = new StringBuilder();
76  int numScripts = 0;
77  int numLinks = 0;
78  int numImages = 0;
79  int numComments = 0;
80  int numOthers = 0;
81 
82  Source source = new Source(stream);
83  source.fullSequentialParse();
84  Renderer renderer = source.getRenderer();
85  renderer.setNewLine("\n");
86  renderer.setIncludeHyperlinkURLs(false);
87  renderer.setDecorateFontStyles(false);
88  renderer.setIncludeAlternateText(false);
89 
90  String text = renderer.toString();
91  // Get all the tags in the source
92  List<StartTag> tags = source.getAllStartTags();
93 
94  StringBuilder stringBuilder = new StringBuilder();
95  for (StartTag tag : tags) {
96  if (tag.getName().equals("script")) { //NON-NLS
97  // If the <script> tag has attributes
98  numScripts++;
99  scripts.append(numScripts).append(") ");
100  if (tag.getTagContent().length() > 0) {
101  scripts.append(tag.getTagContent()).append(" ");
102  }
103  // Get whats between the <script> .. </script> tags
104  scripts.append(tag.getElement().getContent()).append("\n");
105 
106  } else if (tag.getName().equals("a")) {
107  //NON-NLS
108  numLinks++;
109  links.append(numLinks).append(") ");
110  links.append(tag.getTagContent()).append("\n");
111 
112  } else if (tag.getName().equals("img")) {
113  //NON-NLS
114  numImages++;
115  images.append(numImages).append(") ");
116  images.append(tag.getTagContent()).append("\n");
117 
118  } else if (tag.getTagType().equals(StartTagType.COMMENT)) {
119  numComments++;
120  comments.append(numComments).append(") ");
121  comments.append(tag.getTagContent()).append("\n");
122 
123  } else {
124  // Make sure it has an attribute
125  Attributes atts = tag.getAttributes();
126  if (atts != null && atts.length() > 0) {
127  numOthers++;
128  others.append(numOthers).append(") ");
129  others.append(tag.getName()).append(":");
130  others.append(tag.getTagContent()).append("\n");
131 
132  }
133  }
134  }
135  stringBuilder.append(text).append("\n\n");
136  stringBuilder.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS
137  if (numScripts > 0) {
138  stringBuilder.append("---Scripts---\n"); //NON-NLS
139  stringBuilder.append(scripts).append("\n");
140  }
141  if (numLinks > 0) {
142  stringBuilder.append("---Links---\n"); //NON-NLS
143  stringBuilder.append(links).append("\n");
144  }
145  if (numImages > 0) {
146  stringBuilder.append("---Images---\n"); //NON-NLS
147  stringBuilder.append(images).append("\n");
148  }
149  if (numComments > 0) {
150  stringBuilder.append("---Comments---\n"); //NON-NLS
151  stringBuilder.append(comments).append("\n");
152  }
153  if (numOthers > 0) {
154  stringBuilder.append("---Others---\n"); //NON-NLS
155  stringBuilder.append(others).append("\n");
156  }
157  // All done, now make it a reader
158  return new StringReader(stringBuilder.toString());
159  } catch (IOException ex) {
160  throw new TextExtractorException("Error extracting HTML from content.", ex);
161  }
162  }
163 
164  @Override
165  public boolean isDisabled() {
166  return false;
167  }
168 
169  public void logWarning(final String msg, Exception ex) {
170  logger.log(Level.WARNING, msg, ex); //NON-NLS }
171  }
172 }

Copyright © 2012-2016 Basis Technology. Generated on: Mon Apr 24 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.