19 package org.sleuthkit.autopsy.textextractors;
 
   21 import java.io.IOException;
 
   22 import java.io.Reader;
 
   23 import java.io.StringReader;
 
   24 import java.util.Arrays;
 
   25 import java.util.HashMap;
 
   26 import java.util.List;
 
   28 import java.util.logging.Level;
 
   29 import net.htmlparser.jericho.Attributes;
 
   30 import net.htmlparser.jericho.Config;
 
   31 import net.htmlparser.jericho.LoggerProvider;
 
   32 import net.htmlparser.jericho.Renderer;
 
   33 import net.htmlparser.jericho.Source;
 
   34 import net.htmlparser.jericho.StartTag;
 
   35 import net.htmlparser.jericho.StartTagType;
 
   /**
    * TextExtractor implementation for web content. It reads an AbstractFile
    * through the Jericho HTML parser (net.htmlparser.jericho) to produce plain
    * text and tag-level metadata.
    *
    * NOTE(review): this listing omits some lines of the class (for example the
    * rest of the MIME type list and the closing braces), so the comments below
    * describe only what is visible here.
    */
   43 final class HtmlTextExtractor 
implements TextExtractor {
 
          // Logger used to report extraction failures at WARNING level.
   45     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
 
          // Upper bound that isSupported() compares against file.getSize();
          // assigned once in the constructor.
   46     private final int MAX_SIZE;
 
          // The file whose content is parsed by getMetadata() and getReader().
   47     private final AbstractFile file;
 
          // MIME types that isSupported() accepts.
          // NOTE(review): only the first two entries are visible in this excerpt.
   49     static final List<String> WEB_MIME_TYPES = Arrays.asList(
 
   50             "application/javascript", 
 
   51             "application/xhtml+xml", 
 
              // Switches off Jericho's own logging by installing the DISABLED provider.
              // NOTE(review): presumably inside a static initializer - the enclosing
              // block header is not visible here.
   60         Config.LoggerProvider = LoggerProvider.DISABLED;
 
   /**
    * Creates an extractor for the given file and fixes the maximum supported
    * size at 50,000,000 (the value compared against file.getSize() in
    * isSupported()).
    *
    * @param file the file to extract from. NOTE(review): presumably stored in
    *             the file field; that assignment is not visible in this excerpt.
    */
   67     public HtmlTextExtractor(AbstractFile file) {
 
              // Size ceiling; presumably bytes, i.e. roughly 50 MB - confirm
              // against AbstractFile.getSize().
   69         MAX_SIZE = 50_000_000;
 
   /**
    * Reports whether this extractor can handle the file.
    *
    * @return true only when the file has a non-null MIME type, that MIME type
    *         is listed in WEB_MIME_TYPES, and the file size does not exceed
    *         MAX_SIZE
    */
   82     public boolean isSupported() {
 
              // The null check comes first so a file without a MIME type is
              // rejected before the list lookup.
   83         return file.getMIMEType() != null
 
   84                 && WEB_MIME_TYPES.contains(file.getMIMEType())
 
   85                 && file.getSize() <= MAX_SIZE;
 
   /**
    * Parses the file with Jericho and collects numbered listings of its start
    * tags, grouped as scripts, links (a), images (img), comments, and other
    * tags that carry attributes. The listings are stored in a map under the
    * keys "Scripts", "Links", "Images", "Comments" and "Others".
    *
    * An IOException raised while reading is logged at WARNING level instead
    * of being propagated.
    *
    * NOTE(review): this listing omits several lines of the method - the
    * declarations and increments of numScripts, numLinks, numImages,
    * numComments and numOthers, the opening of the try block, some of the
    * guards around metadataMap.put, the return statement and the closing
    * braces are not visible here.
    *
    * @return presumably metadataMap - the return statement is not visible in
    *         this excerpt
    */
   95     public Map<String, String> getMetadata() {
 
   96         Map<String, String> metadataMap = 
new HashMap<>();
 
                  // Stream over the file content that is handed to the parser.
                  // NOTE(review): no close() of this stream is visible in this
                  // excerpt - confirm it is released.
   98             ReadContentInputStream stream = 
new ReadContentInputStream(file);
 
                  // One buffer per category; each starts with a newline so the
                  // numbered entries begin on their own line.
   99             StringBuilder scripts = 
new StringBuilder(
"\n");
 
  100             StringBuilder links = 
new StringBuilder(
"\n");
 
  101             StringBuilder images = 
new StringBuilder(
"\n");
 
  102             StringBuilder comments = 
new StringBuilder(
"\n");
 
  103             StringBuilder others = 
new StringBuilder(
"\n");
 
                  // Parse the whole document up front, then walk every start tag.
  110             Source source = 
new Source(stream);
 
  111             source.fullSequentialParse();
 
  113             List<StartTag> tags = source.getAllStartTags();
 
  114             for (StartTag tag : tags) {
 
                      // Script tags: entry number, the tag content when it is
                      // non-empty, then the content of the script element.
  115                 if (tag.getName().equals(
"script")) {                
 
  118                     scripts.append(numScripts).append(
") ");
 
  119                     if (tag.getTagContent().length() > 0) {
 
  120                         scripts.append(tag.getTagContent()).append(
" ");
 
  123                     scripts.append(tag.getElement().getContent()).append(
"\n");
 
                      // Anchor tags: entry number followed by the tag content.
  125                 } 
else if (tag.getName().equals(
"a")) {
 
  128                     links.append(numLinks).append(
") ");
 
  129                     links.append(tag.getTagContent()).append(
"\n");
 
                      // Image tags: entry number followed by the tag content.
  131                 } 
else if (tag.getName().equals(
"img")) {
 
  134                     images.append(numImages).append(
") ");
 
  135                     images.append(tag.getTagContent()).append(
"\n");
 
                      // Comments are recognised by tag type, not by name.
  137                 } 
else if (tag.getTagType().equals(StartTagType.COMMENT)) {
 
  139                     comments.append(numComments).append(
") ");
 
  140                     comments.append(tag.getTagContent()).append(
"\n");
 
                          // Any other tag is recorded only when it has at least one
                          // attribute, as "name:tag content".
                          // NOTE(review): the else line that introduces this fallback
                          // is not visible in this excerpt.
  144                     Attributes atts = tag.getAttributes();
 
  145                     if (atts != null && atts.length() > 0) {
 
  147                         others.append(numOthers).append(
") ");
 
  148                         others.append(tag.getName()).append(
":");
 
  149                         others.append(tag.getTagContent()).append(
"\n");
 
                  // Publish each listing under its key. The visible guards skip
                  // empty categories.
                  // NOTE(review): the guards for Links, Images and Others are
                  // not visible in this excerpt.
  155             if (numScripts > 0) {
 
  156                 metadataMap.put(
"Scripts", scripts.toString());
 
  159                 metadataMap.put(
"Links", links.toString());
 
  162                 metadataMap.put(
"Images", images.toString());
 
  164             if (numComments > 0) {
 
  165                 metadataMap.put(
"Comments", comments.toString());
 
  168                 metadataMap.put(
"Others", others.toString());
 
              // A read failure is logged only; no rethrow is visible in this excerpt.
  170         } 
catch (IOException ex) {
 
  171             logger.log(Level.WARNING, 
"Error extracting HTML metadata from content.", ex);
 
  /**
   * Renders the file's HTML to plain text with Jericho and returns a reader
   * over the rendered string.
   *
   * NOTE(review): this listing omits some lines of the method (the opening of
   * the try block and the closing braces are not visible here).
   *
   * @return a StringReader over the text produced by the Jericho Renderer
   * @throws InitReaderException if anything is thrown while parsing or
   *                             rendering; the original Throwable is logged
   *                             at WARNING level and attached as the cause
   */
  187     public Reader getReader() throws InitReaderException {
 
              // NOTE(review): no close() of this stream is visible in this
              // excerpt - confirm it is released.
  190         ReadContentInputStream stream = 
new ReadContentInputStream(file);
 
                  // Parse the whole document before asking for a renderer.
  194             Source source = 
new Source(stream);
 
  195             source.fullSequentialParse();
 
                  // Renderer configuration: newline as line separator, no
                  // hyperlink URLs, no font-style decoration, no alternate text.
  196             Renderer renderer = source.getRenderer();
 
  197             renderer.setNewLine(
"\n");
 
  198             renderer.setIncludeHyperlinkURLs(
false);
 
  199             renderer.setDecorateFontStyles(
false);
 
  200             renderer.setIncludeAlternateText(
false);
 
                  // NOTE(review): a maximum line length of 0 presumably disables
                  // line wrapping - confirm against the Jericho Renderer docs.
  201             renderer.setMaxLineLength(0); 
 
  202             return new StringReader(renderer.toString());
 
              // Throwable is caught, so Errors as well as exceptions are logged
              // and converted into InitReaderException.
  203         } 
catch (Throwable ex) {
 
  206             logger.log(Level.WARNING, 
"Error extracting HTML from content.", ex);
 
  207             throw new InitReaderException(
"Error extracting HTML from content.", ex);