19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import java.io.IOException;
 
   22 import java.io.Reader;
 
   23 import java.io.StringReader;
 
   24 import java.util.Arrays;
 
   25 import java.util.List;
 
   26 import java.util.logging.Level;
 
   27 import net.htmlparser.jericho.Attributes;
 
   28 import net.htmlparser.jericho.Renderer;
 
   29 import net.htmlparser.jericho.Source;
 
   30 import net.htmlparser.jericho.StartTag;
 
   31 import net.htmlparser.jericho.StartTagType;
 
   39 class HtmlTextExtractor 
extends FileTextExtractor {
 
   // Logger for this class; logWarning() writes to it at WARNING level.
   41     static final private Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
 
   // Largest file size accepted by isSupported(), which compares it with file.getSize().
   // NOTE(review): presumably a byte count (about 50 MB) - confirm the unit of getSize().
   42     private static final int MAX_SIZE = 50_000_000; 
 
   // Detected-format strings that isSupported() accepts.
   // NOTE(review): only the first two entries are visible here; the rest of the list and the
   // end of the Arrays.asList call are not shown - confirm the full set.
   44     static final List<String> WEB_MIME_TYPES = Arrays.asList(
 
   45             "application/javascript", 
 
   46             "application/xhtml+xml", 
 
   // NOTE(review): the body of this method is not visible here; presumably it reports whether
   // this extractor is limited to particular content types - confirm against the body and callers.
   54     boolean isContentTypeSpecific() {
 
   /**
    * Decides whether this extractor accepts the given file.
    *
    * @param file           the file being considered; only its size is consulted here
    * @param detectedFormat the detected format of the file; may be null
    *
    * @return true only when detectedFormat is non-null, is contained in
    *         WEB_MIME_TYPES, and file.getSize() does not exceed MAX_SIZE
    */
   59     boolean isSupported(AbstractFile file, String detectedFormat) {
 
              // The null check comes first, so a missing detected format is rejected outright.
   60         return detectedFormat != null
 
   61                 && WEB_MIME_TYPES.contains(detectedFormat)
 
   62                 && file.getSize() <= MAX_SIZE;
 
   /**
    * Builds a Reader over text extracted from the given file. The result
    * starts with the text rendered by the Jericho Renderer, followed by a
    * NONVISIBLE TEXT part that lists content collected from script, a, img,
    * comment and other attribute-carrying tags.
    *
    * @param sourceFile the file whose content is parsed
    *
    * @return a StringReader over the assembled text
    *
    * @throws TextExtractorException when an IOException is caught during
    *                                extraction; it is constructed with that
    *                                IOException
    */
   66     public Reader getReader(AbstractFile sourceFile) 
throws TextExtractorException {
 
              // Stream over the file content; it is handed to the Jericho Source below.
   67         ReadContentInputStream stream = 
new ReadContentInputStream(sourceFile);
 
                  // One buffer per category of tag content gathered in the loop below.
                  // NOTE(review): the counters numScripts, numLinks, numImages, numComments and
                  // numOthers used below are neither declared nor updated in the lines visible
                  // here, and the opening of the try block is not shown either - confirm both.
   71             StringBuilder scripts = 
new StringBuilder();
 
   72             StringBuilder links = 
new StringBuilder();
 
   73             StringBuilder images = 
new StringBuilder();
 
   74             StringBuilder comments = 
new StringBuilder();
 
   75             StringBuilder others = 
new StringBuilder();
 
                  // Parse the stream with Jericho before asking for a renderer or the tag list.
   82             Source source = 
new Source(stream);
 
   83             source.fullSequentialParse();
 
                  // Renderer settings: newline as the line separator, and hyperlink URLs,
                  // font-style decoration and alternate text all switched off.
   84             Renderer renderer = source.getRenderer();
 
   85             renderer.setNewLine(
"\n");
 
   86             renderer.setIncludeHyperlinkURLs(
false);
 
   87             renderer.setDecorateFontStyles(
false);
 
   88             renderer.setIncludeAlternateText(
false);
 
                  // The rendered text becomes the first part of the output.
   90             String text = renderer.toString();
 
                  // Every start tag in the document; each one is sorted into a buffer above.
   92             List<StartTag> tags = source.getAllStartTags();
 
   94             StringBuilder stringBuilder = 
new StringBuilder();
 
   95             for (StartTag tag : tags) {
 
   96                 if (tag.getName().equals(
"script")) {                
 
                          // script: entry number, then the tag content when it is non-empty,
                          // then the content of the whole element.
   99                     scripts.append(numScripts).append(
") ");
 
  100                     if (tag.getTagContent().length() > 0) {
 
  101                         scripts.append(tag.getTagContent()).append(
" ");
 
  104                     scripts.append(tag.getElement().getContent()).append(
"\n");
 
  106                 } 
else if (tag.getName().equals(
"a")) {
 
                          // a: entry number followed by the tag content.
  109                     links.append(numLinks).append(
") ");
 
  110                     links.append(tag.getTagContent()).append(
"\n");
 
  112                 } 
else if (tag.getName().equals(
"img")) {
 
                          // img: entry number followed by the tag content.
  115                     images.append(numImages).append(
") ");
 
  116                     images.append(tag.getTagContent()).append(
"\n");
 
  118                 } 
else if (tag.getTagType().equals(StartTagType.COMMENT)) {
 
                          // Tags of type COMMENT: entry number followed by the tag content.
  120                     comments.append(numComments).append(
") ");
 
  121                     comments.append(tag.getTagContent()).append(
"\n");
 
                          // NOTE(review): presumably the fall-through branch for all remaining tags
                          // (its header is not visible here). Only tags that have at least one
                          // attribute are recorded, as tag name, a colon, then the tag content.
  125                     Attributes atts = tag.getAttributes();
 
  126                     if (atts != null && atts.length() > 0) {
 
  128                         others.append(numOthers).append(
") ");
 
  129                         others.append(tag.getName()).append(
":");
 
  130                         others.append(tag.getTagContent()).append(
"\n");
 
                  // Assemble the result: rendered text, a separator heading, then the sections.
  135             stringBuilder.append(text).append(
"\n\n");
 
  136             stringBuilder.append(
"----------NONVISIBLE TEXT----------\n\n"); 
 
                  // The Scripts section is written only when numScripts is positive.
  137             if (numScripts > 0) {
 
  138                 stringBuilder.append(
"---Scripts---\n"); 
 
  139                 stringBuilder.append(scripts).append(
"\n");
 
                      // NOTE(review): the conditions guarding the Links and Images sections
                      // are not visible here - confirm they mirror the Scripts check.
  142                 stringBuilder.append(
"---Links---\n"); 
 
  143                 stringBuilder.append(links).append(
"\n");
 
  146                 stringBuilder.append(
"---Images---\n"); 
 
  147                 stringBuilder.append(images).append(
"\n");
 
                  // The Comments section is written only when numComments is positive.
  149             if (numComments > 0) {
 
  150                 stringBuilder.append(
"---Comments---\n"); 
 
  151                 stringBuilder.append(comments).append(
"\n");
 
                      // NOTE(review): the condition guarding the Others section is not visible here.
  154                 stringBuilder.append(
"---Others---\n"); 
 
  155                 stringBuilder.append(others).append(
"\n");
 
                  // Hand the assembled text back as an in-memory Reader.
  158             return new StringReader(stringBuilder.toString());
 
  159         } 
catch (IOException ex) {
 
                  // Rethrow as the extractor's own exception type, passing the IOException along.
  160             throw new TextExtractorException(
"Error extracting HTML from content.", ex);
 
  // NOTE(review): the body of this method is not visible here; presumably it reports whether
  // this extractor is disabled - confirm against the body and callers.
  165     public boolean isDisabled() {
 
  /**
   * Writes the given message and exception to this class's logger at
   * WARNING level.
   *
   * @param msg the message to log
   * @param ex  the exception logged together with the message
   */
  169     public void logWarning(
final String msg, Exception ex) {
 
  170         logger.log(Level.WARNING, msg, ex);