19 package org.sleuthkit.autopsy.keywordsearch;
 
   21 import java.io.IOException;
 
   22 import java.io.InputStream;
 
   23 import java.io.Reader;
 
   24 import java.io.StringReader;
 
   25 import java.util.List;
 
   26 import java.util.logging.Level;
 
   28 import net.htmlparser.jericho.Attributes;
 
   29 import net.htmlparser.jericho.Renderer;
 
   30 import net.htmlparser.jericho.Source;
 
   31 import net.htmlparser.jericho.StartTag;
 
   32 import net.htmlparser.jericho.StartTagType;
 
   /**
    * Package-private helper around the Jericho HTML parser
    * (net.htmlparser.jericho). It holds an input stream, a text buffer and a
    * Reader over the text produced from that stream.
    *
    * NOTE(review): this listing carries its own line numbers and omits several
    * original lines, so only the visible members are described.
    */
   39 class JerichoParserWrapper {
 
      // Logger named after this class; getReader() uses it to report parse failures.
   40     private static final Logger logger = Logger.
getLogger(JerichoParserWrapper.class.getName());
 
      // HTML input; getReader() passes it to the Jericho Source constructor.
   41     private InputStream in;    
 
      // Accumulates the extracted text; re-created inside getReader().
   42     private StringBuilder out;
 
      // Assigned in getReader() to a StringReader over the contents of 'out'.
   43     private Reader reader;
 
      // Constructor taking the HTML input stream.
      // NOTE(review): the body is not visible here; presumably it stores the
      // argument in the 'in' field - confirm against the full file.
   45     JerichoParserWrapper(InputStream in) {
 
   /**
    * Parses the HTML from the 'in' stream with Jericho and builds a plain-text
    * document in 'out': the rendered visible text first, then a
    * "NONVISIBLE TEXT" part with numbered entries for script, link, image,
    * comment and other start tags. The result is wrapped in a StringReader
    * stored in 'reader'. An IOException is logged at WARNING level.
    *
    * NOTE(review): several original lines are missing from this listing (the
    * declarations of 'text' and the num* counters, their increments, some
    * braces and conditions, and the return statement). Comments below only
    * describe the statements that are visible.
    *
    * @return not visible in this listing; presumably the 'reader' field -
    *         confirm against the full file
    */
   54     public Reader getReader() {
 
              // Start from an empty output buffer.
   63         out = 
new StringBuilder();
 
   66             Source source = 
new Source(in);
 
                  // Run a full sequential parse of the document before querying tags.
   67             source.fullSequentialParse();
 
                  // One buffer per category of tag content collected below.
   70             StringBuilder scripts = 
new StringBuilder();
 
   71             StringBuilder links = 
new StringBuilder();
 
   72             StringBuilder images = 
new StringBuilder();
 
   73             StringBuilder comments = 
new StringBuilder();
 
   74             StringBuilder others = 
new StringBuilder();
 
                  // Visible text, produced by renderHTMLAsPlainText in this class.
   81             text = renderHTMLAsPlainText(source);
 
                  // Walk every start tag and file it into the matching buffer.
   84             List<StartTag> tags = source.getAllStartTags();
 
   85             for(StartTag tag : tags) {
 
   86                 if(tag.getName().equals(
"script")) { 
 
                          // Each entry is prefixed with its running number and ") ".
                          // NOTE(review): numScripts and the other counters are declared
                          // and incremented on lines not shown here.
   88                     scripts.append(numScripts).append(
") ");
 
                          // Include the tag's own content only when it is non-empty.
   89                     if(tag.getTagContent().length()>0) {
 
   90                         scripts.append(tag.getTagContent()).append(
" ");
 
                          // Then the content of the script element itself.
   93                     scripts.append(tag.getElement().getContent()).append(
"\n");
 
   95                 } 
else if(tag.getName().equals(
"a")) { 
 
   96                     links.append(numLinks).append(
") ");
 
   97                     links.append(tag.getTagContent()).append(
"\n");
 
   99                 } 
else if(tag.getName().equals(
"img")) { 
 
  100                     images.append(numImages).append(
") ");
 
  101                     images.append(tag.getTagContent()).append(
"\n");
 
  103                 } 
else if(tag.getTagType().equals(StartTagType.COMMENT)) {
 
                          // Comments are recognised by tag type, not by tag name.
  104                     comments.append(numComments).append(
") ");
 
  105                     comments.append(tag.getTagContent()).append(
"\n");
 
                          // NOTE(review): the line introducing this branch is not visible;
                          // presumably it is the fall-through case for all remaining tags.
                          // Such a tag is recorded only when it has at least one attribute.
  109                     Attributes atts = tag.getAttributes();
 
  110                     if (atts!=null && atts.length()>0) {
 
  111                         others.append(numOthers).append(
") ");
 
  112                         others.append(tag.getName()).append(
":");
 
  113                         others.append(tag.getTagContent()).append(
"\n");
 
                  // Output layout: visible text, blank line, separator, then the sections.
  119             out.append(text).append(
"\n\n");
 
  121             out.append(
"----------NONVISIBLE TEXT----------\n\n"); 
 
                      // NOTE(review): the conditions guarding the Scripts, Links, Images
                      // and Others sections are on lines not shown here.
  123                 out.append(
"---Scripts---\n"); 
 
  124                 out.append(scripts.toString()).append(
"\n");
 
  126                 out.append(
"---Links---\n"); 
 
  127                 out.append(links.toString()).append(
"\n");
 
  129                 out.append(
"---Images---\n"); 
 
  130                 out.append(images.toString()).append(
"\n");
 
  131             } 
if(numComments>1) {
 
                      // Emitted only when numComments > 1.
                      // NOTE(review): presumably the counters start at 1, so this means
                      // "at least one comment" - confirm the initial value.
  132                 out.append(
"---Comments---\n"); 
 
  133                 out.append(comments.toString()).append(
"\n");
 
  135                 out.append(
"---Others---\n"); 
 
  136                 out.append(others.toString()).append(
"\n");
 
                  // Expose the assembled text through a Reader.
  139             reader = 
new StringReader(out.toString());
 
  140         } 
catch (IOException ex) {
 
                  // Failure is logged, not rethrown, by the visible code.
  141             logger.log(Level.WARNING, 
"Unable to parse the HTML file", ex); 
 
  /**
   * Renders a parsed Jericho Source to plain text using the Source's own
   * Renderer, configured with "\n" as the newline sequence and with hyperlink
   * URLs, font-style decoration and alternate text all switched off.
   *
   * NOTE(review): the method's closing brace is not part of this listing.
   *
   * @param source the Jericho Source to render
   * @return the renderer's string output for the given source
   */
  147     private String renderHTMLAsPlainText(Source source) {
 
  148         Renderer renderer = source.getRenderer();
 
              // Use a bare line feed as the newline sequence in the output.
  149         renderer.setNewLine(
"\n");
 
              // Disable hyperlink URLs, font-style decoration and alternate text
              // in the rendered output.
  150         renderer.setIncludeHyperlinkURLs(
false);
 
  151         renderer.setDecorateFontStyles(
false);
 
  152         renderer.setIncludeAlternateText(
false);
 
  153         return renderer.toString();
 
static Logger getLogger(String name)