api-docs/4.0/_html_text_extractor_8java_source.html

 /*

  * Autopsy Forensic Browser

  *

  * Copyright 2012-2013 Basis Technology Corp.

  * Contact: carrier <at> sleuthkit <dot> org

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 package org.sleuthkit.autopsy.keywordsearch;


 import java.io.IOException;

 import java.io.InputStream;

 import java.io.Reader;

 import java.nio.charset.Charset;

 import java.util.Arrays;

 import java.util.List;

 import java.util.Map;

 import java.util.logging.Level;

 import org.sleuthkit.autopsy.coreutils.Logger;

 import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;

 import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;

 import org.sleuthkit.datamodel.AbstractFile;

 import org.sleuthkit.datamodel.ReadContentInputStream;


 /**
  * TextExtractor for web content. Runs the file content through
  * JerichoParserWrapper, reads the resulting text from the wrapper's Reader
  * and hands it to the Ingester one chunk at a time, where a chunk holds
  * fewer than MAX_EXTR_TEXT_CHARS characters.
  */
 class HtmlTextExtractor implements TextExtractor {

     private static final Logger logger = Logger.getLogger(HtmlTextExtractor.class.getName());
     // Shared by all instances; (re)assigned from Server.getIngester() by every constructor call.
     private static Ingester ingester;
     // Charset used to encode each chunk before it is passed to the ingester.
     static final Charset outCharset = Server.DEFAULT_INDEXED_TEXT_CHARSET;
     // Capacity, in chars, of the buffer that holds one chunk.
     static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
     // Maximum number of chars requested from the reader per bulk read.
     private static final int SINGLE_READ_CHARS = 1024;
     // Headroom kept free after bulk reading so the chunk can be extended up to whitespace.
     private static final int EXTRA_CHARS = 128;
     // Files whose getSize() exceeds this value are reported as unsupported.
     private static final int MAX_SIZE = 50000000;
     // A BOM is not prepended to the extracted text (that step was disabled).
     private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
     private KeywordSearchIngestModule module;
     private AbstractFile sourceFile;
     private int numChunks = 0;

     // MIME types accepted by isSupported(); application/xml and
     // application/xml-dtd are currently disabled.
     static final List<String> WEB_MIME_TYPES = Arrays.asList(
             "application/javascript", //NON-NLS
             "application/xhtml+xml", //NON-NLS
             "application/json", //NON-NLS
             "text/css", //NON-NLS
             "text/html", //NON-NLS
             "text/javascript" //NON-NLS
     );

     /**
      * Creates an extractor bound to the given ingest module and refreshes
      * the shared ingester from the Server.
      *
      * @param module the keyword search ingest module that owns this extractor
      */
     HtmlTextExtractor(KeywordSearchIngestModule module) {
         this.module = module;
         ingester = Server.getIngester();
     }

     /**
      * The given scripts are not stored by this extractor.
      *
      * @param extractScripts ignored
      *
      * @return always false
      */
     @Override
     public boolean setScripts(List<SCRIPT> extractScripts) {
         return false;
     }

     /**
      * @return always null; this extractor keeps no script list
      */
     @Override
     public List<SCRIPT> getScripts() {
         return null;
     }

     /**
      * @return always null; this extractor keeps no options
      */
     @Override
     public Map<String, String> getOptions() {
         return null;
     }

     /**
      * Does nothing; the supplied options are ignored.
      *
      * @param options ignored
      */
     @Override
     public void setOptions(Map<String, String> options) {
     }

     /**
      * @return the number of chunks successfully indexed by the most recent
      *         index() call (0 before any call)
      */
     @Override
     public int getNumChunks() {
         return numChunks;
     }

     /**
      * @return the file passed to the most recent index() call, or null if
      *         index() has not been called yet
      */
     @Override
     public AbstractFile getSourceFile() {
         return sourceFile;
     }

     /**
      * Parses the file content, splits the resulting text into chunks and
      * indexes each chunk, then ingests this extractor itself so that the
      * chunk count is recorded for the parent file.
      *
      * Each chunk is filled with bulk reads until it comes within
      * SINGLE_READ_CHARS + EXTRA_CHARS of the buffer capacity, and is then
      * extended one char at a time until it ends in whitespace, the buffer is
      * one short of full, or the reader is exhausted.
      *
      * NOTE(review): an IngesterException raised while indexing a chunk is
      * thrown inside the try block below and so appears to be absorbed by the
      * generic catch (Exception) handler, making the caller see a false return
      * instead of the exception - confirm that this is intended.
      *
      * @param sourceFile the file to extract text from and index
      *
      * @return true if the text was read and every chunk was indexed without
      *         an exception; false if the parser supplied no reader or an
      *         exception was caught along the way
      *
      * @throws IngesterException declared for the final ingest of this
      *                           extractor, which happens outside the try
      *                           block
      */
     @Override
     public boolean index(AbstractFile sourceFile) throws IngesterException {
         this.sourceFile = sourceFile;
         numChunks = 0; //unknown until indexing is done

         boolean success = false;
         Reader reader = null;
         final InputStream stream = new ReadContentInputStream(sourceFile);

         try {
             // Parse the stream with Jericho
             JerichoParserWrapper jpw = new JerichoParserWrapper(stream);
             jpw.parse();
             reader = jpw.getReader();

             // In case there is an exception or parse() isn't called
             if (reader == null) {
                 logger.log(Level.WARNING, "No reader available from HTML parser"); //NON-NLS
                 return false;
             }

             success = true;

             // Bulk reads stop once the chunk reaches this size, leaving room
             // for one more bulk read plus the whitespace headroom.
             final int bulkFillLimit = MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS;
             // The word-completing reads never fill the very last buffer slot.
             final int hardLimit = MAX_EXTR_TEXT_CHARS - 1;

             boolean eof = false;
             int readSize;
             //we read max 1024 chars at time, this seems to max what this Reader would return
             while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
                 int chunkLen = readSize;

                 //consume more chars to fill the chunk (leave EXTRA_CHARS to end the word)
                 while (chunkLen < bulkFillLimit
                         && (readSize = reader.read(textChunkBuf, chunkLen, SINGLE_READ_CHARS)) != -1) {
                     chunkLen += readSize;
                 }

                 if (readSize != -1) {
                     //try to read until whitespace to not break words
                     while (chunkLen < hardLimit
                             && !Character.isWhitespace(textChunkBuf[chunkLen - 1])
                             && (readSize = reader.read(textChunkBuf, chunkLen, 1)) != -1) {
                         chunkLen += readSize;
                     }
                 }

                 //a -1 from either inner loop means this is the last chunk
                 eof = (readSize == -1);

                 indexChunk(chunkLen);
             }
         } catch (IOException ex) {
             logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
             success = false;
         } catch (Exception ex) {
             logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
             success = false;
         } finally {
             try {
                 stream.close();
             } catch (IOException ex) {
                 logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex); //NON-NLS
             }
             try {
                 if (reader != null) {
                     reader.close();
                 }
             } catch (IOException ex) {
                 logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
             }
         }

         //after all chunks, ingest the parent file without content itself, and store numChunks
         ingester.ingest(this);

         return success;
     }

     /**
      * Encodes the first chunkLen chars of the chunk buffer with outCharset
      * and indexes them as the next chunk. The chunk counter is advanced only
      * after the chunk has been indexed without an exception.
      *
      * @param chunkLen number of valid chars at the start of textChunkBuf
      *
      * @throws IngesterException if indexing the chunk fails; the failure is
      *                           logged before being rethrown
      */
     private void indexChunk(int chunkLen) throws IngesterException {
         //encode to bytes to index as byte stream
         final String extracted = new String(textChunkBuf, 0, chunkLen);
         final byte[] encodedBytes = extracted.getBytes(outCharset);

         final AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
         try {
             chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
             ++this.numChunks;
         } catch (Ingester.IngesterException ingEx) {
             logger.log(Level.WARNING, "Ingester had a problem with extracted HTML from file '" //NON-NLS
                     + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
             throw ingEx; //need to rethrow/return to signal error and move on
         }
     }

     /**
      * @return always true
      */
     @Override
     public boolean isContentTypeSpecific() {
         return true;
     }

     /**
      * Tells whether this extractor handles the given file.
      *
      * @param file           the candidate file; only consulted when the
      *                       format is one of WEB_MIME_TYPES
      * @param detectedFormat the detected MIME type, may be null
      *
      * @return true if detectedFormat is non-null, is listed in
      *         WEB_MIME_TYPES, and the file's size does not exceed MAX_SIZE
      */
     @Override
     public boolean isSupported(AbstractFile file, String detectedFormat) {
         return detectedFormat != null
                 && WEB_MIME_TYPES.contains(detectedFormat)
                 && file.getSize() <= MAX_SIZE;
     }
 }

org.sleuthkit

org.sleuthkit.autopsy.coreutils.StringExtract
Definition: StringExtract.java:43

org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable
Definition: StringExtract.java:654

org

org.sleuthkit.autopsy.coreutils
Definition: AutopsyExceptionHandler.java:19

org.sleuthkit.autopsy.coreutils.Logger
Definition: Logger.java:36

org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
Definition: Ingester.java:450

org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT
Definition: StringExtract.java:664

org.sleuthkit.autopsy.keywordsearch.Ingester
Definition: Ingester.java:50

org.sleuthkit.autopsy

org.sleuthkit.autopsy.keywordsearch
Definition: AbstractFileChunk.java:19