19 package org.sleuthkit.autopsy.recentactivity;
 
   22 import java.io.IOException;
 
   23 import java.io.UnsupportedEncodingException;
 
   24 import java.net.URLDecoder;
 
   25 import java.util.Arrays;
 
   26 import java.util.ArrayList;
 
   27 import java.util.Collection;
 
   28 import java.util.HashSet;
 
   29 import java.util.List;
 
   30 import java.util.logging.Level;
 
   31 import java.util.regex.Matcher;
 
   32 import java.util.regex.Pattern;
 
   34 import javax.xml.parsers.DocumentBuilder;
 
   35 import javax.xml.parsers.DocumentBuilderFactory;
 
   36 import javax.xml.parsers.ParserConfigurationException;
 
   37 import org.openide.util.NbBundle;
 
   51 import org.w3c.dom.Document;
 
   52 import org.w3c.dom.NamedNodeMap;
 
   53 import org.w3c.dom.NodeList;
 
   54 import org.xml.sax.SAXException;
 
   64     "cannotBuildXmlParser=Unable to build XML parser: ",
 
   65     "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
 
   66     "cannotParseXml=Unable to parse XML file: ",
 
   67     "# {0} - file name", 
"SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}.",
 
   68     "Progress_Message_Find_Search_Query=Find Search Queries" 
   70 class SearchEngineURLQueryAnalyzer extends Extract {
 
   72     private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
 
   73     private static final String XMLFILE = 
"SEUQAMappings.xml"; 
 
   74     private static final String XSDFILE = 
"SearchEngineSchema.xsd"; 
 
   75     private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
 
   77     private Content dataSource;
 
   78     private final IngestJobContext context;
 
   80     SearchEngineURLQueryAnalyzer(IngestJobContext context) {
 
   81         super(NbBundle.getMessage(ExtractIE.class, 
"SearchEngineURLQueryAnalyzer.moduleName.text"), context);
 
   82         this.context = context;
 
   91         private final String 
key;
 
   94         KeyPair(String key, String keyRegExp) {
 
   96             this.keyRegExp = keyRegExp;
 
  103         String getKeyRegExp() {
 
  117         SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
 
  118             this.engineName = engineName;
 
  119             this.domainSubstring = domainSubstring;
 
  120             domainRegexPattern = Pattern.compile(
"^(.*[./])?" + domainSubstring + 
"([./].*)?$");
 
  121             this.keyPairs = keyPairs;
 
  129         String getEngineName() {
 
  133         String getDomainSubstring() {
 
  134             return domainSubstring;
 
  137         Pattern getDomainRegexPattern() {
 
  138             return domainRegexPattern;
 
  150         List<KeyPair> getKeys() {
 
  151             return this.keyPairs;
 
  158                 split = split + 
"[ " + kp.getKey() + 
" :: " + kp.getKeyRegExp() + 
" ]" + 
", ";
 
  160             return NbBundle.getMessage(this.getClass(), 
"SearchEngineURLQueryAnalyzer.toString",
 
  161                     engineName, domainSubstring, count, split);
 
  169             File f = 
new File(path);
 
  170             logger.log(Level.INFO, 
"Load successful"); 
 
  171             DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
 
  172             DocumentBuilder db = dbf.newDocumentBuilder();
 
  173             xmlinput = db.parse(f);
 
  175             if (!
XMLUtil.
xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
 
  176                 logger.log(Level.WARNING, 
"Error loading Search Engines: could not validate against [" + XSDFILE + 
"], results may not be accurate."); 
 
  179         } 
catch (IOException e) {
 
  180             throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e); 
 
  181         } 
catch (ParserConfigurationException pce) {
 
  182             throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce); 
 
  183         } 
catch (SAXException sxe) {
 
  184             throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe); 
 
  187         NodeList nlist = xmlinput.getElementsByTagName(
"SearchEngine"); 
 
  188         SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = 
new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
 
  189         for (
int i = 0; i < nlist.getLength(); i++) {
 
  190             NamedNodeMap nnm = nlist.item(i).getAttributes();
 
  192             String EngineName = nnm.getNamedItem(
"engine").getNodeValue(); 
 
  193             String EnginedomainSubstring = nnm.getNamedItem(
"domainSubstring").getNodeValue(); 
 
  194             List<KeyPair> keys = 
new ArrayList<>();
 
  196             NodeList listSplits = xmlinput.getElementsByTagName(
"splitToken"); 
 
  197             for (
int k = 0; k < listSplits.getLength(); k++) {
 
  198                 if (listSplits.item(k).getParentNode().getAttributes().getNamedItem(
"engine").getNodeValue().equals(EngineName)) { 
 
  199                     keys.add(
new KeyPair(listSplits.item(k).getAttributes().getNamedItem(
"plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem(
"regexToken").getNodeValue())); 
 
  203             SearchEngineURLQueryAnalyzer.SearchEngine Se = 
new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
 
  206         engines = listEngines;
 
  219     private static Collection<SearchEngineURLQueryAnalyzer.SearchEngine> getSearchEngineFromUrl(String domain) {
 
  220         List<SearchEngineURLQueryAnalyzer.SearchEngine> supportedEngines = 
new ArrayList<>();
 
  221         if (engines == null) {
 
  222             return supportedEngines;
 
  224         for (SearchEngine engine : engines) {
 
  225             Matcher matcher = engine.getDomainRegexPattern().matcher(domain);
 
  226             if (matcher.matches()) {
 
  227                 supportedEngines.add(engine);
 
  230         return supportedEngines;
 
  240     private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
 
  243         for (KeyPair kp : eng.getKeys()) {
 
  244             if (url.contains(kp.getKey())) {
 
  245                 x = getValue(url, kp.getKeyRegExp());
 
  250             String decoded = URLDecoder.decode(x.replaceAll(
"%(?![0-9a-fA-F]{2})", 
"%25"), 
"UTF-8"); 
 
  252         } 
catch (UnsupportedEncodingException exception) { 
 
  253             logger.log(Level.FINE, 
"Error during URL decoding, returning undecoded value:" 
  255                     + 
"\n\tUndecoded value: " + x
 
  256                     + 
"\n\tEngine name: " + eng.getEngineName()
 
  257                     + 
"\n\tEngine domain: " + eng.getDomainSubstring(), exception); 
 
  259         } 
catch (IllegalArgumentException exception) { 
 
  260             logger.log(Level.SEVERE, 
"Illegal argument passed to URL decoding, returning undecoded value:" 
  262                     + 
"\n\tUndecoded value: " + x
 
  263                     + 
"\n\tEngine name: " + eng.getEngineName()
 
  264                     + 
"\n\tEngine domain: " + eng.getDomainSubstring(), exception); 
 
  279     private String getValue(String url, String regExpKey) {
 
  289         String v = regExpKey;
 
  291         if (regExpKey.contains(
"\\?")) {
 
  292             v = regExpKey.replace(
"\\?", 
"?");
 
  294         String[] sp = url.split(v);
 
  295         if (sp.length >= 2) {
 
  296             if (sp[sp.length - 1].contains(
"&")) {
 
  297                 value = sp[sp.length - 1].split(
"&")[0];
 
  299                 value = sp[sp.length - 1];
 
  305     private void findSearchQueries() {
 
  306         int totalQueries = 0;
 
  309             Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
 
  310                     Arrays.asList(
new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_BOOKMARK), 
new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)),
 
  311                     Arrays.asList(dataSource.getId()));
 
  312             logger.log(Level.INFO, 
"Processing {0} blackboard artifacts.", listArtifacts.size()); 
 
  314             for (BlackboardArtifact artifact : listArtifacts) {
 
  315                 if (context.dataSourceIngestIsCancelled()) {
 
  320                 String searchEngineDomain = 
"";
 
  322                 long last_accessed = -1;
 
  324                 AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
 
  330                 Set<String> searchQueries = 
new HashSet<>();
 
  331                 BlackboardAttribute urlAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
 
  332                 if (urlAttr == null) {
 
  336                 final String urlString = urlAttr.getValueString();
 
  337                 Collection<SearchEngineURLQueryAnalyzer.SearchEngine> possibleSearchEngines = getSearchEngineFromUrl(urlString);
 
  338                 for (SearchEngineURLQueryAnalyzer.SearchEngine se : possibleSearchEngines) {
 
  339                     String query = extractSearchEngineQuery(se, urlString);
 
  341                     if (!query.equals(
"")) {
 
  342                         searchQueries.add(query);
 
  348                 if (searchQueries.isEmpty()) {
 
  353                 BlackboardAttribute browserAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME));
 
  354                 if (browserAttr != null) {
 
  355                     browser = browserAttr.getValueString();
 
  357                 BlackboardAttribute domainAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
 
  358                 if (domainAttr != null) {
 
  359                     searchEngineDomain = domainAttr.getValueString();
 
  361                 BlackboardAttribute lastAccessAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED));
 
  362                 if (lastAccessAttr != null) {
 
  363                     last_accessed = lastAccessAttr.getValueLong();
 
  367                 for (String query : searchQueries) {
 
  369                     if (last_accessed == -1) {
 
  372                     Collection<BlackboardAttribute> bbattributes = 
new ArrayList<>();
 
  373                     bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
 
  374                             NbBundle.getMessage(
this.getClass(),
 
  375                                     "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
 
  376                     bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
 
  377                             NbBundle.getMessage(
this.getClass(),
 
  378                                     "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
 
  379                     bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
 
  380                             NbBundle.getMessage(
this.getClass(),
 
  381                                     "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
 
  382                     bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
 
  383                             NbBundle.getMessage(
this.getClass(),
 
  384                                     "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
 
  385                     postArtifact(createArtifactWithAttributes(BlackboardArtifact.Type.TSK_WEB_SEARCH_QUERY, file, bbattributes));
 
  389         } 
catch (TskCoreException e) {
 
  390             logger.log(Level.SEVERE, 
"Encountered error retrieving artifacts for search engine queries", e); 
 
  392             if (context.dataSourceIngestIsCancelled()) {
 
  393                 logger.info(
"Operation terminated by user."); 
 
  395             logger.log(Level.INFO, 
"Extracted {0} queries from the blackboard", totalQueries); 
 
  399     private String getTotals() {
 
  401         if (engines == null) {
 
  404         for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
 
  405             total += se.getEngineName() + 
" : " + se.getTotal() + 
"\n";
 
  411     public void process(Content dataSource, DataSourceIngestModuleProgress progressBar) {
 
  412         this.dataSource = dataSource;
 
  414         progressBar.progress(Bundle.Progress_Message_Find_Search_Query());
 
  415         this.findSearchQueries();
 
  416         logger.log(Level.INFO, 
"Search Engine stats: \n{0}", getTotals()); 
 
  420     void startUp() throws IngestModuleException {
 
  422             PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, 
true);
 
  423         } 
catch (IOException e) {
 
  424             String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
 
  425             logger.log(Level.SEVERE, message, e);
 
  426             throw new IngestModuleException(message, e);
 
final String domainSubstring
 
final List< KeyPair > keyPairs
 
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)
 
final Pattern domainRegexPattern