Autopsy 4.10.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
SearchEngineURLQueryAnalyzer.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012-2014 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.recentactivity;
20 
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.UnsupportedEncodingException;
24 import java.net.URLDecoder;
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.List;
28 import java.util.logging.Level;
29 import javax.xml.parsers.DocumentBuilder;
30 import javax.xml.parsers.DocumentBuilderFactory;
31 import javax.xml.parsers.ParserConfigurationException;
32 import org.openide.util.NbBundle;
41 import org.sleuthkit.datamodel.AbstractFile;
42 import org.sleuthkit.datamodel.BlackboardArtifact;
43 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
44 import org.sleuthkit.datamodel.BlackboardAttribute;
45 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
46 import org.sleuthkit.datamodel.Content;
47 import org.sleuthkit.datamodel.TskCoreException;
48 import org.w3c.dom.Document;
49 import org.w3c.dom.NamedNodeMap;
50 import org.w3c.dom.NodeList;
51 import org.xml.sax.SAXException;
52 
62 @NbBundle.Messages({
63  "cannotBuildXmlParser=Unable to build XML parser: ",
64  "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
65  "cannotParseXml=Unable to parse XML file: ",
66  "# {0} - file name", "SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}.",
67  "Progress_Message_Find_Search_Query=Find Search Queries"
68 })
69 class SearchEngineURLQueryAnalyzer extends Extract {
70 
71  private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
72  private static final String XMLFILE = "SEUQAMappings.xml"; //NON-NLS
73  private static final String XSDFILE = "SearchEngineSchema.xsd"; //NON-NLS
74  private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
75 
76  private Content dataSource;
77  private IngestJobContext context;
78 
79  SearchEngineURLQueryAnalyzer() {
80  moduleName = NbBundle.getMessage(ExtractIE.class, "SearchEngineURLQueryAnalyzer.moduleName.text");
81  }
82 
87  private static class KeyPair {
88 
89  private final String key;
90  private final String keyRegExp;
91 
92  KeyPair(String key, String keyRegExp) {
93  this.key = key;
94  this.keyRegExp = keyRegExp;
95  }
96 
97  String getKey() {
98  return key;
99  }
100 
101  String getKeyRegExp() {
102  return keyRegExp;
103  }
104 
105  }
106 
107  private static class SearchEngine {
108 
109  private final String engineName;
110  private final String domainSubstring;
111  private final List<KeyPair> keyPairs;
112  private int count;
113 
114  SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
115  this.engineName = engineName;
116  this.domainSubstring = domainSubstring;
117  this.keyPairs = keyPairs;
118  count = 0;
119  }
120 
121  void increment() {
122  ++count;
123  }
124 
125  String getEngineName() {
126  return engineName;
127  }
128 
129  String getDomainSubstring() {
130  return domainSubstring;
131  }
132 
133  int getTotal() {
134  return count;
135  }
136 
142  List<KeyPair> getKeys() {
143  return this.keyPairs;
144  }
145 
146  @Override
147  public String toString() {
148  String split = " ";
149  for (KeyPair kp : keyPairs) {
150  split = split + "[ " + kp.getKey() + " :: " + kp.getKeyRegExp() + " ]" + ", ";
151  }
152  return NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.toString",
153  engineName, domainSubstring, count, split);
154  }
155  }
156 
157  private void loadConfigFile() throws IngestModuleException {
158  Document xmlinput;
159  try {
160  String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
161  File f = new File(path);
162  logger.log(Level.INFO, "Load successful"); //NON-NLS
163  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
164  DocumentBuilder db = dbf.newDocumentBuilder();
165  xmlinput = db.parse(f);
166 
167  if (!XMLUtil.xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
168  logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
169  }
170 
171  } catch (IOException e) {
172  throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e); //NON-NLS
173  } catch (ParserConfigurationException pce) {
174  throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce); //NON-NLS
175  } catch (SAXException sxe) {
176  throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe); //NON-NLS
177  }
178 
179  NodeList nlist = xmlinput.getElementsByTagName("SearchEngine"); //NON-NLS
180  SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
181  for (int i = 0; i < nlist.getLength(); i++) {
182  NamedNodeMap nnm = nlist.item(i).getAttributes();
183 
184  String EngineName = nnm.getNamedItem("engine").getNodeValue(); //NON-NLS
185  String EnginedomainSubstring = nnm.getNamedItem("domainSubstring").getNodeValue(); //NON-NLS
186  List<KeyPair> keys = new ArrayList<>();
187 
188  NodeList listSplits = xmlinput.getElementsByTagName("splitToken"); //NON-NLS
189  for (int k = 0; k < listSplits.getLength(); k++) {
190  if (listSplits.item(k).getParentNode().getAttributes().getNamedItem("engine").getNodeValue().equals(EngineName)) { //NON-NLS
191  keys.add(new KeyPair(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue())); //NON-NLS
192  }
193  }
194 
195  SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
196  listEngines[i] = Se;
197  }
198  engines = listEngines;
199  }
200 
211  private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
212  if (engines == null) {
213  return null;
214  }
215  for (SearchEngine engine : engines) {
216  if (domain.contains(engine.getDomainSubstring())) {
217  return engine;
218  }
219  }
220  return null;
221  }
222 
230  private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
231  String x = ""; //NON-NLS
232 
233  for (KeyPair kp : eng.getKeys()) {
234  if (url.contains(kp.getKey())) {
235  x = getValue(url, kp.getKeyRegExp());
236  break;
237  }
238  }
239  try { //try to decode the url
240  String decoded = URLDecoder.decode(x, "UTF-8"); //NON-NLS
241  return decoded;
242  } catch (UnsupportedEncodingException exception) { //if it fails, return the encoded string
243  logger.log(Level.FINE, "Error during URL decoding, returning undecoded value:"
244  + "\n\tURL: " + url
245  + "\n\tUndecoded value: " + x
246  + "\n\tEngine name: " + eng.getEngineName()
247  + "\n\tEngine domain: " + eng.getDomainSubstring(), exception); //NON-NLS
248  return x;
249  } catch (IllegalArgumentException exception) { //if it fails, return the encoded string
250  logger.log(Level.SEVERE, "Illegal argument passed to URL decoding, returning undecoded value:"
251  + "\n\tURL: " + url
252  + "\n\tUndecoded value: " + x
253  + "\n\tEngine name: " + eng.getEngineName()
254  + "\n\tEngine domain: " + eng.getDomainSubstring(), exception); //NON-NLS)
255  return x;
256  }
257  }
258 
269  private String getValue(String url, String regExpKey) {
270  /*
271  * NOTE: This doesn't seem like the most wonderful way to do this, but
272  * we have data that has a bunch of bogus URLs. Such as: - Multiple
273  * google "q=" terms, including one after a "#" tag. Google used the
274  * last one - Search/query part of the URL starting with a '#'. Attemps
275  * at more formal approaches of splitting on the "?" and then on "&"
276  * resulting in missing things.
277  */
278  String value = ""; //NON-NLS
279  String v = regExpKey;
280  //Want to determine if string contains a string based on splitkey, but we want to split the string on splitKeyConverted due to regex
281  if (regExpKey.contains("\\?")) {
282  v = regExpKey.replace("\\?", "?");
283  }
284  String[] sp = url.split(v);
285  if (sp.length >= 2) {
286  if (sp[sp.length - 1].contains("&")) {
287  value = sp[sp.length - 1].split("&")[0];
288  } else {
289  value = sp[sp.length - 1];
290  }
291  }
292  return value;
293  }
294 
    /**
     * Scans every TSK_WEB_BOOKMARK and TSK_WEB_HISTORY artifact in the case,
     * matches each artifact's URL against the configured search engines,
     * extracts the query term, and posts a TSK_WEB_SEARCH_QUERY artifact
     * carrying the domain, query text, browser name, and access time.
     * Artifacts belonging to other data sources are skipped, and the loop
     * stops early if the user cancels the ingest job.
     */
    private void findSearchQueries() {
        int totalQueries = 0;
        try {
            //from blackboard_artifacts
            Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts("WHERE (blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID() //NON-NLS
                    + "' OR blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() + "') "); //List of every 'web_history' and 'bookmark' artifact NON-NLS
            logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS

            for (BlackboardArtifact artifact : listArtifacts) {
                if (context.dataSourceIngestIsCancelled()) {
                    break; //User cancelled the process.
                }

                //initializing default attributes
                String query = "";
                String searchEngineDomain = "";
                String browser = "";
                long last_accessed = -1;

                long fileId = artifact.getObjectID();
                boolean isFromSource = tskCase.isFileFromSource(dataSource, fileId);
                if (!isFromSource) {
                    //File was from a different dataSource. Skipping.
                    continue;
                }

                AbstractFile file = tskCase.getAbstractFileById(fileId);
                if (file == null) {
                    continue;
                }

                SearchEngineURLQueryAnalyzer.SearchEngine se = null;
                //from blackboard_attributes
                Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes("WHERE artifact_id = " + artifact.getArtifactID()); //NON-NLS

                // Walk the artifact's attributes, collecting the pieces of the
                // new query artifact. A URL that matches no engine, or that
                // yields no query term, aborts the walk for this artifact.
                for (BlackboardAttribute attribute : listAttributes) {
                    if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
                        final String urlString = attribute.getValueString();
                        se = getSearchEngineFromUrl(urlString);
                        if (se == null) {
                            break;
                        }

                        query = extractSearchEngineQuery(se, attribute.getValueString());
                        if (query.equals("")) //False positive match, artifact was not a query. NON-NLS
                        {
                            break;
                        }

                    } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
                        browser = attribute.getValueString();
                    } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
                        searchEngineDomain = attribute.getValueString();
                    } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()) {
                        last_accessed = attribute.getValueLong();
                    }
                }

                // Only post a new artifact when an engine matched AND a
                // non-empty query was extracted.
                if (se != null && !query.equals("")) { //NON-NLS
                    // If date doesn't exist, change to 0 (instead of 1969)
                    if (last_accessed == -1) {
                        last_accessed = 0;
                    }
                    Collection<BlackboardAttribute> bbattributes = new ArrayList<>();
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
                            NbBundle.getMessage(this.getClass(),
                                    "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
                            NbBundle.getMessage(this.getClass(),
                                    "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
                            NbBundle.getMessage(this.getClass(),
                                    "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
                    bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
                            NbBundle.getMessage(this.getClass(),
                                    "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
                    this.addArtifact(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes);
                    se.increment();
                    ++totalQueries;
                }
            }
        } catch (TskCoreException e) {
            logger.log(Level.SEVERE, "Encountered error retrieving artifacts for search engine queries", e); //NON-NLS
        } finally {
            if (context.dataSourceIngestIsCancelled()) {
                logger.info("Operation terminated by user."); //NON-NLS
            }
            // Fire the module-data event even on error or cancellation so that
            // listeners refresh with whatever was extracted so far.
            IngestServices.getInstance().fireModuleDataEvent(new ModuleDataEvent(
                    NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.parentModuleName.noSpace"),
                    BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY));
            logger.log(Level.INFO, "Extracted {0} queries from the blackboard", totalQueries); //NON-NLS
        }
    }
388 
389  private String getTotals() {
390  String total = "";
391  if (engines == null) {
392  return total;
393  }
394  for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
395  total += se.getEngineName() + " : " + se.getTotal() + "\n";
396  }
397  return total;
398  }
399 
400  @Override
401  public void process(Content dataSource, IngestJobContext context, DataSourceIngestModuleProgress progressBar) {
402  this.dataSource = dataSource;
403  this.context = context;
404 
405  progressBar.progress(Bundle.Progress_Message_Find_Search_Query());
406  this.findSearchQueries();
407  logger.log(Level.INFO, "Search Engine stats: \n{0}", getTotals()); //NON-NLS
408  }
409 
410  @Override
411  void configExtractor() throws IngestModuleException {
412  try {
413  PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, true);
414  } catch (IOException e) {
415  String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
416  logger.log(Level.SEVERE, message, e);
417  throw new IngestModuleException(message, e);
418  }
419  loadConfigFile();
420  }
421 
    @Override
    public void complete() {
        // No per-job state to release; just note completion in the log.
        logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS
    }
426 }
static <T> boolean xmlIsValid(DOMSource xmlfile, Class<T> clazz, String schemaFile)
Definition: XMLUtil.java:177

Copyright © 2012-2018 Basis Technology. Generated on: Fri Mar 22 2019
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.