Autopsy  4.7.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
SearchEngineURLQueryAnalyzer.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012-2014 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.recentactivity;
20 
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.UnsupportedEncodingException;
24 import java.net.URLDecoder;
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.List;
28 import java.util.logging.Level;
29 import javax.xml.parsers.DocumentBuilder;
30 import javax.xml.parsers.DocumentBuilderFactory;
31 import javax.xml.parsers.ParserConfigurationException;
32 import org.openide.util.NbBundle;
40 import org.sleuthkit.datamodel.AbstractFile;
41 import org.sleuthkit.datamodel.BlackboardArtifact;
42 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
43 import org.sleuthkit.datamodel.BlackboardAttribute;
44 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
45 import org.sleuthkit.datamodel.Content;
46 import org.sleuthkit.datamodel.TskCoreException;
47 import org.w3c.dom.Document;
48 import org.w3c.dom.NamedNodeMap;
49 import org.w3c.dom.NodeList;
50 import org.xml.sax.SAXException;
51 
61 @NbBundle.Messages({
62  "cannotBuildXmlParser=Unable to build XML parser: ",
63  "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
64  "cannotParseXml=Unable to parse XML file: ",
65  "# {0} - file name", "SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}."
66 })
67 class SearchEngineURLQueryAnalyzer extends Extract {
68 
69  private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
70  private static final String XMLFILE = "SEUQAMappings.xml"; //NON-NLS
71  private static final String XSDFILE = "SearchEngineSchema.xsd"; //NON-NLS
72  private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
73 
74  private Content dataSource;
75  private IngestJobContext context;
76 
77  SearchEngineURLQueryAnalyzer() {
78  moduleName = NbBundle.getMessage(ExtractIE.class, "SearchEngineURLQueryAnalyzer.moduleName.text");
79  }
80 
85  private static class KeyPair {
86 
87  private final String key;
88  private final String keyRegExp;
89 
90  KeyPair(String key, String keyRegExp) {
91  this.key = key;
92  this.keyRegExp = keyRegExp;
93  }
94 
95  String getKey() {
96  return key;
97  }
98 
99  String getKeyRegExp() {
100  return keyRegExp;
101  }
102 
103  }
104 
105  private static class SearchEngine {
106 
107  private final String engineName;
108  private final String domainSubstring;
109  private final List<KeyPair> keyPairs;
110  private int count;
111 
112  SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
113  this.engineName = engineName;
114  this.domainSubstring = domainSubstring;
115  this.keyPairs = keyPairs;
116  count = 0;
117  }
118 
119  void increment() {
120  ++count;
121  }
122 
123  String getEngineName() {
124  return engineName;
125  }
126 
127  String getDomainSubstring() {
128  return domainSubstring;
129  }
130 
131  int getTotal() {
132  return count;
133  }
134 
140  List<KeyPair> getKeys() {
141  return this.keyPairs;
142  }
143 
144  @Override
145  public String toString() {
146  String split = " ";
147  for (KeyPair kp : keyPairs) {
148  split = split + "[ " + kp.getKey() + " :: " + kp.getKeyRegExp() + " ]" + ", ";
149  }
150  return NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.toString",
151  engineName, domainSubstring, count, split);
152  }
153  }
154 
155  private void loadConfigFile() throws IngestModuleException {
156  Document xmlinput;
157  try {
158  String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
159  File f = new File(path);
160  logger.log(Level.INFO, "Load successful"); //NON-NLS
161  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
162  DocumentBuilder db = dbf.newDocumentBuilder();
163  xmlinput = db.parse(f);
164 
165  if (!XMLUtil.xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
166  logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
167  }
168 
169  } catch (IOException e) {
170  throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e); //NON-NLS
171  } catch (ParserConfigurationException pce) {
172  throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce); //NON-NLS
173  } catch (SAXException sxe) {
174  throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe); //NON-NLS
175  }
176 
177  NodeList nlist = xmlinput.getElementsByTagName("SearchEngine"); //NON-NLS
178  SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
179  for (int i = 0; i < nlist.getLength(); i++) {
180  NamedNodeMap nnm = nlist.item(i).getAttributes();
181 
182  String EngineName = nnm.getNamedItem("engine").getNodeValue(); //NON-NLS
183  String EnginedomainSubstring = nnm.getNamedItem("domainSubstring").getNodeValue(); //NON-NLS
184  List<KeyPair> keys = new ArrayList<>();
185 
186  NodeList listSplits = xmlinput.getElementsByTagName("splitToken"); //NON-NLS
187  for (int k = 0; k < listSplits.getLength(); k++) {
188  if (listSplits.item(k).getParentNode().getAttributes().getNamedItem("engine").getNodeValue().equals(EngineName)) { //NON-NLS
189  keys.add(new KeyPair(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue())); //NON-NLS
190  }
191  }
192 
193  SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
194  listEngines[i] = Se;
195  }
196  engines = listEngines;
197  }
198 
209  private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
210  if (engines == null) {
211  return null;
212  }
213  for (SearchEngine engine : engines) {
214  if (domain.contains(engine.getDomainSubstring())) {
215  return engine;
216  }
217  }
218  return null;
219  }
220 
228  private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
229  String x = ""; //NON-NLS
230 
231  for (KeyPair kp : eng.getKeys()) {
232  if (url.contains(kp.getKey())) {
233  x = getValue(url, kp.getKeyRegExp());
234  break;
235  }
236  }
237  try { //try to decode the url
238  String decoded = URLDecoder.decode(x, "UTF-8"); //NON-NLS
239  return decoded;
240  } catch (UnsupportedEncodingException uee) { //if it fails, return the encoded string
241  logger.log(Level.FINE, "Error during URL decoding ", uee); //NON-NLS
242  return x;
243  }
244  }
245 
256  private String getValue(String url, String regExpKey) {
257  /*
258  * NOTE: This doesn't seem like the most wonderful way to do this, but
259  * we have data that has a bunch of bogus URLs. Such as: - Multiple
260  * google "q=" terms, including one after a "#" tag. Google used the
261  * last one - Search/query part of the URL starting with a '#'. Attemps
262  * at more formal approaches of splitting on the "?" and then on "&"
263  * resulting in missing things.
264  */
265  String value = ""; //NON-NLS
266  String v = regExpKey;
267  //Want to determine if string contains a string based on splitkey, but we want to split the string on splitKeyConverted due to regex
268  if (regExpKey.contains("\\?")) {
269  v = regExpKey.replace("\\?", "?");
270  }
271  String[] sp = url.split(v);
272  if (sp.length >= 2) {
273  if (sp[sp.length - 1].contains("&")) {
274  value = sp[sp.length - 1].split("&")[0];
275  } else {
276  value = sp[sp.length - 1];
277  }
278  }
279  return value;
280  }
281 
282  private void findSearchQueries() {
283  int totalQueries = 0;
284  try {
285  //from blackboard_artifacts
286  Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts("WHERE (blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID() //NON-NLS
287  + "' OR blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() + "') "); //List of every 'web_history' and 'bookmark' artifact NON-NLS
288  logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
289 
290  for (BlackboardArtifact artifact : listArtifacts) {
291  if (context.dataSourceIngestIsCancelled()) {
292  break; //User cancelled the process.
293  }
294 
295  //initializing default attributes
296  String query = "";
297  String searchEngineDomain = "";
298  String browser = "";
299  long last_accessed = -1;
300 
301  long fileId = artifact.getObjectID();
302  boolean isFromSource = tskCase.isFileFromSource(dataSource, fileId);
303  if (!isFromSource) {
304  //File was from a different dataSource. Skipping.
305  continue;
306  }
307 
308  AbstractFile file = tskCase.getAbstractFileById(fileId);
309  if (file == null) {
310  continue;
311  }
312 
313  SearchEngineURLQueryAnalyzer.SearchEngine se = null;
314  //from blackboard_attributes
315  Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes("WHERE artifact_id = " + artifact.getArtifactID()); //NON-NLS
316 
317  for (BlackboardAttribute attribute : listAttributes) {
318  if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
319  final String urlString = attribute.getValueString();
320  se = getSearchEngineFromUrl(urlString);
321  if (se == null) {
322  break;
323  }
324 
325  query = extractSearchEngineQuery(se, attribute.getValueString());
326  if (query.equals("")) //False positive match, artifact was not a query. NON-NLS
327  {
328  break;
329  }
330 
331  } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
332  browser = attribute.getValueString();
333  } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
334  searchEngineDomain = attribute.getValueString();
335  } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()) {
336  last_accessed = attribute.getValueLong();
337  }
338  }
339 
340  if (se != null && !query.equals("")) { //NON-NLS
341  // If date doesn't exist, change to 0 (instead of 1969)
342  if (last_accessed == -1) {
343  last_accessed = 0;
344  }
345  Collection<BlackboardAttribute> bbattributes = new ArrayList<>();
346  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
347  NbBundle.getMessage(this.getClass(),
348  "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
349  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
350  NbBundle.getMessage(this.getClass(),
351  "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
352  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
353  NbBundle.getMessage(this.getClass(),
354  "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
355  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
356  NbBundle.getMessage(this.getClass(),
357  "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
358  this.addArtifact(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes);
359  se.increment();
360  ++totalQueries;
361  }
362  }
363  } catch (TskCoreException e) {
364  logger.log(Level.SEVERE, "Encountered error retrieving artifacts for search engine queries", e); //NON-NLS
365  } finally {
366  if (context.dataSourceIngestIsCancelled()) {
367  logger.info("Operation terminated by user."); //NON-NLS
368  }
369  IngestServices.getInstance().fireModuleDataEvent(new ModuleDataEvent(
370  NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.parentModuleName.noSpace"),
371  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY));
372  logger.log(Level.INFO, "Extracted {0} queries from the blackboard", totalQueries); //NON-NLS
373  }
374  }
375 
376  private String getTotals() {
377  String total = "";
378  if (engines == null) {
379  return total;
380  }
381  for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
382  total += se.getEngineName() + " : " + se.getTotal() + "\n";
383  }
384  return total;
385  }
386 
387  @Override
388  public void process(Content dataSource, IngestJobContext context) {
389  this.dataSource = dataSource;
390  this.context = context;
391  this.findSearchQueries();
392  logger.log(Level.INFO, "Search Engine stats: \n{0}", getTotals()); //NON-NLS
393  }
394 
395  @Override
396  void configExtractor() throws IngestModuleException {
397  try {
398  PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, true);
399  } catch (IOException e) {
400  String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
401  logger.log(Level.SEVERE, message, e);
402  throw new IngestModuleException(message, e);
403  }
404  loadConfigFile();
405  }
406 
407  @Override
408  public void complete() {
409  logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS
410  }
411 }
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)
Definition: XMLUtil.java:177

Copyright © 2012-2016 Basis Technology. Generated on: Mon Jun 18 2018
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.