Autopsy  4.14.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
SearchEngineURLQueryAnalyzer.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012-2014 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.recentactivity;
20 
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.UnsupportedEncodingException;
24 import java.net.URLDecoder;
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.List;
28 import java.util.logging.Level;
29 import javax.xml.parsers.DocumentBuilder;
30 import javax.xml.parsers.DocumentBuilderFactory;
31 import javax.xml.parsers.ParserConfigurationException;
32 import org.openide.util.NbBundle;
39 import org.sleuthkit.datamodel.AbstractFile;
40 import org.sleuthkit.datamodel.BlackboardArtifact;
41 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
42 import org.sleuthkit.datamodel.BlackboardAttribute;
43 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
44 import org.sleuthkit.datamodel.Content;
45 import org.sleuthkit.datamodel.TskCoreException;
46 import org.w3c.dom.Document;
47 import org.w3c.dom.NamedNodeMap;
48 import org.w3c.dom.NodeList;
49 import org.xml.sax.SAXException;
50 
60 @NbBundle.Messages({
61  "cannotBuildXmlParser=Unable to build XML parser: ",
62  "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
63  "cannotParseXml=Unable to parse XML file: ",
64  "# {0} - file name", "SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}.",
65  "Progress_Message_Find_Search_Query=Find Search Queries"
66 })
67 class SearchEngineURLQueryAnalyzer extends Extract {
68 
69  private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
70  private static final String XMLFILE = "SEUQAMappings.xml"; //NON-NLS
71  private static final String XSDFILE = "SearchEngineSchema.xsd"; //NON-NLS
72  private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
73 
74  private Content dataSource;
75  private IngestJobContext context;
76 
77  SearchEngineURLQueryAnalyzer() {
78  moduleName = NbBundle.getMessage(ExtractIE.class, "SearchEngineURLQueryAnalyzer.moduleName.text");
79  }
80 
85  private static class KeyPair {
86 
87  private final String key;
88  private final String keyRegExp;
89 
90  KeyPair(String key, String keyRegExp) {
91  this.key = key;
92  this.keyRegExp = keyRegExp;
93  }
94 
95  String getKey() {
96  return key;
97  }
98 
99  String getKeyRegExp() {
100  return keyRegExp;
101  }
102 
103  }
104 
105  private static class SearchEngine {
106 
107  private final String engineName;
108  private final String domainSubstring;
109  private final List<KeyPair> keyPairs;
110  private int count;
111 
112  SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
113  this.engineName = engineName;
114  this.domainSubstring = domainSubstring;
115  this.keyPairs = keyPairs;
116  count = 0;
117  }
118 
119  void increment() {
120  ++count;
121  }
122 
123  String getEngineName() {
124  return engineName;
125  }
126 
127  String getDomainSubstring() {
128  return domainSubstring;
129  }
130 
131  int getTotal() {
132  return count;
133  }
134 
140  List<KeyPair> getKeys() {
141  return this.keyPairs;
142  }
143 
144  @Override
145  public String toString() {
146  String split = " ";
147  for (KeyPair kp : keyPairs) {
148  split = split + "[ " + kp.getKey() + " :: " + kp.getKeyRegExp() + " ]" + ", ";
149  }
150  return NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.toString",
151  engineName, domainSubstring, count, split);
152  }
153  }
154 
155  private void loadConfigFile() throws IngestModuleException {
156  Document xmlinput;
157  try {
158  String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
159  File f = new File(path);
160  logger.log(Level.INFO, "Load successful"); //NON-NLS
161  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
162  DocumentBuilder db = dbf.newDocumentBuilder();
163  xmlinput = db.parse(f);
164 
165  if (!XMLUtil.xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
166  logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
167  }
168 
169  } catch (IOException e) {
170  throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e); //NON-NLS
171  } catch (ParserConfigurationException pce) {
172  throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce); //NON-NLS
173  } catch (SAXException sxe) {
174  throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe); //NON-NLS
175  }
176 
177  NodeList nlist = xmlinput.getElementsByTagName("SearchEngine"); //NON-NLS
178  SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
179  for (int i = 0; i < nlist.getLength(); i++) {
180  NamedNodeMap nnm = nlist.item(i).getAttributes();
181 
182  String EngineName = nnm.getNamedItem("engine").getNodeValue(); //NON-NLS
183  String EnginedomainSubstring = nnm.getNamedItem("domainSubstring").getNodeValue(); //NON-NLS
184  List<KeyPair> keys = new ArrayList<>();
185 
186  NodeList listSplits = xmlinput.getElementsByTagName("splitToken"); //NON-NLS
187  for (int k = 0; k < listSplits.getLength(); k++) {
188  if (listSplits.item(k).getParentNode().getAttributes().getNamedItem("engine").getNodeValue().equals(EngineName)) { //NON-NLS
189  keys.add(new KeyPair(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue())); //NON-NLS
190  }
191  }
192 
193  SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
194  listEngines[i] = Se;
195  }
196  engines = listEngines;
197  }
198 
209  private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
210  if (engines == null) {
211  return null;
212  }
213  for (SearchEngine engine : engines) {
214  if (domain.contains(engine.getDomainSubstring())) {
215  return engine;
216  }
217  }
218  return null;
219  }
220 
228  private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
229  String x = ""; //NON-NLS
230 
231  for (KeyPair kp : eng.getKeys()) {
232  if (url.contains(kp.getKey())) {
233  x = getValue(url, kp.getKeyRegExp());
234  break;
235  }
236  }
237  try { //try to decode the url
238  String decoded = URLDecoder.decode(x, "UTF-8"); //NON-NLS
239  return decoded;
240  } catch (UnsupportedEncodingException exception) { //if it fails, return the encoded string
241  logger.log(Level.FINE, "Error during URL decoding, returning undecoded value:"
242  + "\n\tURL: " + url
243  + "\n\tUndecoded value: " + x
244  + "\n\tEngine name: " + eng.getEngineName()
245  + "\n\tEngine domain: " + eng.getDomainSubstring(), exception); //NON-NLS
246  return x;
247  } catch (IllegalArgumentException exception) { //if it fails, return the encoded string
248  logger.log(Level.SEVERE, "Illegal argument passed to URL decoding, returning undecoded value:"
249  + "\n\tURL: " + url
250  + "\n\tUndecoded value: " + x
251  + "\n\tEngine name: " + eng.getEngineName()
252  + "\n\tEngine domain: " + eng.getDomainSubstring(), exception); //NON-NLS)
253  return x;
254  }
255  }
256 
267  private String getValue(String url, String regExpKey) {
268  /*
269  * NOTE: This doesn't seem like the most wonderful way to do this, but
270  * we have data that has a bunch of bogus URLs. Such as: - Multiple
271  * google "q=" terms, including one after a "#" tag. Google used the
272  * last one - Search/query part of the URL starting with a '#'. Attemps
273  * at more formal approaches of splitting on the "?" and then on "&"
274  * resulting in missing things.
275  */
276  String value = ""; //NON-NLS
277  String v = regExpKey;
278  //Want to determine if string contains a string based on splitkey, but we want to split the string on splitKeyConverted due to regex
279  if (regExpKey.contains("\\?")) {
280  v = regExpKey.replace("\\?", "?");
281  }
282  String[] sp = url.split(v);
283  if (sp.length >= 2) {
284  if (sp[sp.length - 1].contains("&")) {
285  value = sp[sp.length - 1].split("&")[0];
286  } else {
287  value = sp[sp.length - 1];
288  }
289  }
290  return value;
291  }
292 
293  private void findSearchQueries() {
294  int totalQueries = 0;
295  try {
296  //from blackboard_artifacts
297  Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts("WHERE (blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID() //NON-NLS
298  + "' OR blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() + "') "); //List of every 'web_history' and 'bookmark' artifact NON-NLS
299  logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
300 
301  for (BlackboardArtifact artifact : listArtifacts) {
302  if (context.dataSourceIngestIsCancelled()) {
303  break; //User cancelled the process.
304  }
305 
306  //initializing default attributes
307  String query = "";
308  String searchEngineDomain = "";
309  String browser = "";
310  long last_accessed = -1;
311 
312  long fileId = artifact.getObjectID();
313  boolean isFromSource = tskCase.isFileFromSource(dataSource, fileId);
314  if (!isFromSource) {
315  //File was from a different dataSource. Skipping.
316  continue;
317  }
318 
319  AbstractFile file = tskCase.getAbstractFileById(fileId);
320  if (file == null) {
321  continue;
322  }
323 
324  SearchEngineURLQueryAnalyzer.SearchEngine se = null;
325  //from blackboard_attributes
326  Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes("WHERE artifact_id = " + artifact.getArtifactID()); //NON-NLS
327 
328  for (BlackboardAttribute attribute : listAttributes) {
329  if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
330  final String urlString = attribute.getValueString();
331  se = getSearchEngineFromUrl(urlString);
332  if (se == null) {
333  break;
334  }
335 
336  query = extractSearchEngineQuery(se, attribute.getValueString());
337  if (query.equals("")) //False positive match, artifact was not a query. NON-NLS
338  {
339  break;
340  }
341 
342  } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
343  browser = attribute.getValueString();
344  } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
345  searchEngineDomain = attribute.getValueString();
346  } else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()) {
347  last_accessed = attribute.getValueLong();
348  }
349  }
350 
351  if (se != null && !query.equals("")) { //NON-NLS
352  // If date doesn't exist, change to 0 (instead of 1969)
353  if (last_accessed == -1) {
354  last_accessed = 0;
355  }
356  Collection<BlackboardAttribute> bbattributes = new ArrayList<>();
357  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
358  NbBundle.getMessage(this.getClass(),
359  "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
360  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
361  NbBundle.getMessage(this.getClass(),
362  "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
363  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
364  NbBundle.getMessage(this.getClass(),
365  "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
366  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
367  NbBundle.getMessage(this.getClass(),
368  "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
369  postArtifact(createArtifactWithAttributes(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes));
370  se.increment();
371  ++totalQueries;
372  }
373  }
374  } catch (TskCoreException e) {
375  logger.log(Level.SEVERE, "Encountered error retrieving artifacts for search engine queries", e); //NON-NLS
376  } finally {
377  if (context.dataSourceIngestIsCancelled()) {
378  logger.info("Operation terminated by user."); //NON-NLS
379  }
380  logger.log(Level.INFO, "Extracted {0} queries from the blackboard", totalQueries); //NON-NLS
381  }
382  }
383 
384  private String getTotals() {
385  String total = "";
386  if (engines == null) {
387  return total;
388  }
389  for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
390  total += se.getEngineName() + " : " + se.getTotal() + "\n";
391  }
392  return total;
393  }
394 
395  @Override
396  public void process(Content dataSource, IngestJobContext context, DataSourceIngestModuleProgress progressBar) {
397  this.dataSource = dataSource;
398  this.context = context;
399 
400  progressBar.progress(Bundle.Progress_Message_Find_Search_Query());
401  this.findSearchQueries();
402  logger.log(Level.INFO, "Search Engine stats: \n{0}", getTotals()); //NON-NLS
403  }
404 
405  @Override
406  void configExtractor() throws IngestModuleException {
407  try {
408  PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, true);
409  } catch (IOException e) {
410  String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
411  logger.log(Level.SEVERE, message, e);
412  throw new IngestModuleException(message, e);
413  }
414  loadConfigFile();
415  }
416 
417  @Override
418  public void complete() {
419  logger.info("Search Engine URL Query Analyzer has completed."); //NON-NLS
420  }
421 }
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)
Definition: XMLUtil.java:177

Copyright © 2012-2020 Basis Technology. Generated on: Wed Apr 8 2020
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.