Autopsy  4.21.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
SearchEngineURLQueryAnalyzer.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012-2021 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.recentactivity;
20 
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.UnsupportedEncodingException;
24 import java.net.URLDecoder;
25 import java.util.Arrays;
26 import java.util.ArrayList;
27 import java.util.Collection;
28 import java.util.HashSet;
29 import java.util.List;
30 import java.util.logging.Level;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
33 import java.util.Set;
34 import javax.xml.parsers.DocumentBuilder;
35 import javax.xml.parsers.DocumentBuilderFactory;
36 import javax.xml.parsers.ParserConfigurationException;
37 import org.openide.util.NbBundle;
44 import org.sleuthkit.datamodel.AbstractFile;
45 import org.sleuthkit.datamodel.BlackboardArtifact;
46 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
47 import org.sleuthkit.datamodel.BlackboardAttribute;
48 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
49 import org.sleuthkit.datamodel.Content;
50 import org.sleuthkit.datamodel.TskCoreException;
51 import org.w3c.dom.Document;
52 import org.w3c.dom.NamedNodeMap;
53 import org.w3c.dom.NodeList;
54 import org.xml.sax.SAXException;
55 
63 @NbBundle.Messages({
64  "cannotBuildXmlParser=Unable to build XML parser: ",
65  "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
66  "cannotParseXml=Unable to parse XML file: ",
67  "# {0} - file name", "SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}.",
68  "Progress_Message_Find_Search_Query=Find Search Queries"
69 })
70 class SearchEngineURLQueryAnalyzer extends Extract {
71 
72  private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
    // Mapping file: startUp() extracts it to the user config directory and
    // loadConfigFile() parses it from there.
73  private static final String XMLFILE = "SEUQAMappings.xml"; //NON-NLS
    // Schema the parsed mapping file is checked against; a failed validation
    // is only logged as a warning, loading continues.
74  private static final String XSDFILE = "SearchEngineSchema.xsd"; //NON-NLS
    // Engine table shared by every instance (static). It stays null until
    // loadConfigFile() assigns it and is replaced wholesale on each load.
    // Each entry carries a mutable hit counter, so totals accumulate on these
    // shared objects until the next load swaps the array.
    // NOTE(review): no synchronization is visible around this field - confirm
    // whether concurrent ingest jobs can reach it.
75  private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
76 
    // Data source currently being analyzed; assigned at the start of process().
77  private Content dataSource;
    // Ingest job context; consulted for cancellation while artifacts are processed.
78  private final IngestJobContext context;
79 
/**
 * Creates the analyzer for one ingest job.
 *
 * The display name handed to the Extract superclass is looked up in the
 * resource bundle under "SearchEngineURLQueryAnalyzer.moduleName.text".
 *
 * @param context The ingest job context. It is forwarded to the superclass
 *                and also kept locally for cancellation checks.
 */
SearchEngineURLQueryAnalyzer(IngestJobContext context) {
    super(
            NbBundle.getMessage(ExtractIE.class, "SearchEngineURLQueryAnalyzer.moduleName.text"),
            context);
    this.context = context;
}
84 
89  private static class KeyPair {
90 
91  private final String key;
92  private final String keyRegExp;
93 
94  KeyPair(String key, String keyRegExp) {
95  this.key = key;
96  this.keyRegExp = keyRegExp;
97  }
98 
99  String getKey() {
100  return key;
101  }
102 
103  String getKeyRegExp() {
104  return keyRegExp;
105  }
106 
107  }
108 
109  private static class SearchEngine {
110 
111  private final String engineName;
112  private final String domainSubstring;
113  private final List<KeyPair> keyPairs;
114  private final Pattern domainRegexPattern;
115  private int count;
116 
117  SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
118  this.engineName = engineName;
119  this.domainSubstring = domainSubstring;
120  domainRegexPattern = Pattern.compile("^(.*[./])?" + domainSubstring + "([./].*)?$");
121  this.keyPairs = keyPairs;
122  count = 0;
123  }
124 
125  void increment() {
126  ++count;
127  }
128 
129  String getEngineName() {
130  return engineName;
131  }
132 
133  String getDomainSubstring() {
134  return domainSubstring;
135  }
136 
137  Pattern getDomainRegexPattern() {
138  return domainRegexPattern;
139  }
140 
141  int getTotal() {
142  return count;
143  }
144 
150  List<KeyPair> getKeys() {
151  return this.keyPairs;
152  }
153 
154  @Override
155  public String toString() {
156  String split = " ";
157  for (KeyPair kp : keyPairs) {
158  split = split + "[ " + kp.getKey() + " :: " + kp.getKeyRegExp() + " ]" + ", ";
159  }
160  return NbBundle.getMessage(this.getClass(), "SearchEngineURLQueryAnalyzer.toString",
161  engineName, domainSubstring, count, split);
162  }
163  }
164 
165  private void loadConfigFile() throws IngestModuleException {
166  Document xmlinput;
167  try {
168  String path = PlatformUtil.getUserConfigDirectory() + File.separator + XMLFILE;
169  File f = new File(path);
170  logger.log(Level.INFO, "Load successful"); //NON-NLS
171  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
172  DocumentBuilder db = dbf.newDocumentBuilder();
173  xmlinput = db.parse(f);
174 
175  if (!XMLUtil.xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
176  logger.log(Level.WARNING, "Error loading Search Engines: could not validate against [" + XSDFILE + "], results may not be accurate."); //NON-NLS
177  }
178 
179  } catch (IOException e) {
180  throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e); //NON-NLS
181  } catch (ParserConfigurationException pce) {
182  throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce); //NON-NLS
183  } catch (SAXException sxe) {
184  throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe); //NON-NLS
185  }
186 
187  NodeList nlist = xmlinput.getElementsByTagName("SearchEngine"); //NON-NLS
188  SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines = new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
189  for (int i = 0; i < nlist.getLength(); i++) {
190  NamedNodeMap nnm = nlist.item(i).getAttributes();
191 
192  String EngineName = nnm.getNamedItem("engine").getNodeValue(); //NON-NLS
193  String EnginedomainSubstring = nnm.getNamedItem("domainSubstring").getNodeValue(); //NON-NLS
194  List<KeyPair> keys = new ArrayList<>();
195 
196  NodeList listSplits = xmlinput.getElementsByTagName("splitToken"); //NON-NLS
197  for (int k = 0; k < listSplits.getLength(); k++) {
198  if (listSplits.item(k).getParentNode().getAttributes().getNamedItem("engine").getNodeValue().equals(EngineName)) { //NON-NLS
199  keys.add(new KeyPair(listSplits.item(k).getAttributes().getNamedItem("plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem("regexToken").getNodeValue())); //NON-NLS
200  }
201  }
202 
203  SearchEngineURLQueryAnalyzer.SearchEngine Se = new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
204  listEngines[i] = Se;
205  }
206  engines = listEngines;
207  }
208 
219  private static Collection<SearchEngineURLQueryAnalyzer.SearchEngine> getSearchEngineFromUrl(String domain) {
220  List<SearchEngineURLQueryAnalyzer.SearchEngine> supportedEngines = new ArrayList<>();
221  if (engines == null) {
222  return supportedEngines;
223  }
224  for (SearchEngine engine : engines) {
225  Matcher matcher = engine.getDomainRegexPattern().matcher(domain);
226  if (matcher.matches()) {
227  supportedEngines.add(engine);
228  }
229  }
230  return supportedEngines;
231  }
232 
240  private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
241  String x = ""; //NON-NLS
242 
243  for (KeyPair kp : eng.getKeys()) {
244  if (url.contains(kp.getKey())) {
245  x = getValue(url, kp.getKeyRegExp());
246  break;
247  }
248  }
249  try { //try to decode the url
250  String decoded = URLDecoder.decode(x.replaceAll("%(?![0-9a-fA-F]{2})", "%25"), "UTF-8"); //NON-NLS
251  return decoded;
252  } catch (UnsupportedEncodingException exception) { //if it fails, return the encoded string
253  logger.log(Level.FINE, "Error during URL decoding, returning undecoded value:"
254  + "\n\tURL: " + url
255  + "\n\tUndecoded value: " + x
256  + "\n\tEngine name: " + eng.getEngineName()
257  + "\n\tEngine domain: " + eng.getDomainSubstring(), exception); //NON-NLS
258  return x;
259  } catch (IllegalArgumentException exception) { //if it fails, return the encoded string
260  logger.log(Level.SEVERE, "Illegal argument passed to URL decoding, returning undecoded value:"
261  + "\n\tURL: " + url
262  + "\n\tUndecoded value: " + x
263  + "\n\tEngine name: " + eng.getEngineName()
264  + "\n\tEngine domain: " + eng.getDomainSubstring(), exception); //NON-NLS)
265  return x;
266  }
267  }
268 
279  private String getValue(String url, String regExpKey) {
280  /*
281  * NOTE: This doesn't seem like the most wonderful way to do this, but
282  * we have data that has a bunch of bogus URLs. Such as: - Multiple
283  * google "q=" terms, including one after a "#" tag. Google used the
284  * last one - Search/query part of the URL starting with a '#'. Attemps
285  * at more formal approaches of splitting on the "?" and then on "&"
286  * resulting in missing things.
287  */
288  String value = ""; //NON-NLS
289  String v = regExpKey;
290  //Want to determine if string contains a string based on splitkey, but we want to split the string on splitKeyConverted due to regex
291  if (regExpKey.contains("\\?")) {
292  v = regExpKey.replace("\\?", "?");
293  }
294  String[] sp = url.split(v);
295  if (sp.length >= 2) {
296  if (sp[sp.length - 1].contains("&")) {
297  value = sp[sp.length - 1].split("&")[0];
298  } else {
299  value = sp[sp.length - 1];
300  }
301  }
302  return value;
303  }
304 
305  private void findSearchQueries() {
306  int totalQueries = 0;
307  try {
308  //from blackboard_artifacts
309  Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
310  Arrays.asList(new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_BOOKMARK), new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)),
311  Arrays.asList(dataSource.getId()));
312  logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
313 
314  for (BlackboardArtifact artifact : listArtifacts) {
315  if (context.dataSourceIngestIsCancelled()) {
316  break; //User cancelled the process.
317  }
318 
319  //initializing default attributes
320  String searchEngineDomain = "";
321  String browser = "";
322  long last_accessed = -1;
323 
324  AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
325  if (file == null) {
326  continue;
327  }
328 
329  // Try search engines on the URL to see if any produce a search string
330  Set<String> searchQueries = new HashSet<>();
331  BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
332  if (urlAttr == null) {
333  continue;
334  }
335 
336  final String urlString = urlAttr.getValueString();
337  Collection<SearchEngineURLQueryAnalyzer.SearchEngine> possibleSearchEngines = getSearchEngineFromUrl(urlString);
338  for (SearchEngineURLQueryAnalyzer.SearchEngine se : possibleSearchEngines) {
339  String query = extractSearchEngineQuery(se, urlString);
340  // If we have a non-empty query string, add it to the list
341  if (!query.equals("")) {
342  searchQueries.add(query);
343  se.increment();
344  }
345  }
346 
347  // If we didn't extract any search queries, go on to the next artifact
348  if (searchQueries.isEmpty()) {
349  continue;
350  }
351 
352  // Extract the rest of the fields needed for the web search artifact
353  BlackboardAttribute browserAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME));
354  if (browserAttr != null) {
355  browser = browserAttr.getValueString();
356  }
357  BlackboardAttribute domainAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
358  if (domainAttr != null) {
359  searchEngineDomain = domainAttr.getValueString();
360  }
361  BlackboardAttribute lastAccessAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED));
362  if (lastAccessAttr != null) {
363  last_accessed = lastAccessAttr.getValueLong();
364  }
365 
366  // Make an artifact for each distinct query
367  for (String query : searchQueries) {
368  // If date doesn't exist, change to 0 (instead of 1969)
369  if (last_accessed == -1) {
370  last_accessed = 0;
371  }
372  Collection<BlackboardAttribute> bbattributes = new ArrayList<>();
373  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
374  NbBundle.getMessage(this.getClass(),
375  "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
376  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
377  NbBundle.getMessage(this.getClass(),
378  "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
379  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
380  NbBundle.getMessage(this.getClass(),
381  "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
382  bbattributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
383  NbBundle.getMessage(this.getClass(),
384  "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
385  postArtifact(createArtifactWithAttributes(BlackboardArtifact.Type.TSK_WEB_SEARCH_QUERY, file, bbattributes));
386  ++totalQueries;
387  }
388  }
389  } catch (TskCoreException e) {
390  logger.log(Level.SEVERE, "Encountered error retrieving artifacts for search engine queries", e); //NON-NLS
391  } finally {
392  if (context.dataSourceIngestIsCancelled()) {
393  logger.info("Operation terminated by user."); //NON-NLS
394  }
395  logger.log(Level.INFO, "Extracted {0} queries from the blackboard", totalQueries); //NON-NLS
396  }
397  }
398 
399  private String getTotals() {
400  String total = "";
401  if (engines == null) {
402  return total;
403  }
404  for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
405  total += se.getEngineName() + " : " + se.getTotal() + "\n";
406  }
407  return total;
408  }
409 
410  @Override
411  public void process(Content dataSource, DataSourceIngestModuleProgress progressBar) {
412  this.dataSource = dataSource;
413 
414  progressBar.progress(Bundle.Progress_Message_Find_Search_Query());
415  this.findSearchQueries();
416  logger.log(Level.INFO, "Search Engine stats: \n{0}", getTotals()); //NON-NLS
417  }
418 
419  @Override
420  void startUp() throws IngestModuleException {
421  try {
422  PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE, true);
423  } catch (IOException e) {
424  String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
425  logger.log(Level.SEVERE, message, e);
426  throw new IngestModuleException(message, e);
427  }
428  loadConfigFile();
429  }
430 
431 }
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)
Definition: XMLUtil.java:213

Copyright © 2012-2022 Basis Technology. Generated on: Tue Feb 6 2024
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.