19 package org.sleuthkit.autopsy.recentactivity;
22 import java.io.IOException;
23 import java.io.UnsupportedEncodingException;
24 import java.net.URLDecoder;
25 import java.util.Arrays;
26 import java.util.ArrayList;
27 import java.util.Collection;
28 import java.util.HashSet;
29 import java.util.List;
30 import java.util.logging.Level;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
34 import javax.xml.parsers.DocumentBuilder;
35 import javax.xml.parsers.DocumentBuilderFactory;
36 import javax.xml.parsers.ParserConfigurationException;
37 import org.openide.util.NbBundle;
46 import org.
sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
48 import org.
sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
51 import org.w3c.dom.Document;
52 import org.w3c.dom.NamedNodeMap;
53 import org.w3c.dom.NodeList;
54 import org.xml.sax.SAXException;
66 "cannotBuildXmlParser=Unable to build XML parser: ",
67 "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
68 "cannotParseXml=Unable to parse XML file: ",
69 "# {0} - file name",
"SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}.",
70 "Progress_Message_Find_Search_Query=Find Search Queries"
72 class SearchEngineURLQueryAnalyzer extends Extract {
// Class-wide logger, named after this class.
74 private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
// Name of the search-engine mappings settings file; configExtractor() passes it
// to PlatformUtil.extractResourceToUserConfigDir.
75 private static final String XMLFILE =
"SEUQAMappings.xml";
// Name of the schema handed to XMLUtil.xmlIsValid when the parsed settings document is checked.
76 private static final String XSDFILE =
"SearchEngineSchema.xsd";
// Engine definitions built from the settings file. Static, so shared by all instances.
// Readers (getSearchEngineFromUrl, getTotals) check for null before using it.
77 private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
// Data source and ingest job context of the current run; both are assigned in process().
79 private Content dataSource;
80 private IngestJobContext context;
/**
 * Creates the analyzer and hands the localized module name to the Extract
 * superclass constructor.
 */
82 SearchEngineURLQueryAnalyzer() {
// NOTE(review): the bundle lookup is anchored on ExtractIE.class rather than this
// class; presumably both resolve to the same bundle -- confirm.
83 super(NbBundle.getMessage(ExtractIE.class,
"SearchEngineURLQueryAnalyzer.moduleName.text"));
// Plain-text form of the query key. extractSearchEngineQuery() tests whether a
// URL contains it (via getKey()) before attempting extraction.
92 private final String
key;
/**
 * Pairs a plain-text query key with its regular-expression form.
 *
 * @param key       plain-text key (the config loader passes the "plainToken" attribute value)
 * @param keyRegExp regular-expression key (the config loader passes the "regexToken" attribute value)
 */
95 KeyPair(String key, String keyRegExp) {
97 this.keyRegExp = keyRegExp;
// Accessor for the regular-expression form; its body is not visible in this excerpt.
104 String getKeyRegExp() {
/**
 * Builds a search engine definition and precompiles its domain-matching pattern.
 *
 * @param engineName      name of the engine
 * @param domainSubstring text that identifies the engine inside a URL
 * @param keyPairs        query keys used to pull the search text out of a URL
 */
118 SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
119 this.engineName = engineName;
120 this.domainSubstring = domainSubstring;
// The pattern must cover the whole input: an optional prefix ending in '.' or '/',
// then domainSubstring, then an optional suffix starting with '.' or '/'.
// domainSubstring is concatenated as-is (not quoted), so any regex metacharacters
// it contains are interpreted as regex syntax.
121 domainRegexPattern = Pattern.compile(
"^(.*[./])?" + domainSubstring +
"([./].*)?$");
122 this.keyPairs = keyPairs;
// Accessor for the engine name; the body is not visible in this excerpt.
// Used when building log messages and the totals report.
130 String getEngineName() {
/**
 * @return the domain substring this engine was constructed with
 */
134 String getDomainSubstring() {
135 return domainSubstring;
/**
 * @return the pattern compiled in the constructor from the domain substring
 */
138 Pattern getDomainRegexPattern() {
139 return domainRegexPattern;
/**
 * @return the key pairs of this engine; this is the same list instance that
 *         was passed to the constructor, not a copy
 */
151 List<KeyPair> getKeys() {
152 return this.keyPairs;
// Appends one "[ key :: regexp ], " entry per key pair to the accumulated text.
// The enclosing loop and the method header are not visible in this excerpt.
159 split = split +
"[ " + kp.getKey() +
" :: " + kp.getKeyRegExp() +
" ]" +
", ";
// Formats the localized description from the engine name, domain substring,
// count, and the key-pair text built above.
// NOTE(review): the meaning of `count` is not visible here -- confirm.
161 return NbBundle.getMessage(this.getClass(),
"SearchEngineURLQueryAnalyzer.toString",
162 engineName, domainSubstring, count, split);
// Body of the settings loader; the method header and the declaration of `path`
// are not visible in this excerpt.
170 File f =
new File(path);
// NOTE(review): this message is emitted before the file has been parsed or
// validated, so it does not actually indicate a successful load.
171 logger.log(Level.INFO,
"Load successful");
172 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
173 DocumentBuilder db = dbf.newDocumentBuilder();
174 xmlinput = db.parse(f);
// A schema validation failure is only logged as a warning here.
176 if (!
XMLUtil.
xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
177 logger.log(Level.WARNING,
"Error loading Search Engines: could not validate against [" + XSDFILE +
"], results may not be accurate.");
// Read, parser-setup, and parse failures are each rethrown as an
// IngestModuleException with a localized prefix and the original cause attached.
180 }
catch (IOException e) {
181 throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e);
182 }
catch (ParserConfigurationException pce) {
183 throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce);
184 }
catch (SAXException sxe) {
185 throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe);
// One array slot per "SearchEngine" element in the document.
188 NodeList nlist = xmlinput.getElementsByTagName(
"SearchEngine");
189 SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines =
new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
190 for (
int i = 0; i < nlist.getLength(); i++) {
191 NamedNodeMap nnm = nlist.item(i).getAttributes();
193 String EngineName = nnm.getNamedItem(
"engine").getNodeValue();
194 String EnginedomainSubstring = nnm.getNamedItem(
"domainSubstring").getNodeValue();
195 List<KeyPair> keys =
new ArrayList<>();
// Every "splitToken" element of the whole document is scanned for each engine;
// a token is kept when its parent's "engine" attribute equals this engine's name.
197 NodeList listSplits = xmlinput.getElementsByTagName(
"splitToken");
198 for (
int k = 0; k < listSplits.getLength(); k++) {
199 if (listSplits.item(k).getParentNode().getAttributes().getNamedItem(
"engine").getNodeValue().equals(EngineName)) {
200 keys.add(
new KeyPair(listSplits.item(k).getAttributes().getNamedItem(
"plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem(
"regexToken").getNodeValue()));
// The line that stores Se into listEngines is not visible in this excerpt.
204 SearchEngineURLQueryAnalyzer.SearchEngine Se =
new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
// Publish the parsed definitions in the shared static field.
207 engines = listEngines;
/**
 * Collects every configured engine whose domain pattern matches the given text
 * in full (Matcher.matches()).
 *
 * @param domain text to test; the visible caller (findSearchQueries) passes the
 *               complete URL string of the artifact
 *
 * @return the matching engines, or an empty list when nothing matches or no
 *         engine definitions have been loaded
 */
219 private static Collection<SearchEngineURLQueryAnalyzer.SearchEngine> getSearchEngineFromUrl(String domain) {
220 List<SearchEngineURLQueryAnalyzer.SearchEngine> supportedEngines =
new ArrayList<>();
// No definitions loaded: return the empty list.
221 if (engines == null) {
222 return supportedEngines;
224 for (SearchEngine engine : engines) {
225 Matcher matcher = engine.getDomainRegexPattern().matcher(domain);
226 if (matcher.matches()) {
227 supportedEngines.add(engine);
230 return supportedEngines;
/**
 * Pulls the search text for the given engine out of a URL and URL-decodes it.
 *
 * @param eng engine whose key pairs are tried against the URL
 * @param url URL to examine
 */
240 private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
// The plain key is used for the cheap containment test; the regex form of the
// same key is what getValue() splits on. Whether the loop stops at the first
// hit is not visible in this excerpt.
243 for (KeyPair kp : eng.getKeys()) {
244 if (url.contains(kp.getKey())) {
245 x = getValue(url, kp.getKeyRegExp());
// A '%' that is not followed by two hex digits is rewritten to "%25" before the
// value is decoded as UTF-8.
250 String decoded = URLDecoder.decode(x.replaceAll(
"%(?![0-9a-fA-F]{2})",
"%25"),
"UTF-8");
// Both failure paths only log; per their messages the undecoded value is returned.
252 }
catch (UnsupportedEncodingException exception) {
253 logger.log(Level.FINE,
"Error during URL decoding, returning undecoded value:"
255 +
"\n\tUndecoded value: " + x
256 +
"\n\tEngine name: " + eng.getEngineName()
257 +
"\n\tEngine domain: " + eng.getDomainSubstring(), exception);
259 }
catch (IllegalArgumentException exception) {
260 logger.log(Level.SEVERE,
"Illegal argument passed to URL decoding, returning undecoded value:"
262 +
"\n\tUndecoded value: " + x
263 +
"\n\tEngine name: " + eng.getEngineName()
264 +
"\n\tEngine domain: " + eng.getDomainSubstring(), exception);
/**
 * Splits the URL on the given key and takes the text after the last
 * occurrence, cut off at the first '&' if one is present.
 *
 * @param url       URL to split
 * @param regExpKey key used as the split expression
 */
279 private String getValue(String url, String regExpKey) {
289 String v = regExpKey;
// An escaped question mark (backslash followed by '?') in the key is replaced by a bare '?'.
// NOTE(review): String.split() interprets its argument as a regular expression,
// where a bare leading '?' is a quantifier with nothing to quantify -- verify
// this against the actual regexToken values in the settings file.
291 if (regExpKey.contains(
"\\?")) {
292 v = regExpKey.replace(
"\\?",
"?");
294 String[] sp = url.split(v);
// Fewer than two pieces means the key did not split the URL; `value` is then
// not assigned by the visible lines.
295 if (sp.length >= 2) {
// Only the last piece is used, i.e. the text after the final occurrence of the key.
296 if (sp[sp.length - 1].contains(
"&")) {
297 value = sp[sp.length - 1].split(
"&")[0];
299 value = sp[sp.length - 1];
/**
 * Scans the web bookmark and web history artifacts of the current data source
 * for search-engine URLs and posts a TSK_WEB_SEARCH_QUERY artifact for each
 * search string found.
 */
305 private void findSearchQueries() {
// Running count reported in the final log message; the line that increments it
// is not visible in this excerpt.
306 int totalQueries = 0;
// Fetch only bookmark and history artifacts that belong to this data source.
309 Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
310 Arrays.asList(
new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_BOOKMARK),
new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)),
311 Arrays.asList(dataSource.getId()));
312 logger.log(Level.INFO,
"Processing {0} blackboard artifacts.", listArtifacts.size());
314 for (BlackboardArtifact artifact : listArtifacts) {
// Cancellation is checked once per artifact.
315 if (context.dataSourceIngestIsCancelled()) {
// Defaults used when the artifact lacks the corresponding attribute.
320 String searchEngineDomain =
"";
322 long last_accessed = -1;
// Source file of the artifact; it is passed along when the new artifact is created.
324 AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
// A set, so the same query text found by several engines is recorded once.
330 Set<String> searchQueries =
new HashSet<>();
331 BlackboardAttribute urlAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
// Artifacts without a URL attribute get special handling; the branch body is
// not visible in this excerpt.
332 if (urlAttr == null) {
// Try every engine whose domain pattern matches the URL and keep the non-empty results.
336 final String urlString = urlAttr.getValueString();
337 Collection<SearchEngineURLQueryAnalyzer.SearchEngine> possibleSearchEngines = getSearchEngineFromUrl(urlString);
338 for (SearchEngineURLQueryAnalyzer.SearchEngine se : possibleSearchEngines) {
339 String query = extractSearchEngineQuery(se, urlString);
341 if ( !query.equals(
"")) {
342 searchQueries.add(query);
348 if (searchQueries.isEmpty()) {
// Copy browser name, domain, and last-access time from the source artifact when present.
353 BlackboardAttribute browserAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME));
354 if (browserAttr != null) {
355 browser = browserAttr.getValueString();
357 BlackboardAttribute domainAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
358 if (domainAttr != null) {
359 searchEngineDomain = domainAttr.getValueString();
361 BlackboardAttribute lastAccessAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED));
362 if (lastAccessAttr != null) {
363 last_accessed = lastAccessAttr.getValueLong();
// One search-query artifact is posted per distinct query string.
367 for (String query : searchQueries) {
// -1 is the "no access time found" default set above; the branch body is not
// visible in this excerpt.
369 if (last_accessed == -1) {
372 Collection<BlackboardAttribute> bbattributes =
new ArrayList<>();
373 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
374 NbBundle.getMessage(
this.getClass(),
375 "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
376 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
377 NbBundle.getMessage(
this.getClass(),
378 "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
379 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
380 NbBundle.getMessage(
this.getClass(),
381 "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
382 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
383 NbBundle.getMessage(
this.getClass(),
384 "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
385 postArtifact(createArtifactWithAttributes(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes));
// Case database failures are logged here rather than propagated.
389 }
catch (TskCoreException e) {
390 logger.log(Level.SEVERE,
"Encountered error retrieving artifacts for search engine queries", e);
392 if (context.dataSourceIngestIsCancelled()) {
393 logger.info(
"Operation terminated by user.");
395 logger.log(Level.INFO,
"Extracted {0} queries from the blackboard", totalQueries);
/**
 * Builds a text report with one "name : total" line per configured engine.
 */
399 private String getTotals() {
// Handling for the no-definitions case; the branch body is not visible in this excerpt.
401 if (engines == null) {
404 for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
405 total += se.getEngineName() +
" : " + se.getTotal() +
"\n";
/**
 * Runs the search-query analysis for one data source.
 *
 * @param dataSource  data source to analyze; stored for use by findSearchQueries()
 * @param context     ingest job context; stored and consulted for cancellation
 * @param progressBar progress reporter, updated once before the scan starts
 */
411 public void process(Content dataSource, IngestJobContext context, DataSourceIngestModuleProgress progressBar) {
412 this.dataSource = dataSource;
413 this.context = context;
415 progressBar.progress(Bundle.Progress_Message_Find_Search_Query());
416 this.findSearchQueries();
// Log the per-engine totals once the scan has finished.
417 logger.log(Level.INFO,
"Search Engine stats: \n{0}", getTotals());
/**
 * Copies the bundled settings file into the user configuration directory.
 * Any steps after the extraction are not visible in this excerpt.
 *
 * @throws IngestModuleException if the settings file cannot be extracted
 */
421 void configExtractor() throws IngestModuleException {
// NOTE(review): the trailing `true` presumably requests overwriting an existing
// copy -- confirm against PlatformUtil.extractResourceToUserConfigDir.
423 PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE,
true);
// Log the failure, then rethrow it with the localized message and the original cause.
424 }
catch (IOException e) {
425 String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
426 logger.log(Level.SEVERE, message, e);
427 throw new IngestModuleException(message, e);
/**
 * Logs that the analyzer has finished; no other work is visible in this excerpt.
 */
433 public void complete() {
434 logger.info(
"Search Engine URL Query Analyzer has completed.");
final String domainSubstring
final List< KeyPair > keyPairs
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)
final Pattern domainRegexPattern