19 package org.sleuthkit.autopsy.recentactivity;
22 import java.io.IOException;
23 import java.io.UnsupportedEncodingException;
24 import java.net.URLDecoder;
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.List;
28 import java.util.logging.Level;
29 import javax.xml.parsers.DocumentBuilder;
30 import javax.xml.parsers.DocumentBuilderFactory;
31 import javax.xml.parsers.ParserConfigurationException;
32 import org.openide.util.NbBundle;
43 import org.
sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
45 import org.
sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
48 import org.w3c.dom.Document;
49 import org.w3c.dom.NamedNodeMap;
50 import org.w3c.dom.NodeList;
51 import org.xml.sax.SAXException;
63 "cannotBuildXmlParser=Unable to build XML parser: ",
64 "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
65 "cannotParseXml=Unable to parse XML file: ",
66 "# {0} - file name",
"SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}.",
67 "Progress_Message_Find_Search_Query=Find Search Queries"
// Extractor that looks through web bookmark and web history artifacts for
// URLs belonging to configured search engines and posts the extracted search
// terms as TSK_WEB_SEARCH_QUERY artifacts (see findSearchQueries()).
69 class SearchEngineURLQueryAnalyzer extends Extract {
71 private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
// Name of the search engine settings file; it is extracted to the user config
// directory in configExtractor().
72 private static final String XMLFILE =
"SEUQAMappings.xml";
// Name of the schema the parsed settings document is validated against.
73 private static final String XSDFILE =
"SearchEngineSchema.xsd";
// Search engines read from the settings file. Static, so shared by all
// instances; readers check it for null before use.
74 private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
// Both fields are assigned at the start of process().
76 private Content dataSource;
77 private IngestJobContext context;
/**
 * Creates the analyzer and sets its module name from the resource bundle.
 */
79 SearchEngineURLQueryAnalyzer() {
// NOTE(review): the bundle lookup is keyed on ExtractIE.class, not this
// class. Presumably both classes share one package bundle so the key still
// resolves - confirm, and consider using this class for consistency with the
// other NbBundle.getMessage(this.getClass(), ...) calls in this file.
// moduleName is not declared in the visible code (presumably inherited from
// Extract - TODO confirm).
80 moduleName = NbBundle.getMessage(ExtractIE.class,
"SearchEngineURLQueryAnalyzer.moduleName.text");
// Plain-text form of a query token. extractSearchEngineQuery() tests whether
// a URL contains it (via getKey()).
89 private final String
key;
/**
 * Pairs the plain-text form of a token with its regular-expression form.
 * The config loader fills these from the "plainToken" and "regexToken"
 * attributes of a splitToken element.
 *
 * @param key       plain-text token
 * @param keyRegExp regular-expression form of the same token
 */
92 KeyPair(String key, String keyRegExp) {
94 this.keyRegExp = keyRegExp;
/**
 * Accessor for the regular-expression form of the token; callers pass the
 * result to getValue() as the split pattern.
 */
101 String getKeyRegExp() {
/**
 * Describes one search engine from the settings file.
 *
 * @param engineName      value of the element's "engine" attribute
 * @param domainSubstring text looked for inside a URL to recognize the engine
 * @param keyPairs        query tokens of the engine; the list is stored as
 *                        given, not copied
 */
114 SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
115 this.engineName = engineName;
116 this.domainSubstring = domainSubstring;
117 this.keyPairs = keyPairs;
/**
 * Accessor for the engine name (body not visible in this excerpt).
 */
125 String getEngineName() {
/**
 * @return the substring used to recognize this engine in a URL
 */
129 String getDomainSubstring() {
130 return domainSubstring;
/**
 * @return the engine's key pairs; this is the internal list itself, so
 *         changes made by the caller affect this object
 */
142 List<KeyPair> getKeys() {
143 return this.keyPairs;
// Remainder of a string-building routine (its header is not visible in this
// excerpt): each key pair is appended as "[ key :: regexp ], ".
150 split = split +
"[ " + kp.getKey() +
" :: " + kp.getKeyRegExp() +
" ]" +
", ";
// The final text comes from a bundle format string filled with the engine
// name, domain substring, count and the key-pair listing built above.
// NOTE(review): the declaration and meaning of count are not visible here.
152 return NbBundle.getMessage(this.getClass(),
"SearchEngineURLQueryAnalyzer.toString",
153 engineName, domainSubstring, count, split);
// Body of the settings loader (its method header is not visible in this
// excerpt): parses the XML settings file, validates it against the schema and
// fills the static engines array.
161 File f =
new File(path);
// NOTE(review): this message is logged before the file is parsed (db.parse
// follows below), so it does not show that loading actually succeeded.
162 logger.log(Level.INFO,
"Load successful");
// NOTE(review): the factory is used with default settings; no DTD or
// external-entity restrictions are applied in the visible lines.
163 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
164 DocumentBuilder db = dbf.newDocumentBuilder();
165 xmlinput = db.parse(f);
// A failed schema validation is only logged as a warning.
167 if (!
XMLUtil.
xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
168 logger.log(Level.WARNING,
"Error loading Search Engines: could not validate against [" + XSDFILE +
"], results may not be accurate.");
171 }
catch (IOException e) {
// Each failure is rethrown as an IngestModuleException that keeps the
// original exception as its cause.
172 throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e);
173 }
catch (ParserConfigurationException pce) {
174 throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce);
175 }
catch (SAXException sxe) {
176 throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe);
// One array slot is reserved per SearchEngine element in the document.
179 NodeList nlist = xmlinput.getElementsByTagName(
"SearchEngine");
180 SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines =
new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
181 for (
int i = 0; i < nlist.getLength(); i++) {
182 NamedNodeMap nnm = nlist.item(i).getAttributes();
184 String EngineName = nnm.getNamedItem(
"engine").getNodeValue();
185 String EnginedomainSubstring = nnm.getNamedItem(
"domainSubstring").getNodeValue();
186 List<KeyPair> keys =
new ArrayList<>();
// All splitToken elements of the whole document are fetched again for every
// engine and then filtered by the parent's "engine" attribute.
// NOTE(review): the lookup does not depend on i and could be hoisted out of
// the outer loop; engines that share a name would also share tokens.
188 NodeList listSplits = xmlinput.getElementsByTagName(
"splitToken");
189 for (
int k = 0; k < listSplits.getLength(); k++) {
190 if (listSplits.item(k).getParentNode().getAttributes().getNamedItem(
"engine").getNodeValue().equals(EngineName)) {
191 keys.add(
new KeyPair(listSplits.item(k).getAttributes().getNamedItem(
"plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem(
"regexToken").getNodeValue()));
// NOTE(review): the statement that stores Se into listEngines is not visible
// in this excerpt.
195 SearchEngineURLQueryAnalyzer.SearchEngine Se =
new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
// Publish the parsed engines through the static field.
198 engines = listEngines;
/**
 * Looks for a configured search engine whose domain substring occurs in the
 * given text.
 *
 * @param domain text to search; despite the name, findSearchQueries() passes
 *               the complete URL string
 * @return presumably the first matching engine, or null when none matches or
 *         no engines are loaded - the return statements are not visible in
 *         this excerpt, but the caller later checks the result against null
 */
211 private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
// engines stays null until the settings file has been loaded.
212 if (engines == null) {
215 for (SearchEngine engine : engines) {
// Plain substring test, not a host-name comparison.
216 if (domain.contains(engine.getDomainSubstring())) {
/**
 * Pulls the search terms out of a URL for the given search engine.
 *
 * For a key pair whose plain key occurs in the URL, the raw value is taken
 * with getValue() using the pair's regular expression, and the result is then
 * URL-decoded as UTF-8.
 *
 * @param eng search engine whose key pairs are tried against the URL
 * @param url URL string to examine
 * @return the extracted query; per the log messages below, the undecoded
 *         value is returned when decoding fails (the return statements are
 *         not visible in this excerpt)
 */
230 private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
// NOTE(review): whether the loop stops at the first matching key is not
// visible in this excerpt.
233 for (KeyPair kp : eng.getKeys()) {
234 if (url.contains(kp.getKey())) {
235 x = getValue(url, kp.getKeyRegExp());
240 String decoded = URLDecoder.decode(x,
"UTF-8");
242 }
catch (UnsupportedEncodingException exception) {
// Logged at FINE only.
243 logger.log(Level.FINE,
"Error during URL decoding, returning undecoded value:"
245 +
"\n\tUndecoded value: " + x
246 +
"\n\tEngine name: " + eng.getEngineName()
247 +
"\n\tEngine domain: " + eng.getDomainSubstring(), exception);
249 }
catch (IllegalArgumentException exception) {
// URLDecoder.decode throws this for malformed percent-escapes in the value.
250 logger.log(Level.SEVERE,
"Illegal argument passed to URL decoding, returning undecoded value:"
252 +
"\n\tUndecoded value: " + x
253 +
"\n\tEngine name: " + eng.getEngineName()
254 +
"\n\tEngine domain: " + eng.getDomainSubstring(), exception);
/**
 * Extracts the text that follows a key inside a URL.
 *
 * The URL is split on the key pattern; when that gives at least two pieces,
 * the last piece is used, cut off at its first "&" if it contains one.
 *
 * @param url       URL string to take the value from
 * @param regExpKey key in regular-expression form; it is passed to
 *                  String.split after any "\?" in it is replaced with "?"
 * @return presumably the extracted value - the declaration of value and the
 *         return statement are not visible in this excerpt
 */
269 private String getValue(String url, String regExpKey) {
279 String v = regExpKey;
// NOTE(review): String.split interprets its argument as a regular
// expression, and this step removes the backslash in front of '?'. A pattern
// that then starts with a bare '?' is not a valid regex - confirm what form
// the regexToken values in the settings file take.
281 if (regExpKey.contains(
"\\?")) {
282 v = regExpKey.replace(
"\\?",
"?");
284 String[] sp = url.split(v);
// Fewer than two pieces means the key pattern did not split the URL.
285 if (sp.length >= 2) {
// Only the last piece is used, so the last occurrence of the key wins when
// it appears more than once.
286 if (sp[sp.length - 1].contains(
"&")) {
287 value = sp[sp.length - 1].split(
"&")[0];
289 value = sp[sp.length - 1];
/**
 * Goes through the web bookmark and web history artifacts of the current
 * case, extracts search engine queries from their URL attributes and adds a
 * TSK_WEB_SEARCH_QUERY artifact for each query found.
 */
295 private void findSearchQueries() {
296 int totalQueries = 0;
// Fetch every TSK_WEB_BOOKMARK and TSK_WEB_HISTORY artifact using a WHERE
// clause built from the numeric type ids.
299 Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts(
"WHERE (blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID()
300 +
"' OR blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() +
"') ");
301 logger.log(Level.INFO,
"Processing {0} blackboard artifacts.", listArtifacts.size());
303 for (BlackboardArtifact artifact : listArtifacts) {
// Cancellation is checked once per artifact.
304 if (context.dataSourceIngestIsCancelled()) {
310 String searchEngineDomain =
"";
// -1 marks "no access time found yet".
312 long last_accessed = -1;
// The object id of the artifact is used as a file id in the two calls below.
314 long fileId = artifact.getObjectID();
315 boolean isFromSource = tskCase.isFileFromSource(dataSource, fileId);
321 AbstractFile file = tskCase.getAbstractFileById(fileId);
326 SearchEngineURLQueryAnalyzer.SearchEngine se = null;
328 Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes(
"WHERE artifact_id = " + artifact.getArtifactID());
// Collect the values of interest from the attributes of the source artifact.
330 for (BlackboardAttribute attribute : listAttributes) {
// URL attribute: identify the search engine and extract the query.
331 if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
332 final String urlString = attribute.getValueString();
333 se = getSearchEngineFromUrl(urlString);
338 query = extractSearchEngineQuery(se, attribute.getValueString());
339 if (query.equals(
""))
344 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
345 browser = attribute.getValueString();
346 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
347 searchEngineDomain = attribute.getValueString();
348 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()) {
349 last_accessed = attribute.getValueLong();
// A result is only produced when an engine matched and a non-empty query was
// extracted.
353 if (se != null && !query.equals(
"")) {
355 if (last_accessed == -1) {
// The new artifact carries the domain, query text, browser name and access
// time, all attributed to the parent module name from the bundle.
358 Collection<BlackboardAttribute> bbattributes =
new ArrayList<>();
359 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
360 NbBundle.getMessage(
this.getClass(),
361 "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
362 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
363 NbBundle.getMessage(
this.getClass(),
364 "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
365 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
366 NbBundle.getMessage(
this.getClass(),
367 "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
368 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
369 NbBundle.getMessage(
this.getClass(),
370 "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
371 this.addArtifact(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes);
376 }
catch (TskCoreException e) {
// Case database errors are logged and not rethrown in the visible lines.
377 logger.log(Level.SEVERE,
"Encountered error retrieving artifacts for search engine queries", e);
379 if (context.dataSourceIngestIsCancelled()) {
380 logger.info(
"Operation terminated by user.");
// Announce that TSK_WEB_SEARCH_QUERY data may have been added.
382 IngestServices.getInstance().fireModuleDataEvent(
new ModuleDataEvent(
383 NbBundle.getMessage(
this.getClass(),
"SearchEngineURLQueryAnalyzer.parentModuleName.noSpace"),
384 BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY));
// NOTE(review): the statement that increments totalQueries is not visible in
// this excerpt.
385 logger.log(Level.INFO,
"Extracted {0} queries from the blackboard", totalQueries);
/**
 * Builds a text report with one "name : total" line per configured search
 * engine; process() writes it to the log.
 *
 * @return presumably the accumulated report text - the declaration of total
 *         and the return statements are not visible in this excerpt
 */
389 private String getTotals() {
// engines stays null until the settings file has been loaded.
391 if (engines == null) {
394 for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
395 total += se.getEngineName() +
" : " + se.getTotal() +
"\n";
/**
 * Runs the search query analysis for one data source.
 *
 * @param dataSource  data source being ingested; stored for use by
 *                    findSearchQueries()
 * @param context     ingest job context; stored and used for cancellation
 *                    checks
 * @param progressBar progress reporter that receives the status message
 */
401 public void process(Content dataSource, IngestJobContext context, DataSourceIngestModuleProgress progressBar) {
402 this.dataSource = dataSource;
403 this.context = context;
405 progressBar.progress(Bundle.Progress_Message_Find_Search_Query());
406 this.findSearchQueries();
// Per-engine totals are written to the log after the analysis.
407 logger.log(Level.INFO,
"Search Engine stats: \n{0}", getTotals());
/**
 * Copies the search engine settings file from this module's resources to the
 * user configuration directory.
 *
 * @throws IngestModuleException if extracting the resource fails with an
 *                               IOException
 */
411 void configExtractor() throws IngestModuleException {
// NOTE(review): the meaning of the third argument (true) is not visible
// here; presumably it asks to overwrite an existing copy - confirm against
// PlatformUtil.
413 PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE,
true);
414 }
catch (IOException e) {
// Log the failure, then rethrow with the original exception as the cause.
415 String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
416 logger.log(Level.SEVERE, message, e);
417 throw new IngestModuleException(message, e);
/**
 * Logs that this analyzer has finished; the visible code does nothing else.
 */
423 public void complete() {
424 logger.info(
"Search Engine URL Query Analyzer has completed.");
final String domainSubstring
final List< KeyPair > keyPairs
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)