19 package org.sleuthkit.autopsy.recentactivity;
22 import java.io.IOException;
23 import java.io.UnsupportedEncodingException;
24 import java.net.URLDecoder;
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.List;
28 import java.util.logging.Level;
29 import javax.xml.parsers.DocumentBuilder;
30 import javax.xml.parsers.DocumentBuilderFactory;
31 import javax.xml.parsers.ParserConfigurationException;
32 import org.openide.util.NbBundle;
41 import org.
sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
43 import org.
sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
46 import org.w3c.dom.Document;
47 import org.w3c.dom.NamedNodeMap;
48 import org.w3c.dom.NodeList;
49 import org.xml.sax.SAXException;
61 "cannotBuildXmlParser=Unable to build XML parser: ",
62 "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
63 "cannotParseXml=Unable to parse XML file: ",
64 "# {0} - file name",
"SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}.",
65 "Progress_Message_Find_Search_Query=Find Search Queries"
/**
 * Extractor that reads web bookmark and web history artifacts, looks for
 * search-engine URLs among them, and posts the search terms it finds as
 * TSK_WEB_SEARCH_QUERY artifacts.
 */
67 class SearchEngineURLQueryAnalyzer extends Extract {
69 private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
// Name of the search-engine mapping file; configExtractor() extracts it to the
// user configuration directory.
70 private static final String XMLFILE =
"SEUQAMappings.xml";
// Schema that the parsed mapping document is validated against.
71 private static final String XSDFILE =
"SearchEngineSchema.xsd";
// Engine definitions built from the mapping file. The field is static, so it is
// shared by every instance; readers in this class check it for null before use.
72 private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
// Per-run state, assigned at the start of process().
74 private Content dataSource;
75 private IngestJobContext context;
/**
 * Creates the analyzer and sets moduleName (presumably a field inherited from
 * Extract; its declaration is not visible here) from the resource bundle.
 */
77 SearchEngineURLQueryAnalyzer() {
// NOTE(review): the lookup is anchored on ExtractIE.class, while the other
// bundle lookups in this class use this.getClass(). Confirm this is intended.
78 moduleName = NbBundle.getMessage(ExtractIE.class,
"SearchEngineURLQueryAnalyzer.moduleName.text");
// Plain-text token; extractSearchEngineQuery() tests a URL for it with
// String.contains().
87 private final String
key;
/**
 * @param key       plain token looked for in a URL (read from the plainToken
 *                  attribute of the mapping file)
 * @param keyRegExp regular-expression form of the same token (read from the
 *                  regexToken attribute); it is later passed to getValue()
 */
90 KeyPair(String key, String keyRegExp) {
92 this.keyRegExp = keyRegExp;
/**
 * @return the regular-expression form of the token
 */
99 String getKeyRegExp() {
/**
 * @param engineName      engine name (the engine attribute in the mapping file)
 * @param domainSubstring text whose presence in a URL identifies this engine
 * @param keyPairs        tokens used to locate the search term inside a URL
 */
112 SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
113 this.engineName = engineName;
114 this.domainSubstring = domainSubstring;
// The list reference is stored as given; no copy is made.
115 this.keyPairs = keyPairs;
123 String getEngineName() {
/**
 * @return the text matched against URLs to recognise this engine
 */
127 String getDomainSubstring() {
128 return domainSubstring;
/**
 * @return the internal key-pair list itself (not a copy)
 */
140 List<KeyPair> getKeys() {
141 return this.keyPairs;
// Appends one "[ key :: regexp ], " entry per key pair to the running text.
148 split = split +
"[ " + kp.getKey() +
" :: " + kp.getKeyRegExp() +
" ]" +
", ";
// The final text comes from a bundle pattern filled with the engine name,
// domain substring, count and the key-pair listing built above.
150 return NbBundle.getMessage(this.getClass(),
"SearchEngineURLQueryAnalyzer.toString",
151 engineName, domainSubstring, count, split);
// Parses the mapping file at the given path into a DOM document and builds the
// engine table from it. (The enclosing method header is not visible in this
// excerpt.)
159 File f =
new File(path);
// NOTE(review): this message is logged before the file is parsed, so it does
// not confirm that loading actually succeeded.
160 logger.log(Level.INFO,
"Load successful");
// NOTE(review): the factory is used with its default settings. If the mapping
// file could ever come from an untrusted location, consider disabling DTDs and
// external entities before parsing.
161 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
162 DocumentBuilder db = dbf.newDocumentBuilder();
163 xmlinput = db.parse(f);
// A schema validation failure is reported as a warning here.
165 if (!
XMLUtil.
xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
166 logger.log(Level.WARNING,
"Error loading Search Engines: could not validate against [" + XSDFILE +
"], results may not be accurate.");
// Each failure below is rethrown as IngestModuleException with a bundle
// message prefix plus the localized detail; the original exception is kept as
// the cause.
169 }
catch (IOException e) {
170 throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e);
171 }
catch (ParserConfigurationException pce) {
172 throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce);
173 }
catch (SAXException sxe) {
174 throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe);
// One array slot is allocated per SearchEngine element in the document.
177 NodeList nlist = xmlinput.getElementsByTagName(
"SearchEngine");
178 SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines =
new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
179 for (
int i = 0; i < nlist.getLength(); i++) {
180 NamedNodeMap nnm = nlist.item(i).getAttributes();
182 String EngineName = nnm.getNamedItem(
"engine").getNodeValue();
183 String EnginedomainSubstring = nnm.getNamedItem(
"domainSubstring").getNodeValue();
184 List<KeyPair> keys =
new ArrayList<>();
// NOTE(review): this document-wide lookup does not depend on i, yet it is
// repeated for every engine; it could be hoisted above the outer loop.
186 NodeList listSplits = xmlinput.getElementsByTagName(
"splitToken");
187 for (
int k = 0; k < listSplits.getLength(); k++) {
// Keep only the splitToken elements whose parent carries this engine name.
188 if (listSplits.item(k).getParentNode().getAttributes().getNamedItem(
"engine").getNodeValue().equals(EngineName)) {
189 keys.add(
new KeyPair(listSplits.item(k).getAttributes().getNamedItem(
"plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem(
"regexToken").getNodeValue()));
193 SearchEngineURLQueryAnalyzer.SearchEngine Se =
new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
// Publish the freshly built table through the shared static field.
196 engines = listEngines;
/**
 * Looks for a configured engine whose domain substring occurs in the given
 * text.
 *
 * @param domain text to test; the visible caller passes the complete TSK_URL
 *               value, not only a host name
 *
 * @return presumably the first matching engine, or null when none matches or
 *         no engines are loaded (the return statements are not visible in
 *         this excerpt; the caller does check the result against null)
 */
209 private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
// The engine table may not have been loaded yet.
210 if (engines == null) {
213 for (SearchEngine engine : engines) {
// Plain substring test, evaluated in table order.
214 if (domain.contains(engine.getDomainSubstring())) {
/**
 * Pulls the search term for the given engine out of a URL and URL-decodes it.
 *
 * @param eng engine whose key pairs are tried against the URL
 * @param url URL to examine
 *
 * @return the search term; according to the log messages below, the undecoded
 *         value is returned when decoding fails
 */
228 private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
231 for (KeyPair kp : eng.getKeys()) {
// The plain token decides whether this pair applies; the regular-expression
// form is what getValue() uses for the actual extraction.
232 if (url.contains(kp.getKey())) {
233 x = getValue(url, kp.getKeyRegExp());
238 String decoded = URLDecoder.decode(x,
"UTF-8");
// An unsupported charset name is logged at FINE only.
240 }
catch (UnsupportedEncodingException exception) {
241 logger.log(Level.FINE,
"Error during URL decoding, returning undecoded value:"
243 +
"\n\tUndecoded value: " + x
244 +
"\n\tEngine name: " + eng.getEngineName()
245 +
"\n\tEngine domain: " + eng.getDomainSubstring(), exception);
// URLDecoder.decode throws IllegalArgumentException for malformed percent
// escapes; that case is logged at SEVERE.
247 }
catch (IllegalArgumentException exception) {
248 logger.log(Level.SEVERE,
"Illegal argument passed to URL decoding, returning undecoded value:"
250 +
"\n\tUndecoded value: " + x
251 +
"\n\tEngine name: " + eng.getEngineName()
252 +
"\n\tEngine domain: " + eng.getDomainSubstring(), exception);
/**
 * Extracts the text that follows the last occurrence of a key in a URL,
 * stopping at the next ampersand if there is one.
 *
 * @param url       URL to split
 * @param regExpKey key in regular-expression form, used as the split pattern
 *
 * @return presumably the extracted value (the declaration of value and the
 *         return statement are not visible in this excerpt)
 */
267 private String getValue(String url, String regExpKey) {
277 String v = regExpKey;
// The literal below is the two-character sequence backslash + question mark;
// every occurrence of it is replaced with a bare question mark.
279 if (regExpKey.contains(
"\\?")) {
280 v = regExpKey.replace(
"\\?",
"?");
// NOTE(review): String.split compiles v as a regular expression. Confirm that
// the configured tokens are still valid patterns after the substitution above.
282 String[] sp = url.split(v);
// Fewer than two pieces means there is nothing to extract. Only the final
// piece is used, so when the key appears several times the last one wins.
283 if (sp.length >= 2) {
// Drop any further parameters that follow the value.
284 if (sp[sp.length - 1].contains(
"&")) {
285 value = sp[sp.length - 1].split(
"&")[0];
287 value = sp[sp.length - 1];
/**
 * Walks the web bookmark and web history artifacts of the current case,
 * extracts search-engine queries from their URL attributes and posts a
 * TSK_WEB_SEARCH_QUERY artifact for each query found.
 */
293 private void findSearchQueries() {
294 int totalQueries = 0;
// Select every artifact whose type is web bookmark or web history. The ids
// concatenated into the clause come from the ARTIFACT_TYPE enum.
297 Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts(
"WHERE (blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID()
298 +
"' OR blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() +
"') ");
299 logger.log(Level.INFO,
"Processing {0} blackboard artifacts.", listArtifacts.size());
301 for (BlackboardArtifact artifact : listArtifacts) {
// Cancellation is checked once per artifact.
302 if (context.dataSourceIngestIsCancelled()) {
308 String searchEngineDomain =
"";
// -1 marks the access time as not yet seen; it is tested further down.
310 long last_accessed = -1;
312 long fileId = artifact.getObjectID();
313 boolean isFromSource = tskCase.isFileFromSource(dataSource, fileId);
// The file that owns the artifact becomes the source of the posted result.
319 AbstractFile file = tskCase.getAbstractFileById(fileId);
324 SearchEngineURLQueryAnalyzer.SearchEngine se = null;
// NOTE(review): one attribute query is issued per artifact.
326 Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes(
"WHERE artifact_id = " + artifact.getArtifactID());
// Collect the pieces needed for the result from the attributes of this
// artifact: URL (engine and query), program name, domain and access time.
328 for (BlackboardAttribute attribute : listAttributes) {
329 if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
330 final String urlString = attribute.getValueString();
331 se = getSearchEngineFromUrl(urlString);
336 query = extractSearchEngineQuery(se, attribute.getValueString());
337 if (query.equals(
""))
342 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
343 browser = attribute.getValueString();
344 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
345 searchEngineDomain = attribute.getValueString();
346 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()) {
347 last_accessed = attribute.getValueLong();
// Only artifacts with a recognised engine and a non-empty query proceed.
351 if (se != null && !query.equals(
"")) {
353 if (last_accessed == -1) {
// Attributes of the new artifact: domain, query text, program name and
// access time, each tagged with the parent module name from the bundle.
356 Collection<BlackboardAttribute> bbattributes =
new ArrayList<>();
357 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
358 NbBundle.getMessage(
this.getClass(),
359 "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
360 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
361 NbBundle.getMessage(
this.getClass(),
362 "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
363 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
364 NbBundle.getMessage(
this.getClass(),
365 "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
366 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
367 NbBundle.getMessage(
this.getClass(),
368 "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
369 postArtifact(createArtifactWithAttributes(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes));
// A case database failure is logged at SEVERE.
374 }
catch (TskCoreException e) {
375 logger.log(Level.SEVERE,
"Encountered error retrieving artifacts for search engine queries", e);
377 if (context.dataSourceIngestIsCancelled()) {
378 logger.info(
"Operation terminated by user.");
380 logger.log(Level.INFO,
"Extracted {0} queries from the blackboard", totalQueries);
/**
 * Builds a text report with one line per configured engine, each holding the
 * engine name and the value of its getTotal().
 *
 * @return presumably the assembled report (the return statement is not
 *         visible in this excerpt)
 */
384 private String getTotals() {
// The engine table may not have been loaded.
386 if (engines == null) {
389 for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
// Appends a line of the form: name, space-colon-space, total, newline.
390 total += se.getEngineName() +
" : " + se.getTotal() +
"\n";
/**
 * Runs the analysis for one data source: stores the arguments for use by
 * findSearchQueries(), updates the progress text, performs the search and
 * logs the per-engine totals.
 *
 * @param dataSource  data source being analysed
 * @param context     ingest job context, consulted for cancellation
 * @param progressBar progress reporter that receives the status message
 */
396 public void process(Content dataSource, IngestJobContext context, DataSourceIngestModuleProgress progressBar) {
397 this.dataSource = dataSource;
398 this.context = context;
400 progressBar.progress(Bundle.Progress_Message_Find_Search_Query());
401 this.findSearchQueries();
402 logger.log(Level.INFO,
"Search Engine stats: \n{0}", getTotals());
/**
 * Extracts the bundled mapping file to the user configuration directory.
 *
 * @throws IngestModuleException if the extraction fails with an IOException;
 *                               the original exception is kept as the cause
 */
406 void configExtractor() throws IngestModuleException {
// NOTE(review): the final argument is presumably an overwrite flag; confirm
// against PlatformUtil.
408 PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE,
true);
409 }
catch (IOException e) {
// The same bundle message is both logged and carried by the exception.
410 String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
411 logger.log(Level.SEVERE, message, e);
412 throw new IngestModuleException(message, e);
/**
 * Logs that the analyzer has completed.
 */
418 public void complete() {
419 logger.info(
"Search Engine URL Query Analyzer has completed.");
final String domainSubstring
final List< KeyPair > keyPairs
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)