19 package org.sleuthkit.autopsy.recentactivity;
22 import java.io.IOException;
23 import java.io.UnsupportedEncodingException;
24 import java.net.URLDecoder;
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.List;
28 import java.util.logging.Level;
29 import javax.xml.parsers.DocumentBuilder;
30 import javax.xml.parsers.DocumentBuilderFactory;
31 import javax.xml.parsers.ParserConfigurationException;
32 import org.openide.util.NbBundle;
42 import org.
sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
44 import org.
sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
47 import org.w3c.dom.Document;
48 import org.w3c.dom.NamedNodeMap;
49 import org.w3c.dom.NodeList;
50 import org.xml.sax.SAXException;
// --- @Messages bundle keys (fragment; the annotation header line was lost in extraction) ---
62 "cannotBuildXmlParser=Unable to build XML parser: ",
63 "cannotLoadSEUQA=Unable to load Search Engine URL Query Analyzer settings file, SEUQAMappings.xml: ",
64 "cannotParseXml=Unable to parse XML file: ",
65 "# {0} - file name",
"SearchEngineURLQueryAnalyzer.init.exception.msg=Unable to find {0}."
// Extracts search-engine query terms from existing web-history/bookmark blackboard
// artifacts by matching their URLs against engine definitions loaded from an XML file.
67 class SearchEngineURLQueryAnalyzer extends Extract {
69 private static final Logger logger = Logger.getLogger(SearchEngineURLQueryAnalyzer.class.getName());
// Engine-definition config file, parsed at module startup (see the loader below).
70 private static final String XMLFILE =
"SEUQAMappings.xml";
// Schema the config file is validated against before its contents are trusted.
71 private static final String XSDFILE =
"SearchEngineSchema.xsd";
// Shared engine table; stays null until the config file has been loaded successfully.
72 private static SearchEngineURLQueryAnalyzer.SearchEngine[] engines;
// Per-ingest-job state, assigned in process().
74 private Content dataSource;
75 private IngestJobContext context;
77 SearchEngineURLQueryAnalyzer() {
// NOTE(review): the module name is looked up against ExtractIE.class rather than this
// class — presumably both resolve to the same resource bundle; confirm the key's home.
78 moduleName = NbBundle.getMessage(ExtractIE.class,
"SearchEngineURLQueryAnalyzer.moduleName.text");
// KeyPair fragment: pairs the plain URL token (used for fast contains() checks)
// with its regex form (used to split the URL when extracting the query value).
87 private final String
key;
90 KeyPair(String key, String keyRegExp) {
// NOTE(review): the matching "this.key = key;" assignment is not visible here —
// presumably on a line dropped during extraction; verify against the full source.
92 this.keyRegExp = keyRegExp;
99 String getKeyRegExp() {
// SearchEngine fragment: one engine definition — display name, the domain substring
// that identifies its URLs, and the token pairs used to pull the query string out.
112 SearchEngine(String engineName, String domainSubstring, List<KeyPair> keyPairs) {
113 this.engineName = engineName;
114 this.domainSubstring = domainSubstring;
115 this.keyPairs = keyPairs;
123 String getEngineName() {
127 String getDomainSubstring() {
128 return domainSubstring;
// Exposes the live list (no defensive copy) — callers are package-internal.
140 List<KeyPair> getKeys() {
141 return this.keyPairs;
// toString() fragment: accumulates a "[ plainToken :: regexToken ], " listing for
// the localized diagnostic string built just below.
148 split = split +
"[ " + kp.getKey() +
" :: " + kp.getKeyRegExp() +
" ]" +
", ";
150 return NbBundle.getMessage(this.getClass(),
"SearchEngineURLQueryAnalyzer.toString",
151 engineName, domainSubstring, count, split);
// Config-loader fragment: parses SEUQAMappings.xml, validates it against the XSD,
// then builds the static engine table from the <SearchEngine>/<splitToken> elements.
159 File f =
new File(path);
160 logger.log(Level.INFO,
"Load successful");
// NOTE(review): default DocumentBuilderFactory settings leave DTD/external-entity
// processing enabled (XXE). The file is a local module resource, so exposure is
// low, but disabling DTDs via setFeature would be safer — confirm upstream policy.
161 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
162 DocumentBuilder db = dbf.newDocumentBuilder();
163 xmlinput = db.parse(f);
// Schema validation failure is logged but not fatal: parsing continues with a warning.
165 if (!
XMLUtil.
xmlIsValid(xmlinput, SearchEngineURLQueryAnalyzer.class, XSDFILE)) {
166 logger.log(Level.WARNING,
"Error loading Search Engines: could not validate against [" + XSDFILE +
"], results may not be accurate.");
169 }
catch (IOException e) {
170 throw new IngestModuleException(Bundle.cannotLoadSEUQA() + e.getLocalizedMessage(), e);
171 }
catch (ParserConfigurationException pce) {
172 throw new IngestModuleException(Bundle.cannotBuildXmlParser() + pce.getLocalizedMessage(), pce);
173 }
catch (SAXException sxe) {
174 throw new IngestModuleException(Bundle.cannotParseXml() + sxe.getLocalizedMessage(), sxe);
177 NodeList nlist = xmlinput.getElementsByTagName(
"SearchEngine");
178 SearchEngineURLQueryAnalyzer.SearchEngine[] listEngines =
new SearchEngineURLQueryAnalyzer.SearchEngine[nlist.getLength()];
179 for (
int i = 0; i < nlist.getLength(); i++) {
180 NamedNodeMap nnm = nlist.item(i).getAttributes();
182 String EngineName = nnm.getNamedItem(
"engine").getNodeValue();
183 String EnginedomainSubstring = nnm.getNamedItem(
"domainSubstring").getNodeValue();
184 List<KeyPair> keys =
new ArrayList<>();
// Every <splitToken> in the document is rescanned for each engine and matched
// back to its owner by the parent's "engine" attribute (O(engines x tokens),
// fine for a small config file).
186 NodeList listSplits = xmlinput.getElementsByTagName(
"splitToken");
187 for (
int k = 0; k < listSplits.getLength(); k++) {
188 if (listSplits.item(k).getParentNode().getAttributes().getNamedItem(
"engine").getNodeValue().equals(EngineName)) {
189 keys.add(
new KeyPair(listSplits.item(k).getAttributes().getNamedItem(
"plainToken").getNodeValue(), listSplits.item(k).getAttributes().getNamedItem(
"regexToken").getNodeValue()));
193 SearchEngineURLQueryAnalyzer.SearchEngine Se =
new SearchEngineURLQueryAnalyzer.SearchEngine(EngineName, EnginedomainSubstring, keys);
// Publish the completed table to the shared static field.
196 engines = listEngines;
209 private static SearchEngineURLQueryAnalyzer.SearchEngine getSearchEngineFromUrl(String domain) {
210 if (engines == null) {
213 for (SearchEngine engine : engines) {
214 if (domain.contains(engine.getDomainSubstring())) {
// Query-extraction fragment: tries each of the engine's token pairs against the
// URL; when the plain token is present, splits out the trailing value with the
// regex form, then URL-decodes it for human-readable output.
228 private String extractSearchEngineQuery(SearchEngineURLQueryAnalyzer.SearchEngine eng, String url) {
231 for (KeyPair kp : eng.getKeys()) {
232 if (url.contains(kp.getKey())) {
233 x = getValue(url, kp.getKeyRegExp());
// Decode %xx escapes so the recovered search term reads as entered by the user.
238 String decoded = URLDecoder.decode(x,
"UTF-8");
240 }
catch (UnsupportedEncodingException uee) {
// UTF-8 support is mandated by the Java platform, so this path is effectively
// unreachable; logged at FINE just in case.
241 logger.log(Level.FINE,
"Error during URL decoding ", uee);
256 private String getValue(String url, String regExpKey) {
266 String v = regExpKey;
268 if (regExpKey.contains(
"\\?")) {
269 v = regExpKey.replace(
"\\?",
"?");
271 String[] sp = url.split(v);
272 if (sp.length >= 2) {
273 if (sp[sp.length - 1].contains(
"&")) {
274 value = sp[sp.length - 1].split(
"&")[0];
276 value = sp[sp.length - 1];
// Main worker fragment: walks existing TSK_WEB_BOOKMARK / TSK_WEB_HISTORY
// artifacts, recognizes search-engine URLs, extracts the query term, and posts
// a TSK_WEB_SEARCH_QUERY artifact per hit. Many interior lines were dropped in
// extraction; comments below annotate only what is visible.
282 private void findSearchQueries() {
283 int totalQueries = 0;
// Pull both artifact types in a single SQL-filtered query against the case DB.
286 Collection<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getMatchingArtifacts(
"WHERE (blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_BOOKMARK.getTypeID()
287 +
"' OR blackboard_artifacts.artifact_type_id = '" + ARTIFACT_TYPE.TSK_WEB_HISTORY.getTypeID() +
"') ");
288 logger.log(Level.INFO,
"Processing {0} blackboard artifacts.", listArtifacts.size());
290 for (BlackboardArtifact artifact : listArtifacts) {
// Respect user cancellation between artifacts.
291 if (context.dataSourceIngestIsCancelled()) {
297 String searchEngineDomain =
"";
// -1 sentinel: no TSK_DATETIME_ACCESSED attribute seen for this artifact.
299 long last_accessed = -1;
// Only analyze artifacts whose source file belongs to the data source being ingested.
301 long fileId = artifact.getObjectID();
302 boolean isFromSource = tskCase.isFileFromSource(dataSource, fileId);
308 AbstractFile file = tskCase.getAbstractFileById(fileId);
313 SearchEngineURLQueryAnalyzer.SearchEngine se = null;
// Fetch this artifact's attributes and fold them into locals by attribute type.
315 Collection<BlackboardAttribute> listAttributes = currentCase.getSleuthkitCase().getMatchingAttributes(
"WHERE artifact_id = " + artifact.getArtifactID());
317 for (BlackboardAttribute attribute : listAttributes) {
318 if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL.getTypeID()) {
319 final String urlString = attribute.getValueString();
320 se = getSearchEngineFromUrl(urlString);
325 query = extractSearchEngineQuery(se, attribute.getValueString());
326 if (query.equals(
""))
331 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PROG_NAME.getTypeID()) {
332 browser = attribute.getValueString();
333 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN.getTypeID()) {
334 searchEngineDomain = attribute.getValueString();
335 }
else if (attribute.getAttributeType().getTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID()) {
336 last_accessed = attribute.getValueLong();
// A recognized engine plus a non-empty query means we have a search to record.
340 if (se != null && !query.equals(
"")) {
342 if (last_accessed == -1) {
// Build the TSK_WEB_SEARCH_QUERY artifact's attributes (domain, query text,
// browser, access time), all stamped with this module's localized name.
345 Collection<BlackboardAttribute> bbattributes =
new ArrayList<>();
346 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN,
347 NbBundle.getMessage(
this.getClass(),
348 "SearchEngineURLQueryAnalyzer.parentModuleName"), searchEngineDomain));
349 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_TEXT,
350 NbBundle.getMessage(
this.getClass(),
351 "SearchEngineURLQueryAnalyzer.parentModuleName"), query));
352 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_PROG_NAME,
353 NbBundle.getMessage(
this.getClass(),
354 "SearchEngineURLQueryAnalyzer.parentModuleName"), browser));
355 bbattributes.add(
new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED,
356 NbBundle.getMessage(
this.getClass(),
357 "SearchEngineURLQueryAnalyzer.parentModuleName"), last_accessed));
358 this.addArtifact(ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY, file, bbattributes);
363 }
// Database errors abort the scan but are contained to this module.
catch (TskCoreException e) {
364 logger.log(Level.SEVERE,
"Encountered error retrieving artifacts for search engine queries", e);
366 if (context.dataSourceIngestIsCancelled()) {
367 logger.info(
"Operation terminated by user.");
// Notify listeners that new TSK_WEB_SEARCH_QUERY artifacts may exist.
369 IngestServices.getInstance().fireModuleDataEvent(
new ModuleDataEvent(
370 NbBundle.getMessage(
this.getClass(),
"SearchEngineURLQueryAnalyzer.parentModuleName.noSpace"),
371 BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY));
372 logger.log(Level.INFO,
"Extracted {0} queries from the blackboard", totalQueries);
// Summary fragment: builds the per-engine "name : hit-count" string that
// process() logs after the scan; bails out early when no engines are loaded.
376 private String getTotals() {
378 if (engines == null) {
381 for (SearchEngineURLQueryAnalyzer.SearchEngine se : engines) {
382 total += se.getEngineName() +
" : " + se.getTotal() +
"\n";
388 public void process(Content dataSource, IngestJobContext context) {
389 this.dataSource = dataSource;
390 this.context = context;
391 this.findSearchQueries();
392 logger.log(Level.INFO,
"Search Engine stats: \n{0}", getTotals());
396 void configExtractor() throws IngestModuleException {
398 PlatformUtil.extractResourceToUserConfigDir(SearchEngineURLQueryAnalyzer.class, XMLFILE,
true);
399 }
catch (IOException e) {
400 String message = Bundle.SearchEngineURLQueryAnalyzer_init_exception_msg(XMLFILE);
401 logger.log(Level.SEVERE, message, e);
402 throw new IngestModuleException(message, e);
// Lifecycle callback fragment: no per-job resources to release; just logs completion.
408 public void complete() {
409 logger.info(
"Search Engine URL Query Analyzer has completed.");
final String domainSubstring
final List< KeyPair > keyPairs
static< T > boolean xmlIsValid(DOMSource xmlfile, Class< T > clazz, String schemaFile)