Autopsy  4.20.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
DomainCategoryRunner.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2020-2021 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.recentactivity;
20 
21 import java.net.MalformedURLException;
22 import java.net.URL;
23 import java.util.ArrayList;
24 import java.util.Arrays;
25 import java.util.Collection;
26 import java.util.Collections;
27 import java.util.Comparator;
28 import java.util.HashSet;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.logging.Level;
32 import java.util.Set;
33 import java.util.regex.Matcher;
34 import java.util.regex.Pattern;
35 import java.util.stream.Collectors;
36 import java.util.stream.Stream;
37 import org.apache.commons.lang.StringUtils;
38 import org.openide.util.Lookup;
39 import org.openide.util.NbBundle.Messages;
45 import org.sleuthkit.datamodel.AbstractFile;
46 import org.sleuthkit.datamodel.BlackboardArtifact;
47 import org.sleuthkit.datamodel.BlackboardAttribute;
48 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
49 import org.sleuthkit.datamodel.Content;
50 import org.sleuthkit.datamodel.TskCoreException;
54 
60 @Messages({
61  "DomainCategoryRunner_moduleName_text=Domain Category Analyzer",
62  "DomainCategoryRunner_Progress_Message_Domain_Types=Finding Domain Types",
63  "DomainCategoryRunner_parentModuleName=Recent Activity"
64 })
65 class DomainCategoryRunner extends Extract {
66 
67  // The url regex is based on the regex provided in https://tools.ietf.org/html/rfc3986#appendix-B
68  // but expanded to be a little more flexible. This regex also properly parses user info and port in a url.
69  // this regex has optional colon in front of the scheme (i.e. http// instead of http://) since some urls were coming through without the colon.
70  private static final String URL_REGEX_SCHEME = "(((?<scheme>[^:\\/?#]+):?)?\\/\\/)";
71 
72  private static final String URL_REGEX_USERINFO = "((?<userinfo>[^\\/?#@]*)@)";
73  private static final String URL_REGEX_HOST = "(?<host>[^\\/\\.?#:]*\\.[^\\/?#:]*)";
74  private static final String URL_REGEX_PORT = "(:(?<port>[0-9]{1,5}))";
75  private static final String URL_REGEX_AUTHORITY = String.format("(%s?%s?%s?\\/?)", URL_REGEX_USERINFO, URL_REGEX_HOST, URL_REGEX_PORT);
76 
77  private static final String URL_REGEX_PATH = "(?<path>([^?#]*)(\\?([^#]*))?(#(.*))?)";
78 
79  private static final String URL_REGEX_STR = String.format("^\\s*%s?%s?%s?", URL_REGEX_SCHEME, URL_REGEX_AUTHORITY, URL_REGEX_PATH);
80  private static final Pattern URL_REGEX = Pattern.compile(URL_REGEX_STR);
81 
82  private static int DATETIME_ACCESSED_TYPEID = ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID();
83  private static int URL_TYPEID = ATTRIBUTE_TYPE.TSK_URL.getTypeID();
84 
85  private static final Logger logger = Logger.getLogger(DomainCategoryRunner.class.getName());
86 
87  // NOTE: if CustomWebCategorizer ever changes name, this will need to be changed as well.
88  private static final String CUSTOM_CATEGORIZER_PATH = "org.sleuthkit.autopsy.url.analytics.domaincategorization.CustomWebCategorizer";
89 
90  // the artifact types to be searched for domain categories
91  private static final List<BlackboardArtifact.Type> DOMAIN_CATEGORIZATION_TYPES = Stream.of(
92  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_BOOKMARK,
93  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_CACHE,
94  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_COOKIE,
95  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_DOWNLOAD,
96  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_HISTORY,
97  BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY)
98  .map(BlackboardArtifact.Type::new)
99  .collect(Collectors.toList());
100  private final IngestJobContext context;
101 
111  private static long getTimeOrZero(Map<Integer, BlackboardAttribute> attrMap, int attrTypeId) {
112  if (attrMap == null) {
113  return 0;
114  }
115 
116  BlackboardAttribute attr = attrMap.get(attrTypeId);
117  return attr == null ? 0 : attr.getValueLong();
118  }
119 
129  private static String getStringOrEmpty(Map<Integer, BlackboardAttribute> attrMap, int attrTypeId) {
130  if (attrMap == null) {
131  return "";
132  }
133 
134  BlackboardAttribute attr = attrMap.get(attrTypeId);
135  String attrStr = attr == null ? "" : attr.getValueString();
136  return attrStr == null ? "" : attrStr;
137  }
138 
142  private static final Comparator<BlackboardArtifact> ARTIFACT_COMPARATOR = (a, b) -> {
143  // get attributes in map by type id
144  Map<Integer, BlackboardAttribute> attrMapA = null;
145  Map<Integer, BlackboardAttribute> attrMapB = null;
146 
147  try {
148  attrMapA = a.getAttributes()
149  .stream()
150  .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
151 
152  attrMapB = b.getAttributes()
153  .stream()
154  .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
155 
156  } catch (TskCoreException ex) {
157  logger.log(Level.WARNING, "There was an error fetching attributes for artifacts", ex);
158  return 0;
159  }
160 
161  // sort first on time
162  int timeCompare = Long.compare(getTimeOrZero(attrMapA, DATETIME_ACCESSED_TYPEID), getTimeOrZero(attrMapB, DATETIME_ACCESSED_TYPEID));
163  if (timeCompare != 0) {
164  // negate to push latest times to the front
165  return -timeCompare;
166  }
167 
168  // sort next on url
169  int urlCompare = getStringOrEmpty(attrMapA, URL_TYPEID).compareToIgnoreCase(getStringOrEmpty(attrMapB, URL_TYPEID));
170  if (urlCompare != 0) {
171  return urlCompare;
172  }
173 
174  // use id as last resort
175  return Long.compare(a.getId(), b.getId());
176  };
177 
178  private Content dataSource;
179  private List<DomainCategorizer> domainProviders = Collections.emptyList();
180 
184  DomainCategoryRunner(IngestJobContext context) {
185  super(Bundle.DomainCategoryRunner_moduleName_text(), context);
186  this.context = context;
187  }
188 
197  private String getHost(String urlString) {
198  String host = null;
199  try {
200  // try first using the built-in url class to determine the host.
201  URL url = new URL(urlString);
202  if (url != null) {
203  host = url.getHost();
204  }
205  } catch (MalformedURLException ignore) {
206  // ignore this and go to fallback regex
207  }
208 
209  // if the built-in url parsing doesn't work, then use more flexible regex.
210  if (StringUtils.isBlank(host)) {
211  Matcher m = URL_REGEX.matcher(urlString);
212  if (m.find()) {
213  host = m.group("host");
214  }
215  }
216 
217  return host;
218  }
219 
228  private DomainCategory findCategory(String domain, String host) {
229  List<DomainCategorizer> safeProviders = domainProviders == null ? Collections.emptyList() : domainProviders;
230  for (DomainCategorizer provider : safeProviders) {
231  DomainCategory result;
232  try {
233  result = provider.getCategory(domain, host);
234  if (result != null) {
235  return result;
236  }
237  } catch (DomainCategorizerException ex) {
238  logger.log(Level.WARNING, "There was an error processing results with " + provider.getClass().getCanonicalName(), ex);
239  }
240 
241  }
242 
243  return null;
244  }
245 
249  private static class ArtifactHost {
250 
251  private final AbstractFile abstractFile;
252  private final String host;
253  private final String domain;
254 
264  ArtifactHost(AbstractFile abstractFile, String host, String domain) {
265  this.abstractFile = abstractFile;
266  this.host = host;
267  this.domain = domain;
268  }
269 
273  AbstractFile getAbstractFile() {
274  return abstractFile;
275  }
276 
280  String getHost() {
281  return host;
282  }
283 
287  String getDomain() {
288  return domain;
289  }
290  }
291 
303  private ArtifactHost getDomainAndHost(BlackboardArtifact artifact) throws TskCoreException {
304  // make sure there is attached file
305  AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
306  if (file == null) {
307  return null;
308  }
309 
310  // get the host from the url attribute and the domain from the attribute
311  BlackboardAttribute urlAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
312  String urlString = null;
313  String host = null;
314  if (urlAttr != null) {
315  urlString = urlAttr.getValueString();
316  if (StringUtils.isNotBlank(urlString)) {
317  host = getHost(urlString);
318  }
319  }
320 
321  // get the domain from the attribute
322  BlackboardAttribute domainAttr = artifact.getAttribute(new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
323  String domainString = null;
324  if (domainAttr != null) {
325  domainString = domainAttr.getValueString();
326  }
327 
328  boolean hasDomain = StringUtils.isNotBlank(domainString);
329  boolean hasHost = StringUtils.isNotBlank(host);
330 
331  // we need at least a host or a domain, if one is missing, compensate with the other.
332  if (!hasDomain && !hasHost) {
333  return null;
334  } else if (!hasDomain) {
335  domainString = NetworkUtils.extractDomain(host);
336  } else if (!hasHost) {
337  host = domainString;
338  }
339 
340  return new ArtifactHost(file, host.toLowerCase(), domainString.toLowerCase());
341  }
342 
353  private static boolean isDuplicateOrAdd(Set<String> items, String item) {
354  if (StringUtils.isBlank(item)) {
355  return false;
356  } else if (items.contains(item)) {
357  return true;
358  } else {
359  items.add(item);
360  return false;
361  }
362  }
363 
369  private void findDomainTypes() {
370  int artifactsAnalyzed = 0;
371  int domainTypeInstancesFound = 0;
372 
373  // this will track the different hosts seen to avoid a search for the same host more than once
374  Set<String> hostsSeen = new HashSet<>();
375 
376  // only one suffix per ingest is captured so this tracks the suffixes seen.
377  Set<String> hostSuffixesSeen = new HashSet<>();
378  try {
379  List<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
380  DOMAIN_CATEGORIZATION_TYPES,
381  Arrays.asList(dataSource.getId()));
382 
383  logger.log(Level.INFO, "Processing {0} blackboard artifacts.", listArtifacts.size()); //NON-NLS
384  Collections.sort(listArtifacts, ARTIFACT_COMPARATOR);
385 
386  for (BlackboardArtifact artifact : listArtifacts) {
387  // make sure we haven't cancelled
388  if (context.dataSourceIngestIsCancelled()) {
389  //User cancelled the process.
390  break;
391  }
392 
393  // get the pertinent details for this artifact.
394  ArtifactHost curArtHost = getDomainAndHost(artifact);
395  if (curArtHost == null || isDuplicateOrAdd(hostsSeen, curArtHost.getHost())) {
396  continue;
397  }
398 
399  // if we reached this point, we are at least analyzing this item
400  artifactsAnalyzed++;
401 
402  // attempt to get the domain type for the host using the domain categorizers found
403  DomainCategory domainEntryFound = findCategory(curArtHost.getDomain(), curArtHost.getHost());
404  if (domainEntryFound == null) {
405  continue;
406  }
407 
408  // make sure both the host suffix and the category are present.
409  String hostSuffix = domainEntryFound.getHostSuffix();
410  String domainCategory = domainEntryFound.getCategory();
411  if (StringUtils.isBlank(hostSuffix) || StringUtils.isBlank(domainCategory)) {
412  continue;
413  }
414 
415  // if we got this far, we found a domain type, but it may not be unique
416  domainTypeInstancesFound++;
417 
418  if (isDuplicateOrAdd(hostSuffixesSeen, hostSuffix)) {
419  continue;
420  }
421 
422  // if we got this far, we have a unique domain category to post.
423  addCategoryArtifact(curArtHost, domainCategory);
424  }
425  } catch (TskCoreException e) {
426  logger.log(Level.SEVERE, "Encountered error retrieving artifacts for messaging domains", e); //NON-NLS
427  } finally {
428  if (context.dataSourceIngestIsCancelled()) {
429  logger.info("Operation terminated by user."); //NON-NLS
430  }
431  logger.log(Level.INFO, String.format("Extracted %s distinct messaging domain(s) from the blackboard. "
432  + "Of the %s artifact(s) with valid hosts, %s url(s) contained messaging domain suffix.",
433  hostSuffixesSeen.size(), artifactsAnalyzed, domainTypeInstancesFound));
434  }
435  }
436 
444  private void addCategoryArtifact(ArtifactHost artHost, String domainCategory) throws TskCoreException {
445  String moduleName = Bundle.DomainCategoryRunner_parentModuleName();
446  Collection<BlackboardAttribute> bbattributes = Arrays.asList(
447  new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN, moduleName, artHost.getDomain()),
448  new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_HOST, moduleName, artHost.getHost()),
449  new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_NAME, moduleName, domainCategory)
450  );
451  postArtifact(createArtifactWithAttributes(BlackboardArtifact.Type.TSK_WEB_CATEGORIZATION, artHost.getAbstractFile(), bbattributes));
452  }
453 
454  @Override
455  public void process(Content dataSource, DataSourceIngestModuleProgress progressBar) {
456  this.dataSource = dataSource;
457  progressBar.progress(Bundle.DomainCategoryRunner_Progress_Message_Domain_Types());
458  this.findDomainTypes();
459  }
460 
461  @Override
462  void startUp() throws IngestModule.IngestModuleException {
463  // lookup all providers, filter null providers, and sort providers
464  Collection<? extends DomainCategorizer> lookupCollection = Lookup.getDefault().lookupAll(DomainCategorizer.class);
465  Collection<? extends DomainCategorizer> lookupList = (lookupCollection == null)
466  ? Collections.emptyList()
467  : lookupCollection;
468 
469  // this will be the class instance of the foundProviders
470  List<DomainCategorizer> foundProviders = new ArrayList<>();
471 
472  // find the custom domain categories provider if present and add it first to the list
473  lookupList.stream()
474  .filter(categorizer -> categorizer.getClass().getName().contains(CUSTOM_CATEGORIZER_PATH))
475  .findFirst()
476  .ifPresent((provider) -> foundProviders.add(provider));
477 
478  // add the default priority categorizer
479  foundProviders.add(new DefaultPriorityDomainCategorizer());
480 
481  // add all others except for the custom web domain categorizer, the default priority
482  // categorizer and the default categorizer
483  lookupList.stream()
484  .filter(categorizer -> categorizer != null)
485  .filter(categorizer -> {
486  String className = categorizer.getClass().getName();
487  return !className.contains(CUSTOM_CATEGORIZER_PATH)
488  && !className.equals(DefaultPriorityDomainCategorizer.class.getName())
489  && !className.equals(DefaultDomainCategorizer.class.getName());
490  })
491  .sorted((a, b) -> a.getClass().getName().compareToIgnoreCase(b.getClass().getName()))
492  .forEach(foundProviders::add);
493 
494  // add the default categorizer last
495  foundProviders.add(new DefaultDomainCategorizer());
496 
497  for (DomainCategorizer provider : foundProviders) {
498  try {
499  provider.initialize();
500  } catch (DomainCategorizerException ex) {
501  throw new IngestModule.IngestModuleException("There was an error instantiating the provider: "
502  + provider.getClass().getSimpleName(), ex);
503  }
504  }
505 
506  this.domainProviders = foundProviders;
507  }
508 
509  @Override
510  public void shutDown() {
511  if (this.domainProviders != null) {
512  for (DomainCategorizer provider : this.domainProviders) {
513  try {
514  provider.close();
515  } catch (Exception ex) {
516  logger.log(Level.WARNING, "There was an error closing " + provider.getClass().getName(), ex);
517  }
518  }
519  }
520  super.shutDown();
521  }
522 }

Copyright © 2012-2022 Basis Technology. Generated on: Tue Aug 1 2023
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.