19 package org.sleuthkit.autopsy.recentactivity;
 
   21 import java.net.MalformedURLException;
 
   23 import java.util.ArrayList;
 
   24 import java.util.Arrays;
 
   25 import java.util.Collection;
 
   26 import java.util.Collections;
 
   27 import java.util.Comparator;
 
   28 import java.util.HashSet;
 
   29 import java.util.List;
 
   31 import java.util.logging.Level;
 
   33 import java.util.regex.Matcher;
 
   34 import java.util.regex.Pattern;
 
   35 import java.util.stream.Collectors;
 
   36 import java.util.stream.Stream;
 
   37 import org.apache.commons.lang.StringUtils;
 
   38 import org.openide.util.Lookup;
 
   39 import org.openide.util.NbBundle.Messages;
 
   47 import org.
sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
 
   49 import org.
sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
 
   62     "DomainCategoryRunner_moduleName_text=DomainCategoryRunner",
 
   63     "DomainCategoryRunner_Progress_Message_Domain_Types=Finding Domain Types",
 
   64     "DomainCategoryRunner_parentModuleName=Recent Activity" 
   66 class DomainCategoryRunner extends Extract {
 
   // Pieces of a permissive URL pattern, assembled below into URL_REGEX.
   // Scheme: optional run of characters other than ':', '/', '?', '#', an optional ':',
   // then a mandatory "//".
   71     private static final String URL_REGEX_SCHEME = 
"(((?<scheme>[^:\\/?#]+):?)?\\/\\/)";
 
   // User info: any run of characters other than '/', '?', '#', '@', followed by '@'.
   73     private static final String URL_REGEX_USERINFO = 
"((?<userinfo>[^\\/?#@]*)@)";
 
   // Host: must contain at least one '.'; the part before the first dot may not contain
   // '/', '.', '?', '#' or ':', and the remainder may not contain '/', '?', '#' or ':'.
   74     private static final String URL_REGEX_HOST = 
"(?<host>[^\\/\\.?#:]*\\.[^\\/?#:]*)";
 
   // Port: ':' followed by one to five digits.
   75     private static final String URL_REGEX_PORT = 
"(:(?<port>[0-9]{1,5}))";
 
   // Authority: optional user info, optional host, optional port, optional trailing '/'.
   76     private static final String URL_REGEX_AUTHORITY = String.format(
"(%s?%s?%s?\\/?)", URL_REGEX_USERINFO, URL_REGEX_HOST, URL_REGEX_PORT);
 
   // Path: everything up to '?' or '#', plus an optional query and an optional fragment,
   // all captured together in the "path" group.
   78     private static final String URL_REGEX_PATH = 
"(?<path>([^?#]*)(\\?([^#]*))?(#(.*))?)";
 
   // Full pattern: anchored at the start, tolerates leading whitespace, and treats the
   // scheme, authority and path sections as individually optional.
   80     private static final String URL_REGEX_STR = String.format(
"^\\s*%s?%s?%s?", URL_REGEX_SCHEME, URL_REGEX_AUTHORITY, URL_REGEX_PATH);
 
   // Compiled once and reused by getHost() as a fallback when java.net.URL yields no host.
   81     private static final Pattern URL_REGEX = Pattern.compile(URL_REGEX_STR);
 
   83     private static int DATETIME_ACCESSED_TYPEID = ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID();
 
   84     private static int URL_TYPEID = ATTRIBUTE_TYPE.TSK_URL.getTypeID();
 
   // Class-wide logger.
   86     private static final Logger logger = Logger.getLogger(DomainCategoryRunner.class.getName());
 
   // Class-name fragment used by configExtractor() to recognise the custom web categorizer
   // among the providers returned by Lookup (matched with String.contains).
   89     private static final String CUSTOM_CATEGORIZER_PATH = 
"org.sleuthkit.autopsy.url.analytics.domaincategorization.CustomWebCategorizer";
 
   // Artifact types queried by findDomainTypes(): bookmarks, cache, cookies, downloads,
   // history and search queries, each wrapped in a BlackboardArtifact.Type.
   92     private static final List<BlackboardArtifact.Type> DOMAIN_CATEGORIZATION_TYPES = Stream.of(
 
   93             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_BOOKMARK,
 
   94             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_CACHE,
 
   95             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_COOKIE,
 
   96             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_DOWNLOAD,
 
   97             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_HISTORY,
 
   98             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY)
 
   99             .map(BlackboardArtifact.Type::new)
 
  100             .collect(Collectors.toList());
 
  // Looks up the attribute with the given type id in attrMap and returns its long value,
  // or 0 when the map has no entry for that id.
  // NOTE(review): the body of the null-map branch is not visible in this listing;
  // presumably it also returns 0, as the method name suggests. Confirm.
  110     private static long getTimeOrZero(Map<Integer, BlackboardAttribute> attrMap, 
int attrTypeId) {
 
  111         if (attrMap == null) {
 
  115         BlackboardAttribute attr = attrMap.get(attrTypeId);
 
  116         return attr == null ? 0 : attr.getValueLong();
 
  // Looks up the attribute with the given type id in attrMap and returns its string value.
  // A missing attribute or a null string value both yield "" so callers never see null.
  // NOTE(review): the body of the null-map branch is not visible in this listing;
  // presumably it returns "" as well. Confirm.
  127     private static String getStringOrEmpty(Map<Integer, BlackboardAttribute> attrMap, 
int attrTypeId) {
 
  128         if (attrMap == null) {
 
  132         BlackboardAttribute attr = attrMap.get(attrTypeId);
 
  133         String attrStr = attr == null ? 
"" : attr.getValueString();
 
  134         return attrStr == null ? 
"" : attrStr;
 
  // Orders artifacts by accessed time, then by URL (case-insensitive), then by artifact id.
  // Each artifact's attributes are collected into a map keyed by attribute type id; when
  // several attributes share a type id the first one encountered is kept.
  // NOTE(review): attributes are fetched for both artifacts on every comparison, so a sort
  // repeats the lookup many times per artifact. Consider precomputing the sort keys.
  140     private static final Comparator<BlackboardArtifact> ARTIFACT_COMPARATOR = (a, b) -> {
 
  // The maps stay null if fetching attributes fails; getTimeOrZero/getStringOrEmpty are
  // called with them below and both null-check the map.
  142         Map<Integer, BlackboardAttribute> attrMapA = null;
 
  143         Map<Integer, BlackboardAttribute> attrMapB = null;
 
  146             attrMapA = a.getAttributes()
 
  148                     .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
 
  150             attrMapB = b.getAttributes()
 
  152                     .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
 
  // A failure to read attributes is logged and the comparison continues with whatever
  // maps were populated.
  154         } 
catch (TskCoreException ex) {
 
  155             logger.log(Level.WARNING, 
"There was an error fetching attributes for artifacts", ex);
 
  // Primary key: accessed time. The branch body is elided here; presumably it returns timeCompare.
  160         int timeCompare = Long.compare(getTimeOrZero(attrMapA, DATETIME_ACCESSED_TYPEID), getTimeOrZero(attrMapB, DATETIME_ACCESSED_TYPEID));
 
  161         if (timeCompare != 0) {
 
  // Secondary key: URL, ignoring case. The branch body is elided; presumably it returns urlCompare.
  167         int urlCompare = getStringOrEmpty(attrMapA, URL_TYPEID).compareToIgnoreCase(getStringOrEmpty(attrMapB, URL_TYPEID));
 
  168         if (urlCompare != 0) {
 
  // Final tie-breaker: artifact id.
  173         return Long.compare(a.getId(), b.getId());
 
  // Data source and ingest context of the current run; both are assigned in process().
  176     private Content dataSource;
 
  177     private IngestJobContext context;
 
  // Categorizers consulted in order by findCategory(). Empty until configExtractor()
  // replaces it with the initialized provider list.
  178     private List<DomainCategorizer> domainProviders = Collections.emptyList();
 
  // Package-private constructor; its body is not visible in this listing.
  183     DomainCategoryRunner() {
 
  // Extracts the host portion of urlString. java.net.URL is tried first; if that yields a
  // blank host (including when the URL is rejected as malformed), the "host" group of
  // URL_REGEX is used instead.
  // NOTE(review): the declaration of `host`, the matcher's find/matches check and the
  // return statement are on lines not visible in this listing.
  194     private String getHost(String urlString) {
 
  198             URL url = 
new URL(urlString);
 
  200                 host = url.getHost();
 
  // Malformed URLs are deliberately ignored here; the regex fallback below handles them.
  202         } 
catch (MalformedURLException ignore) {
 
  207         if (StringUtils.isBlank(host)) {
 
  208             Matcher m = URL_REGEX.matcher(urlString);
 
  210                 host = m.group(
"host");
 
  // Asks each configured categorizer, in list order, for a category matching the given
  // domain and host.
  // NOTE(review): what happens on a non-null result and after the loop is on elided lines;
  // presumably the first non-null result is returned and null is returned when no provider
  // matches (findDomainTypes() null-checks the result). Confirm.
  224     private DomainCategory findCategory(String domain, String host) {
 
  // Guard against a null provider list.
  225         List<DomainCategorizer> safeProviders = domainProviders == null ? Collections.emptyList() : domainProviders;
 
  226         for (DomainCategorizer provider : safeProviders) {
 
  227             DomainCategory result;
 
  229                 result = provider.getCategory(domain, host);
 
  230                 if (result != null) {
 
  // A provider failure is logged as a warning, naming the provider class.
  233             } 
catch (DomainCategorizerException ex) {
 
  234                 logger.log(Level.WARNING, 
"There was an error processing results with " + provider.getClass().getCanonicalName(), ex);
 
  // Constructor of the ArtifactHost holder (its class header is not visible in this
  // listing). It stores the source file and the domain; the host assignment is presumably
  // on the elided line between them.
  258         ArtifactHost(AbstractFile abstractFile, String host, String domain) {
 
  259             this.abstractFile = abstractFile;
 
  261             this.domain = domain;
 
  // Accessor for the file; its body is elided. addCategoryArtifact() uses it as the
  // source of the new artifact.
  267         AbstractFile getAbstractFile() {
 
  // Builds an ArtifactHost (file, host, domain) for the artifact from its TSK_URL and
  // TSK_DOMAIN attributes. Throws TskCoreException when the case database lookups fail.
  // NOTE(review): several lines are elided in this listing (declaration of `host`, the
  // bodies of two branches below). Comments on those parts are assumptions.
  295     private ArtifactHost getDomainAndHost(BlackboardArtifact artifact) 
throws TskCoreException {
 
  // The file the artifact was derived from, looked up by the artifact's object id.
  297         AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
 
  // Host: parsed out of the URL attribute when one is present and not blank.
  303         BlackboardAttribute urlAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
 
  304         String urlString = null;
 
  306         if (urlAttr != null) {
 
  307             urlString = urlAttr.getValueString();
 
  308             if (StringUtils.isNotBlank(urlString)) {
 
  309                 host = getHost(urlString);
 
  // Domain: taken directly from the domain attribute when present.
  314         BlackboardAttribute domainAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
 
  315         String domainString = null;
 
  316         if (domainAttr != null) {
 
  317             domainString = domainAttr.getValueString();
 
  320         boolean hasDomain = StringUtils.isNotBlank(domainString);
 
  321         boolean hasHost = StringUtils.isNotBlank(host);
 
  // Neither value available: body elided, presumably returns null (the caller in
  // findDomainTypes() checks for a null result).
  324         if (!hasDomain && !hasHost) {
 
  // Host only: derive the domain from the host.
  326         } 
else if (!hasDomain) {
 
  327             domainString = NetworkUtils.extractDomain(host);
 
  // Domain only: body elided, presumably the domain is used as the host. This must leave
  // `host` non-null or the toLowerCase() call below would throw. Confirm.
  328         } 
else if (!hasHost) {
 
  // Both values are lower-cased so the duplicate checks in findDomainTypes() are
  // case-insensitive.
  // NOTE(review): toLowerCase() without a Locale depends on the default locale.
  332         return new ArtifactHost(file, host.toLowerCase(), domainString.toLowerCase());
 
  // Checks item against the set of items already seen: blank items and items already in
  // the set are handled by the two visible branches.
  // NOTE(review): the branch bodies and the final branch are elided in this listing.
  // Judging by the name and the callers, blank and already-present items presumably return
  // true, and a new item is added to the set and returns false. Confirm.
  344     private static boolean isDuplicateOrAdd(Set<String> items, String item) {
 
  345         if (StringUtils.isBlank(item)) {
 
  347         } 
else if (items.contains(item)) {
 
  // Walks the data source's web artifacts in ARTIFACT_COMPARATOR order, looks up a category
  // for each new host, and calls addCategoryArtifact() for each host suffix not seen before.
  // NOTE(review): many lines (try/finally structure, `continue`/`break` statements, counter
  // updates) are elided in this listing; comments on those parts are assumptions.
  360     private void findDomainTypes() {
 
  // Counters reported in the summary log message at the end.
  // NOTE(review): no increment of artifactsAnalyzed is visible in this listing.
  361         int artifactsAnalyzed = 0;
 
  362         int domainTypeInstancesFound = 0;
 
  // Hosts already looked up, so each distinct host is categorized once.
  365         Set<String> hostsSeen = 
new HashSet<>();
 
  // Host suffixes checked before addCategoryArtifact() is called.
  368         Set<String> hostSuffixesSeen = 
new HashSet<>();
 
  // Fetch all artifacts of the categorizable web types for this data source.
  370             List<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
 
  371                     DOMAIN_CATEGORIZATION_TYPES,
 
  372                     Arrays.asList(dataSource.getId()));
 
  374             logger.log(Level.INFO, 
"Processing {0} blackboard artifacts.", listArtifacts.size()); 
 
  // Sort so artifacts are visited in a deterministic order (time, URL, id).
  375             Collections.sort(listArtifacts, ARTIFACT_COMPARATOR);
 
  377             for (BlackboardArtifact artifact : listArtifacts) {
 
  // Cancellation check on every iteration (body elided; presumably leaves the loop).
  379                 if (context.dataSourceIngestIsCancelled()) {
 
  // Artifacts without a usable host, or whose host was already handled, hit this branch
  // (body elided; presumably skipped).
  385                 ArtifactHost curArtHost = getDomainAndHost(artifact);
 
  386                 if (curArtHost == null || isDuplicateOrAdd(hostsSeen, curArtHost.getHost())) {
 
  // Ask the configured categorizers; no match hits this branch (body elided).
  394                 DomainCategory domainEntryFound = findCategory(curArtHost.getDomain(), curArtHost.getHost());
 
  395                 if (domainEntryFound == null) {
 
  // A match with a blank suffix or blank category hits this branch (body elided).
  400                 String hostSuffix = domainEntryFound.getHostSuffix();
 
  401                 String domainCategory = domainEntryFound.getCategory();
 
  402                 if (StringUtils.isBlank(hostSuffix) || StringUtils.isBlank(domainCategory)) {
 
  407                 domainTypeInstancesFound++;
 
  // The suffix is checked against those already seen before the artifact is added
  // (branch body elided).
  409                 if (isDuplicateOrAdd(hostSuffixesSeen, hostSuffix)) {
 
  414                 addCategoryArtifact(curArtHost, domainCategory);
 
  // NOTE(review): the log texts below say "messaging domain(s)" although this runner
  // assigns web categories; the wording looks copied from another module. The runtime
  // strings are left unchanged here.
  416         } 
catch (TskCoreException e) {
 
  417             logger.log(Level.SEVERE, 
"Encountered error retrieving artifacts for messaging domains", e); 
 
  419             if (context.dataSourceIngestIsCancelled()) {
 
  420                 logger.info(
"Operation terminated by user."); 
 
  // Summary of the run: distinct suffixes, artifacts analyzed, categorized instances.
  422             logger.log(Level.INFO, String.format(
"Extracted %s distinct messaging domain(s) from the blackboard.  " 
  423                     + 
"Of the %s artifact(s) with valid hosts, %s url(s) contained messaging domain suffix.",
 
  424                     hostSuffixesSeen.size(), artifactsAnalyzed, domainTypeInstancesFound));
 
  // Creates and posts a TSK_WEB_CATEGORIZATION artifact on the host's source file, carrying
  // the domain, the host and the category name. Throws TskCoreException when the
  // artifact cannot be created.
  435     private void addCategoryArtifact(ArtifactHost artHost, String domainCategory) 
throws TskCoreException {
 
  // Attributes are attributed to the parent module name from the bundle.
  436         String moduleName = Bundle.DomainCategoryRunner_parentModuleName();
 
  437         Collection<BlackboardAttribute> bbattributes = Arrays.asList(
 
  438                 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN, moduleName, artHost.getDomain()),
 
  439                 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_HOST, moduleName, artHost.getHost()),
 
  // The category is stored in the TSK_NAME attribute.
  440                 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_NAME, moduleName, domainCategory)
 
  442         postArtifact(createArtifactWithAttributes(ARTIFACT_TYPE.TSK_WEB_CATEGORIZATION, artHost.getAbstractFile(), bbattributes));
 
  // Entry point for one data source: stores the data source and ingest context on this
  // instance, updates the progress bar message, then runs the domain categorization pass.
  446     public void process(Content dataSource, IngestJobContext context, DataSourceIngestModuleProgress progressBar) {
 
  447         this.dataSource = dataSource;
 
  448         this.context = context;
 
  450         progressBar.progress(Bundle.DomainCategoryRunner_Progress_Message_Domain_Types());
 
  451         this.findDomainTypes();
 
  // Discovers the DomainCategorizer providers, arranges them in the order findCategory()
  // will iterate them, initializes each one, and stores the list in domainProviders.
  // Order built below: custom web categorizer (if found), DefaultPriorityDomainCategorizer,
  // all other providers sorted by class name, then DefaultDomainCategorizer last.
  // Throws IngestModuleException when any provider fails to initialize.
  // NOTE(review): the stream sources and some terminal calls are on elided lines.
  455     void configExtractor() throws IngestModule.IngestModuleException {
 
  // All providers registered with the global Lookup; a null result is treated as empty.
  457         Collection<? extends DomainCategorizer> lookupCollection = Lookup.getDefault().lookupAll(DomainCategorizer.class);
 
  458         Collection<? extends DomainCategorizer> lookupList = (lookupCollection == null) ? 
 
  459                 Collections.emptyList() :
 
  463         List<DomainCategorizer> foundProviders = 
new ArrayList<>();
 
  // First: the custom categorizer, recognised by its class name containing CUSTOM_CATEGORIZER_PATH.
  467                 .filter(categorizer -> categorizer.getClass().getName().contains(CUSTOM_CATEGORIZER_PATH))
 
  469                 .ifPresent((provider) -> foundProviders.add(provider));
 
  // Second: the built-in priority categorizer.
  472         foundProviders.add(
new DefaultPriorityDomainCategorizer());
 
  // Third: every other provider (excluding the custom and both default categorizers),
  // sorted by class name ignoring case.
  477                 .filter(categorizer -> categorizer != null)
 
  478                 .filter(categorizer -> {
 
  479                     String className = categorizer.getClass().getName();
 
  480                     return !className.contains(CUSTOM_CATEGORIZER_PATH) &&
 
  481                             !className.equals(DefaultPriorityDomainCategorizer.class.getName()) &&
 
  482                             !className.equals(DefaultDomainCategorizer.class.getName());
 
  484                 .sorted((a, b) -> a.getClass().getName().compareToIgnoreCase(b.getClass().getName()))
 
  485                 .forEach(foundProviders::add);
 
  // Last: the built-in default categorizer.
  488         foundProviders.add(
new DefaultDomainCategorizer());
 
  // Initialize every provider; a failure is rethrown as IngestModuleException with the
  // original exception as its cause.
  490         for (DomainCategorizer provider : foundProviders) {
 
  492                 provider.initialize();
 
  493             } 
catch (DomainCategorizerException ex) {
 
  494                 throw new IngestModule.IngestModuleException(
"There was an error instantiating the provider: " + 
 
  495                         provider.getClass().getSimpleName(), ex);
 
  499         this.domainProviders = foundProviders;
 
  // Called when categorization is finished: walks the configured providers and logs a
  // completion message. The per-provider call inside the try is on an elided line;
  // judging by the warning text it presumably closes the provider.
  503     public void complete() {
 
  504         if (this.domainProviders != null) {
 
  505             for (DomainCategorizer provider : this.domainProviders) {
 
  // Any exception from one provider is logged as a warning, naming the provider class.
  508                 } 
catch (Exception ex) {
 
  509                     logger.log(Level.WARNING, 
"There was an error closing " + provider.getClass().getName(), ex);
 
  514         logger.info(
"Domain categorization completed."); 
 
final AbstractFile abstractFile