19 package org.sleuthkit.autopsy.recentactivity;
 
   21 import java.net.MalformedURLException;
 
   23 import java.util.ArrayList;
 
   24 import java.util.Arrays;
 
   25 import java.util.Collection;
 
   26 import java.util.Collections;
 
   27 import java.util.Comparator;
 
   28 import java.util.HashSet;
 
   29 import java.util.List;
 
   31 import java.util.logging.Level;
 
   33 import java.util.regex.Matcher;
 
   34 import java.util.regex.Pattern;
 
   35 import java.util.stream.Collectors;
 
   36 import java.util.stream.Stream;
 
   37 import org.apache.commons.lang.StringUtils;
 
   38 import org.openide.util.Lookup;
 
   39 import org.openide.util.NbBundle.Messages;
 
   61     "DomainCategoryRunner_moduleName_text=Domain Category Analyzer",
 
   62     "DomainCategoryRunner_Progress_Message_Domain_Types=Finding Domain Types",
 
   63     "DomainCategoryRunner_parentModuleName=Recent Activity" 
   65 class DomainCategoryRunner extends Extract {
 
   70     private static final String URL_REGEX_SCHEME = 
"(((?<scheme>[^:\\/?#]+):?)?\\/\\/)"; // optional scheme name (group "scheme") with optional ':', then a mandatory "//"
 
          // Captures everything before an '@' as the "userinfo" group.
   72     private static final String URL_REGEX_USERINFO = 
"((?<userinfo>[^\\/?#@]*)@)";
 
          // Group "host": must contain at least one '.'; the part before the first dot
          // excludes '/', '.', '?', '#', ':' and the remainder excludes '/', '?', '#', ':'.
   73     private static final String URL_REGEX_HOST = 
"(?<host>[^\\/\\.?#:]*\\.[^\\/?#:]*)";
 
          // Group "port": a ':' followed by one to five digits.
   74     private static final String URL_REGEX_PORT = 
"(:(?<port>[0-9]{1,5}))";
 
          // Authority = userinfo, host and port (each individually optional) plus an optional trailing '/'.
   75     private static final String URL_REGEX_AUTHORITY = String.format(
"(%s?%s?%s?\\/?)", URL_REGEX_USERINFO, URL_REGEX_HOST, URL_REGEX_PORT);
 
          // Group "path": path characters, then an optional "?query" and an optional "#fragment".
   77     private static final String URL_REGEX_PATH = 
"(?<path>([^?#]*)(\\?([^#]*))?(#(.*))?)";
 
          // Full expression: leading whitespace, then scheme, authority and path, all optional.
   79     private static final String URL_REGEX_STR = String.format(
"^\\s*%s?%s?%s?", URL_REGEX_SCHEME, URL_REGEX_AUTHORITY, URL_REGEX_PATH);
 
          // Compiled once; getHost() reads the "host" group from it when blank host is left after URL parsing.
   80     private static final Pattern URL_REGEX = Pattern.compile(URL_REGEX_STR);
 
   82     private static int DATETIME_ACCESSED_TYPEID = ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID();
 
   83     private static int URL_TYPEID = ATTRIBUTE_TYPE.TSK_URL.getTypeID();
 
   85     private static final Logger logger = Logger.getLogger(DomainCategoryRunner.class.getName());
 
          // Class-name fragment that startUp() looks for (via String.contains) to pick out the
          // custom categorizer and add it to the provider list before the others.
   88     private static final String CUSTOM_CATEGORIZER_PATH = 
"org.sleuthkit.autopsy.url.analytics.domaincategorization.CustomWebCategorizer";
 
          // Artifact types passed to the blackboard query in findDomainTypes(); each
          // ARTIFACT_TYPE enum value is wrapped in a BlackboardArtifact.Type.
   91     private static final List<BlackboardArtifact.Type> DOMAIN_CATEGORIZATION_TYPES = Stream.of(
 
   92             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_BOOKMARK,
 
   93             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_CACHE,
 
   94             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_COOKIE,
 
   95             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_DOWNLOAD,
 
   96             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_HISTORY,
 
   97             BlackboardArtifact.ARTIFACT_TYPE.TSK_WEB_SEARCH_QUERY)
 
   98             .map(BlackboardArtifact.Type::new)
 
   99             .collect(Collectors.toList());
 
          // Ingest job context; assigned in the constructor and polled for cancellation in findDomainTypes().
  100     private final IngestJobContext context;
 
  111     private static long getTimeOrZero(Map<Integer, BlackboardAttribute> attrMap, 
int attrTypeId) {
              // Looks up the attribute stored under attrTypeId and returns its long value,
              // or 0 when the map has no entry for that type id.
              // NOTE(review): the body of the null-map guard is not visible in this listing;
              // presumably it returns 0 as the method name suggests - confirm.
 
  112         if (attrMap == null) {
 
  116         BlackboardAttribute attr = attrMap.get(attrTypeId);
 
  117         return attr == null ? 0 : attr.getValueLong();
 
  129     private static String getStringOrEmpty(Map<Integer, BlackboardAttribute> attrMap, 
int attrTypeId) {
              // Looks up the attribute stored under attrTypeId and returns its string value.
              // A missing attribute or a null string value both yield "", so this path never returns null.
              // NOTE(review): the body of the null-map guard is not visible in this listing;
              // presumably it returns "" as the method name suggests - confirm.
 
  130         if (attrMap == null) {
 
  134         BlackboardAttribute attr = attrMap.get(attrTypeId);
 
  135         String attrStr = attr == null ? 
"" : attr.getValueString();
 
  136         return attrStr == null ? 
"" : attrStr;
 
  142     private static final Comparator<BlackboardArtifact> ARTIFACT_COMPARATOR = (a, b) -> {
              // Orders artifacts by accessed time, then by URL (case-insensitive), then by artifact id.
              // The maps stay null if attribute fetching throws; getTimeOrZero/getStringOrEmpty
              // are called with them either way.
 
  144         Map<Integer, BlackboardAttribute> attrMapA = null;
 
  145         Map<Integer, BlackboardAttribute> attrMapB = null;
 
                  // Index each artifact's attributes by type id; on duplicate type ids the first one wins.
  148             attrMapA = a.getAttributes()
 
  150                     .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
 
  152             attrMapB = b.getAttributes()
 
  154                     .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
 
  156         } 
catch (TskCoreException ex) {
 
  157             logger.log(Level.WARNING, 
"There was an error fetching attributes for artifacts", ex);
 
              // First key: accessed time (0 when absent).
  162         int timeCompare = Long.compare(getTimeOrZero(attrMapA, DATETIME_ACCESSED_TYPEID), getTimeOrZero(attrMapB, DATETIME_ACCESSED_TYPEID));
 
  163         if (timeCompare != 0) {
 
              // Second key: URL string, ignoring case ("" when absent).
  169         int urlCompare = getStringOrEmpty(attrMapA, URL_TYPEID).compareToIgnoreCase(getStringOrEmpty(attrMapB, URL_TYPEID));
 
  170         if (urlCompare != 0) {
 
              // Final tie-breaker: artifact id.
  175         return Long.compare(a.getId(), b.getId());
 
  178     private Content dataSource; // set by process(); its id restricts the artifact query in findDomainTypes()
 
          // Categorizers consulted in order by findCategory(); replaced with the initialized list in startUp().
  179     private List<DomainCategorizer> domainProviders = Collections.emptyList();
 
          // Passes the bundled module name and the context to the Extract superclass
          // and keeps the context for later cancellation checks.
  184     DomainCategoryRunner(IngestJobContext context) {
 
  185         super(Bundle.DomainCategoryRunner_moduleName_text(), context);
 
  186         this.context = context;
 
  197     private String getHost(String urlString) {
              // Extracts the host from a URL string: first via java.net.URL, and if that leaves
              // the host blank, via the "host" group of URL_REGEX.
              // NOTE(review): the declaration of `host`, the try header, the matcher check and the
              // return statement are not visible in this listing.
 
  201             URL url = 
new URL(urlString);
 
  203                 host = url.getHost();
 
  205         } 
catch (MalformedURLException ignore) {
                  // A malformed URL is deliberately not reported; the regex fallback below handles it.
 
  210         if (StringUtils.isBlank(host)) {
 
  211             Matcher m = URL_REGEX.matcher(urlString);
 
  213                 host = m.group(
"host");
 
  228     private DomainCategory findCategory(String domain, String host) {
              // Asks each provider in list order for a category for the given domain/host.
              // A provider that throws DomainCategorizerException is logged at WARNING level.
              // NOTE(review): the handling of a non-null result and the final return are not
              // visible in this listing; presumably the first non-null result is returned - confirm.
 
              // Treat a null provider list as empty.
  229         List<DomainCategorizer> safeProviders = domainProviders == null ? Collections.emptyList() : domainProviders;
 
  230         for (DomainCategorizer provider : safeProviders) {
 
  231             DomainCategory result;
 
  233                 result = provider.getCategory(domain, host);
 
  234                 if (result != null) {
 
  237             } 
catch (DomainCategorizerException ex) {
 
  238                 logger.log(Level.WARNING, 
"There was an error processing results with " + provider.getClass().getCanonicalName(), ex);
 
  265             this.abstractFile = abstractFile;
 
  267             this.domain = domain;
 
  303     private ArtifactHost getDomainAndHost(BlackboardArtifact artifact) 
throws TskCoreException {
              // Builds an ArtifactHost (file, lower-cased host, lower-cased domain) from the
              // artifact's TSK_URL and TSK_DOMAIN attributes.
 
              // The file the artifact is attached to, looked up by the artifact's object id.
  305         AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
 
              // Host is derived from the TSK_URL attribute when it is present and non-blank.
  311         BlackboardAttribute urlAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
 
  312         String urlString = null;
 
  314         if (urlAttr != null) {
 
  315             urlString = urlAttr.getValueString();
 
  316             if (StringUtils.isNotBlank(urlString)) {
 
  317                 host = getHost(urlString);
 
              // Domain is taken directly from the TSK_DOMAIN attribute when present.
  322         BlackboardAttribute domainAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
 
  323         String domainString = null;
 
  324         if (domainAttr != null) {
 
  325             domainString = domainAttr.getValueString();
 
  328         boolean hasDomain = StringUtils.isNotBlank(domainString);
 
  329         boolean hasHost = StringUtils.isNotBlank(host);
 
              // Fill in whichever of domain/host is missing from the other.
              // NOTE(review): the bodies of the first and last branches are not visible in this
              // listing. The toLowerCase() calls in the return below need the first branch to
              // exit early and the last branch to assign `host` - confirm.
  332         if (!hasDomain && !hasHost) {
 
  334         } 
else if (!hasDomain) {
 
  335             domainString = NetworkUtils.extractDomain(host);
 
  336         } 
else if (!hasHost) {
 
  340         return new ArtifactHost(file, host.toLowerCase(), domainString.toLowerCase());
 
  353     private static boolean isDuplicateOrAdd(Set<String> items, String item) {
              // Distinguishes three cases: blank item, item already in the set, and a new item.
              // NOTE(review): the return values and the add() call are not visible in this
              // listing; per the method name, a new item is presumably added and reported as
              // not a duplicate - confirm, including what a blank item returns.
 
  354         if (StringUtils.isBlank(item)) {
 
  356         } 
else if (items.contains(item)) {
 
  369     private void findDomainTypes() {
              // Fetches the web artifacts of the current data source, sorts them with
              // ARTIFACT_COMPARATOR, and looks up a category for each artifact's domain/host.
              // addCategoryArtifact() is called for artifacts that pass the visible checks below.
              // NOTE(review): the bodies of the guard `if`s (presumably break/continue) and the
              // increment of artifactsAnalyzed are not visible in this listing.
 
  370         int artifactsAnalyzed = 0;
 
  371         int domainTypeInstancesFound = 0;
 
              // Hosts already looked up, checked through isDuplicateOrAdd().
  374         Set<String> hostsSeen = 
new HashSet<>();
 
              // Host suffixes checked through isDuplicateOrAdd() before a categorization artifact is created.
  377         Set<String> hostSuffixesSeen = 
new HashSet<>();
 
  379             List<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
 
  380                     DOMAIN_CATEGORIZATION_TYPES,
 
  381                     Arrays.asList(dataSource.getId()));
 
  383             logger.log(Level.INFO, 
"Processing {0} blackboard artifacts.", listArtifacts.size()); 
 
  384             Collections.sort(listArtifacts, ARTIFACT_COMPARATOR);
 
  386             for (BlackboardArtifact artifact : listArtifacts) {
 
                      // Cancellation is checked once per artifact.
  388                 if (context.dataSourceIngestIsCancelled()) {
 
  394                 ArtifactHost curArtHost = getDomainAndHost(artifact);
 
  395                 if (curArtHost == null || isDuplicateOrAdd(hostsSeen, curArtHost.getHost())) {
 
  403                 DomainCategory domainEntryFound = findCategory(curArtHost.getDomain(), curArtHost.getHost());
 
  404                 if (domainEntryFound == null) {
 
  409                 String hostSuffix = domainEntryFound.getHostSuffix();
 
  410                 String domainCategory = domainEntryFound.getCategory();
 
                      // A category result without both a suffix and a category name is checked for here.
  411                 if (StringUtils.isBlank(hostSuffix) || StringUtils.isBlank(domainCategory)) {
 
  416                 domainTypeInstancesFound++;
 
  418                 if (isDuplicateOrAdd(hostSuffixesSeen, hostSuffix)) {
 
  423                 addCategoryArtifact(curArtHost, domainCategory);
 
  425         } 
catch (TskCoreException e) {
 
                  // NOTE(review): this and the summary message below say "messaging domain", while
                  // this class categorizes domains in general - the wording looks stale; confirm.
  426             logger.log(Level.SEVERE, 
"Encountered error retrieving artifacts for messaging domains", e); 
 
  428             if (context.dataSourceIngestIsCancelled()) {
 
  429                 logger.info(
"Operation terminated by user."); 
 
  431             logger.log(Level.INFO, String.format(
"Extracted %s distinct messaging domain(s) from the blackboard.  " 
  432                     + 
"Of the %s artifact(s) with valid hosts, %s url(s) contained messaging domain suffix.",
 
  433                     hostSuffixesSeen.size(), artifactsAnalyzed, domainTypeInstancesFound));
 
  444     private void addCategoryArtifact(ArtifactHost artHost, String domainCategory) 
throws TskCoreException {
              // Creates and posts a TSK_WEB_CATEGORIZATION artifact on the host's file, carrying
              // the domain, the host, and the category (stored in the TSK_NAME attribute).
 
              // Attributes are attributed to the parent module name from the bundle.
  445         String moduleName = Bundle.DomainCategoryRunner_parentModuleName();
 
  446         Collection<BlackboardAttribute> bbattributes = Arrays.asList(
 
  447                 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN, moduleName, artHost.getDomain()),
 
  448                 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_HOST, moduleName, artHost.getHost()),
 
  449                 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_NAME, moduleName, domainCategory)
 
  451         postArtifact(createArtifactWithAttributes(BlackboardArtifact.Type.TSK_WEB_CATEGORIZATION, artHost.getAbstractFile(), bbattributes));
 
  455     public void process(Content dataSource, DataSourceIngestModuleProgress progressBar) {
              // Entry point for one data source: remembers the data source, updates the progress
              // message, then runs the domain categorization pass.
 
  456         this.dataSource = dataSource;
 
  457         progressBar.progress(Bundle.DomainCategoryRunner_Progress_Message_Domain_Types());
 
  458         this.findDomainTypes();
 
  462     void startUp() throws IngestModule.IngestModuleException {
              // Builds the ordered provider list and initializes every provider. Visible order:
              // custom categorizer (if found), DefaultPriorityDomainCategorizer, remaining
              // looked-up categorizers sorted by class name, DefaultDomainCategorizer.
              // NOTE(review): the stream source lines and a few chained calls are not visible
              // in this listing.
 
  464         Collection<? extends DomainCategorizer> lookupCollection = Lookup.getDefault().lookupAll(DomainCategorizer.class);
 
              // Guard against a null lookup result.
  465         Collection<? extends DomainCategorizer> lookupList = (lookupCollection == null)
 
  466                 ? Collections.emptyList()
 
  470         List<DomainCategorizer> foundProviders = 
new ArrayList<>();
 
                      // Custom categorizer goes first when one is found.
                      // NOTE(review): no null filter is visible before this getClass() call, unlike
                      // the later pipeline - confirm whether the lookup can contain nulls.
  474                 .filter(categorizer -> categorizer.getClass().getName().contains(CUSTOM_CATEGORIZER_PATH))
 
  476                 .ifPresent((provider) -> foundProviders.add(provider));
 
  479         foundProviders.add(
new DefaultPriorityDomainCategorizer());
 
                      // Remaining categorizers: skip nulls and the three already handled explicitly,
                      // then add them ordered by class name (case-insensitive).
  484                 .filter(categorizer -> categorizer != null)
 
  485                 .filter(categorizer -> {
 
  486                     String className = categorizer.getClass().getName();
 
  487                     return !className.contains(CUSTOM_CATEGORIZER_PATH)
 
  488                             && !className.equals(DefaultPriorityDomainCategorizer.class.getName())
 
  489                             && !className.equals(DefaultDomainCategorizer.class.getName());
 
  491                 .sorted((a, b) -> a.getClass().getName().compareToIgnoreCase(b.getClass().getName()))
 
  492                 .forEach(foundProviders::add);
 
              // The default categorizer is consulted last.
  495         foundProviders.add(
new DefaultDomainCategorizer());
 
              // Any provider that fails to initialize aborts start-up with the cause attached.
  497         for (DomainCategorizer provider : foundProviders) {
 
  499                 provider.initialize();
 
  500             } 
catch (DomainCategorizerException ex) {
 
  501                 throw new IngestModule.IngestModuleException(
"There was an error instantiating the provider: " 
  502                         + provider.getClass().getSimpleName(), ex);
 
  506         this.domainProviders = foundProviders;
 
  510     public void shutDown() {
              // Iterates over the providers; an exception from one provider is logged at WARNING
              // level and caught inside the loop.
              // NOTE(review): the call inside the try block is not visible in this listing;
              // judging by the log message it closes the provider - confirm.
 
  511         if (this.domainProviders != null) {
 
  512             for (DomainCategorizer provider : this.domainProviders) {
 
  515                 } 
catch (Exception ex) {
 
  516                     logger.log(Level.WARNING, 
"There was an error closing " + provider.getClass().getName(), ex);
 
final AbstractFile abstractFile