19 package org.sleuthkit.autopsy.recentactivity;
 
   21 import java.net.MalformedURLException;
 
   23 import java.util.Arrays;
 
   24 import java.util.Collection;
 
   25 import java.util.Collections;
 
   26 import java.util.Comparator;
 
   27 import java.util.HashSet;
 
   28 import java.util.List;
 
   30 import java.util.logging.Level;
 
   32 import java.util.regex.Matcher;
 
   33 import java.util.regex.Pattern;
 
   34 import java.util.stream.Collectors;
 
   35 import org.apache.commons.lang.StringUtils;
 
   36 import org.openide.util.Lookup;
 
   37 import org.openide.util.NbBundle.Messages;
 
   45 import org.
sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
 
   47 import org.
sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
 
   60     "DomainCategoryRunner_moduleName_text=DomainCategoryRunner",
 
   61     "DomainCategoryRunner_Progress_Message_Domain_Types=Finding Domain Types",
 
   62     "DomainCategoryRunner_parentModuleName=Recent Activity" 
   64 class DomainCategoryRunner extends Extract {
 
   69     private static final String URL_REGEX_SCHEME = 
"(((?<scheme>[^:\\/?#]+):?)?\\/\\/)";
 
   71     private static final String URL_REGEX_USERINFO = 
"((?<userinfo>[^\\/?#@]*)@)";
 
   72     private static final String URL_REGEX_HOST = 
"(?<host>[^\\/\\.?#:]*\\.[^\\/?#:]*)";
 
   73     private static final String URL_REGEX_PORT = 
"(:(?<port>[0-9]{1,5}))";
 
   74     private static final String URL_REGEX_AUTHORITY = String.format(
"(%s?%s?%s?\\/?)", URL_REGEX_USERINFO, URL_REGEX_HOST, URL_REGEX_PORT);
 
   76     private static final String URL_REGEX_PATH = 
"(?<path>([^?#]*)(\\?([^#]*))?(#(.*))?)";
 
   78     private static final String URL_REGEX_STR = String.format(
"^\\s*%s?%s?%s?", URL_REGEX_SCHEME, URL_REGEX_AUTHORITY, URL_REGEX_PATH);
 
   79     private static final Pattern URL_REGEX = Pattern.compile(URL_REGEX_STR);
 
   81     private static int DATETIME_ACCESSED_TYPEID = ATTRIBUTE_TYPE.TSK_DATETIME_ACCESSED.getTypeID();
 
   82     private static int URL_TYPEID = ATTRIBUTE_TYPE.TSK_URL.getTypeID();
 
   84     private static final Logger logger = Logger.getLogger(DomainCategoryRunner.class.getName());
 
   94     private static long getTimeOrZero(Map<Integer, BlackboardAttribute> attrMap, 
int attrTypeId) {
 
   95         if (attrMap == null) {
 
   99         BlackboardAttribute attr = attrMap.get(attrTypeId);
 
  100         return attr == null ? 0 : attr.getValueLong();
 
  111     private static String getStringOrEmpty(Map<Integer, BlackboardAttribute> attrMap, 
int attrTypeId) {
 
  112         if (attrMap == null) {
 
  116         BlackboardAttribute attr = attrMap.get(attrTypeId);
 
  117         String attrStr = attr == null ? 
"" : attr.getValueString();
 
  118         return attrStr == null ? 
"" : attrStr;
 
  124     private static final Comparator<BlackboardArtifact> ARTIFACT_COMPARATOR = (a, b) -> {
 
  126         Map<Integer, BlackboardAttribute> attrMapA = null;
 
  127         Map<Integer, BlackboardAttribute> attrMapB = null;
 
  130             attrMapA = a.getAttributes()
 
  132                     .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
 
  134             attrMapB = b.getAttributes()
 
  136                     .collect(Collectors.toMap(attr -> attr.getAttributeType().getTypeID(), attr -> attr, (attr1, attr2) -> attr1));
 
  138         } 
catch (TskCoreException ex) {
 
  139             logger.log(Level.WARNING, 
"There was an error fetching attributes for artifacts", ex);
 
  144         int timeCompare = Long.compare(getTimeOrZero(attrMapA, DATETIME_ACCESSED_TYPEID), getTimeOrZero(attrMapB, DATETIME_ACCESSED_TYPEID));
 
  145         if (timeCompare != 0) {
 
  151         int urlCompare = getStringOrEmpty(attrMapA, URL_TYPEID).compareToIgnoreCase(getStringOrEmpty(attrMapB, URL_TYPEID));
 
  152         if (urlCompare != 0) {
 
  157         return Long.compare(a.getId(), b.getId());
 
    // Data source whose artifacts are examined; assigned at the start of process().
    private Content dataSource;

    // Ingest job context; assigned in process() and polled for cancellation
    // while artifacts are being examined.
    private IngestJobContext context;

    // Categorizers consulted in list order by findCategory(); replaced by
    // configExtractor() and iterated again in complete().
    private List<DomainCategorizer> domainProviders = Collections.emptyList();
 
  167     DomainCategoryRunner() {
 
  178     private String getHost(String urlString) {
 
  182             URL url = 
new URL(urlString);
 
  184                 host = url.getHost();
 
  186         } 
catch (MalformedURLException ignore) {
 
  191         if (StringUtils.isBlank(host)) {
 
  192             Matcher m = URL_REGEX.matcher(urlString);
 
  194                 host = m.group(
"host");
 
  208     private DomainCategory findCategory(String domain, String host) {
 
  209         List<DomainCategorizer> safeProviders = domainProviders == null ? Collections.emptyList() : domainProviders;
 
  210         for (DomainCategorizer provider : safeProviders) {
 
  211             DomainCategory result;
 
  213                 result = provider.getCategory(domain, host);
 
  214                 if (result != null) {
 
  217             } 
catch (DomainCategorizerException ex) {
 
  218                 logger.log(Level.WARNING, 
"There was an error processing results with " + provider.getClass().getCanonicalName(), ex);
 
  242         ArtifactHost(AbstractFile abstractFile, String host, String domain) {
 
  243             this.abstractFile = abstractFile;
 
  245             this.domain = domain;
 
  251         AbstractFile getAbstractFile() {
 
  279     private ArtifactHost getDomainAndHost(BlackboardArtifact artifact) 
throws TskCoreException {
 
  281         AbstractFile file = tskCase.getAbstractFileById(artifact.getObjectID());
 
  287         BlackboardAttribute urlAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL));
 
  288         String urlString = null;
 
  290         if (urlAttr != null) {
 
  291             urlString = urlAttr.getValueString();
 
  292             if (StringUtils.isNotBlank(urlString)) {
 
  293                 host = getHost(urlString);
 
  298         BlackboardAttribute domainAttr = artifact.getAttribute(
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_DOMAIN));
 
  299         String domainString = null;
 
  300         if (domainAttr != null) {
 
  301             domainString = domainAttr.getValueString();
 
  304         boolean hasDomain = StringUtils.isNotBlank(domainString);
 
  305         boolean hasHost = StringUtils.isNotBlank(host);
 
  308         if (!hasDomain && !hasHost) {
 
  310         } 
else if (!hasDomain) {
 
  311             domainString = NetworkUtils.extractDomain(host);
 
  312         } 
else if (!hasHost) {
 
  316         return new ArtifactHost(file, host.toLowerCase(), domainString.toLowerCase());
 
  328     private static boolean isDuplicateOrAdd(Set<String> items, String item) {
 
  329         if (StringUtils.isBlank(item)) {
 
  331         } 
else if (items.contains(item)) {
 
  344     private void findDomainTypes() {
 
  345         int artifactsAnalyzed = 0;
 
  346         int domainTypeInstancesFound = 0;
 
  349         Set<String> hostsSeen = 
new HashSet<>();
 
  352         Set<String> hostSuffixesSeen = 
new HashSet<>();
 
  354             List<BlackboardArtifact> listArtifacts = currentCase.getSleuthkitCase().getBlackboard().getArtifacts(
 
  355                     Arrays.asList(
new BlackboardArtifact.Type(ARTIFACT_TYPE.TSK_WEB_HISTORY)),
 
  356                     Arrays.asList(dataSource.getId()));
 
  358             logger.log(Level.INFO, 
"Processing {0} blackboard artifacts.", listArtifacts.size()); 
 
  359             Collections.sort(listArtifacts, ARTIFACT_COMPARATOR);
 
  361             for (BlackboardArtifact artifact : listArtifacts) {
 
  363                 if (context.dataSourceIngestIsCancelled()) {
 
  368                 ArtifactHost curArtHost = getDomainAndHost(artifact);
 
  369                 if (curArtHost == null || isDuplicateOrAdd(hostsSeen, curArtHost.getHost())) {
 
  377                 DomainCategory domainEntryFound = findCategory(curArtHost.getDomain(), curArtHost.getHost());
 
  378                 if (domainEntryFound == null) {
 
  383                 String hostSuffix = domainEntryFound.getHostSuffix();
 
  384                 String domainCategory = domainEntryFound.getCategory();
 
  385                 if (StringUtils.isBlank(hostSuffix) || StringUtils.isBlank(domainCategory)) {
 
  390                 domainTypeInstancesFound++;
 
  392                 if (isDuplicateOrAdd(hostSuffixesSeen, hostSuffix)) {
 
  397                 addCategoryArtifact(curArtHost, domainCategory);
 
  399         } 
catch (TskCoreException e) {
 
  400             logger.log(Level.SEVERE, 
"Encountered error retrieving artifacts for messaging domains", e); 
 
  402             if (context.dataSourceIngestIsCancelled()) {
 
  403                 logger.info(
"Operation terminated by user."); 
 
  405             logger.log(Level.INFO, String.format(
"Extracted %s distinct messaging domain(s) from the blackboard.  " 
  406                     + 
"Of the %s artifact(s) with valid hosts, %s url(s) contained messaging domain suffix.",
 
  407                     hostSuffixesSeen.size(), artifactsAnalyzed, domainTypeInstancesFound));
 
  418     private void addCategoryArtifact(ArtifactHost artHost, String domainCategory) {
 
  419         String moduleName = Bundle.DomainCategoryRunner_parentModuleName();
 
  420         Collection<BlackboardAttribute> bbattributes = Arrays.asList(
 
  421                 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_DOMAIN, moduleName, artHost.getDomain()),
 
  422                 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_HOST, moduleName, artHost.getHost()),
 
  423                 new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_NAME, moduleName, domainCategory)
 
  425         postArtifact(createArtifactWithAttributes(ARTIFACT_TYPE.TSK_WEB_CATEGORIZATION, artHost.getAbstractFile(), bbattributes));
 
  429     public void process(Content dataSource, IngestJobContext context, DataSourceIngestModuleProgress progressBar) {
 
  430         this.dataSource = dataSource;
 
  431         this.context = context;
 
  433         progressBar.progress(Bundle.DomainCategoryRunner_Progress_Message_Domain_Types());
 
  434         this.findDomainTypes();
 
  438     void configExtractor() throws IngestModule.IngestModuleException {
 
  440         Collection<? extends DomainCategorizer> lookupList = Lookup.getDefault().lookupAll(DomainCategorizer.class);
 
  441         if (lookupList == null) {
 
  442             lookupList = Collections.emptyList();
 
  445         List<DomainCategorizer> foundProviders = lookupList.stream()
 
  446                 .filter(provider -> provider != null)
 
  447                 .sorted((a, b) -> a.getClass().getName().compareToIgnoreCase(b.getClass().getName()))
 
  448                 .collect(Collectors.toList());
 
  451         foundProviders.add(
new DefaultDomainCategorizer());
 
  453         for (DomainCategorizer provider : foundProviders) {
 
  455                 provider.initialize();
 
  456             } 
catch (DomainCategorizerException ex) {
 
  457                 throw new IngestModule.IngestModuleException(
"There was an error instantiating the provider: " + provider.getClass().getSimpleName(), ex);
 
  461         this.domainProviders = foundProviders;
 
  465     public void complete() {
 
  466         if (this.domainProviders != null) {
 
  467             for (DomainCategorizer provider : this.domainProviders) {
 
  470                 } 
catch (Exception ex) {
 
  471                     logger.log(Level.WARNING, 
"There was an error closing " + provider.getClass().getName(), ex);
 
  476         logger.info(
"Domain categorization completed."); 
 
final AbstractFile abstractFile