Autopsy  4.20.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
DomainTokenizer.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2020 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.coreutils;
20 
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
31 
36 class DomainTokenizer {
37 
43  private static class DomainCategory extends HashMap<String, DomainCategory> {
44 
45  private DomainCategory getOrAddChild(String childKey) {
46  DomainCategory cat = this.get(childKey);
47  if (cat == null) {
48  cat = new DomainCategory();
49  this.put(childKey, cat);
50  }
51 
52  return cat;
53  }
54  }
55 
56  // Character for joining domain segments.
57  private static final String JOINER = ".";
58  // delimiter when used with regex
59  private static final String DELIMITER = "\\" + JOINER;
60 
61  private static final String WILDCARD = "*";
62  private static final String EXCEPTION_PREFIX = "!";
63 
64  // taken from https://publicsuffix.org/list/public_suffix_list.dat
65  // file containing line seperated suffixes
66  // rules for parsing can be found here: https://publicsuffix.org/list/
67  private static final String DOMAIN_LIST = "public_suffix_list.dat";
68 
69  // token for comments
70  private static final String COMMENT_TOKEN = "//";
71 
72  // singleton instance of this class.
73  private static DomainTokenizer categorizer = null;
74 
81  static DomainTokenizer getInstance() throws IOException {
82  if (categorizer == null) {
83  categorizer = load();
84  }
85 
86  return categorizer;
87  }
88 
95  private static DomainTokenizer load() throws IOException {
96  try (InputStream is = DomainTokenizer.class.getResourceAsStream(DOMAIN_LIST);
97  InputStreamReader isReader = new InputStreamReader(is, StandardCharsets.UTF_8);
98  BufferedReader reader = new BufferedReader(isReader)) {
99 
100  DomainTokenizer categorizer = new DomainTokenizer();
101  while (reader.ready()) {
102  String line = reader.readLine();
103  String trimmed = line.trim();
104  if (!StringUtils.isBlank(trimmed) && !trimmed.startsWith(COMMENT_TOKEN)) {
105  categorizer.addDomainSuffix(trimmed);
106  }
107  }
108 
109  return categorizer;
110  }
111  }
112 
113  private DomainTokenizer() {
114  }
115 
116  // The top-level trie node.
117  private final DomainCategory trie = new DomainCategory();
118 
125  private void addDomainSuffix(String domainSuffix) {
126  if (StringUtils.isBlank(domainSuffix)) {
127  return;
128  }
129 
130  String[] tokens = domainSuffix.toLowerCase().trim().split(DELIMITER);
131 
132  DomainCategory cat = trie;
133  for (int i = tokens.length - 1; i >= 0; i--) {
134  String token = tokens[i];
135  if (StringUtils.isBlank(token)) {
136  continue;
137  }
138 
139  cat = cat.getOrAddChild(tokens[i]);
140  }
141  }
142 
153  String getDomain(String domain) {
154  if (StringUtils.isBlank(domain)) {
155  return "";
156  }
157 
158  List<String> tokens = Stream.of(domain.toLowerCase().split(DELIMITER))
159  .filter(StringUtils::isNotBlank)
160  .collect(Collectors.toList());
161 
162  int idx = tokens.size() - 1;
163  DomainCategory cat = trie;
164 
165  for (; idx >= 0; idx--) {
166  // an exception rule must be at the beginning of a suffix, and, in
167  // practice, indicates a domain that would otherwise be a further
168  // suffix with a wildcard rule per: https://publicsuffix.org/list/
169  if (cat.get(EXCEPTION_PREFIX + tokens.get(idx)) != null) {
170  break;
171  }
172 
173  DomainCategory newCat = cat.get(tokens.get(idx));
174 
175  // if no matching token can be found, look for wildcard token
176  if (newCat == null) {
177  // if no wildcard token can be found, the portion found
178  // so far is the suffix.
179  newCat = cat.get(WILDCARD);
180  if (newCat == null) {
181  break;
182  }
183  }
184 
185  cat = newCat;
186  }
187 
188  // if first suffix cannot be found, return the whole domain
189  if (idx == tokens.size() - 1) {
190  return domain;
191  } else {
192  int minIndex = Math.max(0, idx);
193  List<String> subList = tokens.subList(minIndex, tokens.size());
194  return String.join(JOINER, subList);
195  }
196  }
197 }

Copyright © 2012-2022 Basis Technology. Generated on: Tue Aug 1 2023
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.