Autopsy 4.20.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
InlineSearcher.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2022 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import com.twelvemonkeys.lang.StringUtil;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.validator.routines.DomainValidator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.casemodule.NoCurrentCaseException;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import static org.sleuthkit.autopsy.keywordsearch.RegexQuery.CREDIT_CARD_NUM_PATTERN;
import org.sleuthkit.datamodel.Blackboard;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskException;

final class InlineSearcher {

    private final List<KeywordList> keywordList;
    private static final int MIN_EMAIL_ADDR_LENGTH = 8;
    private static final Logger logger = Logger.getLogger(InlineSearcher.class.getName());

    private final IngestJobContext context;

    static final Map<Long, List<UniqueKeywordHit>> uniqueHitMap = new ConcurrentHashMap<>();

    static final Map<Long, Map<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>>> uniqueHitMap2 = new ConcurrentHashMap<>();

    // Uses mostly native Java and the Lucene API to search a given chunk
    // for keywords, creating a UniqueKeywordHit for each unique hit.
    InlineSearcher(List<String> keywordListNames, IngestJobContext context) {
        this.keywordList = new ArrayList<>();
        this.context = context;

        if (keywordListNames != null) {
            XmlKeywordSearchList loader = XmlKeywordSearchList.getCurrent();
            for (String name : keywordListNames) {
                keywordList.add(loader.getList(name));
            }
        }
    }

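    /**
     * Searches the text of the given chunk for the keywords in the selected
     * lists.
     *
     * @param chunk    The chunk to search.
     * @param sourceID The object ID of the chunk's source content.
     * @param chunkId  The ID of the chunk.
     *
     * @return True if at least one keyword hit was found, false otherwise.
     *
     * @throws TskCoreException
     */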
    boolean searchChunk(Chunk chunk, long sourceID, int chunkId) throws TskCoreException {
        return searchString(chunk.getLowerCasedChunk(), sourceID, chunkId);
    }

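    /**
     * Searches the given string for the keywords in the selected lists and
     * records the unique hits for this ingest job and source.
     *
     * @param text     The text to search.
     * @param sourceID The object ID of the source content.
     * @param chunkId  The ID of the chunk the text came from.
     *
     * @return True if at least one keyword hit was found, false otherwise.
     *
     * @throws TskCoreException
     */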
    boolean searchString(String text, long sourceID, int chunkId) throws TskCoreException {
        boolean hitFound = false;
        Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>> hitByKeyword = getMap(context.getJobId(), sourceID);
        for (KeywordList list : keywordList) {
            List<Keyword> keywords = list.getKeywords();
            for (Keyword originalKeyword : keywords) {
                Map<Keyword, List<UniqueKeywordHit>> hitMap = hitByKeyword.get(originalKeyword);
                if (hitMap == null) {
                    hitMap = new HashMap<>();
                    hitByKeyword.put(originalKeyword, hitMap);
                }

                List<UniqueKeywordHit> keywordHits = new ArrayList<>();
                if (originalKeyword.searchTermIsLiteral()) {
                    if (StringUtil.containsIgnoreCase(text, originalKeyword.getSearchTerm())) {
                        keywordHits.addAll(createKeywordHits(text, originalKeyword, sourceID, chunkId, list.getName()));
                    }
                } else {
                    String regex = originalKeyword.getSearchTerm();

                    try {
                        // validate the regex
                        Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
                        Matcher matcher = pattern.matcher(text);

                        if (matcher.find()) {
                            keywordHits.addAll(createKeywordHits(text, originalKeyword, sourceID, chunkId, list.getName()));
                        }
                    } catch (IllegalArgumentException ex) {
                        // An invalid regex is a user configuration problem,
                        // not a reason to abort the search; log it and move
                        // on to the next keyword.
                        logger.log(Level.WARNING, String.format("Invalid keyword regex: %s", regex), ex);
                    }
                }

                if (!keywordHits.isEmpty()) {
                    hitFound = true;
                    for (UniqueKeywordHit hit : keywordHits) {
                        Keyword keywordCopy = new Keyword(hit.getHit(),
                                originalKeyword.searchTermIsLiteral(),
                                originalKeyword.searchTermIsWholeWord(),
                                list.getName(),
                                originalKeyword.getOriginalTerm());

                        List<UniqueKeywordHit> mapHitList = hitMap.get(keywordCopy);
                        if (mapHitList == null) {
                            mapHitList = new ArrayList<>();
                            hitMap.put(keywordCopy, mapHitList);
                        }

                        if (!mapHitList.contains(hit)) {
                            mapHitList.add(hit);
                        }
                    }
                }

                if (context.fileIngestIsCancelled()) {
                    return hitFound;
                }
            }
        }
        return hitFound;
    }

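    /**
     * Creates the unique keyword hits for the given keyword in the given
     * text. Literal whole-word terms are delegated to getExactMatchHits();
     * everything else is matched with a regex built from the search term.
     *
     * @param text            The text to search.
     * @param originalKeyword The keyword being searched for.
     * @param sourceID        The object ID of the source content.
     * @param chunkId         The ID of the chunk the text came from.
     * @param keywordListName The name of the keyword list.
     *
     * @return The list of unique hits, possibly empty.
     *
     * @throws TskCoreException
     */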
    private List<UniqueKeywordHit> createKeywordHits(String text, Keyword originalKeyword, long sourceID, int chunkId, String keywordListName) throws TskCoreException {

        if (originalKeyword.searchTermIsLiteral() && originalKeyword.searchTermIsWholeWord()) {
            try {
                return getExactMatchHits(text, originalKeyword, sourceID, chunkId, keywordListName);
            } catch (IOException ex) {
                throw new TskCoreException("Failed to create exactMatch hits", ex);
            }
        }

        final HashMap<String, String> keywordsFoundInThisDocument = new HashMap<>();

        List<UniqueKeywordHit> hits = new ArrayList<>();
        String keywordString = originalKeyword.getSearchTerm();

        boolean queryStringContainsWildcardSuffix = originalKeyword.getSearchTerm().endsWith(".*");

        String searchPattern;
        if (originalKeyword.searchTermIsLiteral()) {
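            // Pad the quoted literal term with optional word, dot, and
            // apostrophe characters so the match captures the whole token
            // that contains the term.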
            searchPattern = "[\\w[\\.']]*" + java.util.regex.Pattern.quote(keywordString.toLowerCase()) + "[\\w[\\.']]*";

        } else {
            searchPattern = keywordString;
        }

        final java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(searchPattern, Pattern.CASE_INSENSITIVE);

        try {
            String content = text;
            Matcher hitMatcher = pattern.matcher(content);
            int offset = 0;

            while (hitMatcher.find(offset)) {

                String hit = hitMatcher.group().toLowerCase();

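                // A zero-length match would leave the offset unchanged and
                // loop forever, so stop here.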
225  if ("".equals(hit)) {
226  break;
227  }
228 
229  offset = hitMatcher.end();
230  final BlackboardAttribute.ATTRIBUTE_TYPE artifactAttributeType = originalKeyword.getArtifactAttributeType();

                // We attempt to reduce false positives for phone number and
                // IP address hits by requiring the hit to be delimited by a
                // set of known boundary characters (see
                // KeywordSearchList.PHONE_NUMBER_REGEX for an example).
                // Because of this, a hit may contain an extra character at
                // the beginning or end that needs to be chopped off, unless
                // the user has supplied their own wildcard suffix as part of
                // the regex.
                if (!queryStringContainsWildcardSuffix
                        && (artifactAttributeType == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PHONE_NUMBER
                        || artifactAttributeType == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_IP_ADDRESS)) {
                    if (artifactAttributeType == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PHONE_NUMBER) {
                        // For phone numbers, replace all non-numeric characters (except "(") at the start of the hit.
                        hit = hit.replaceAll("^[^0-9\\(]", "");
                    } else {
                        // Replace all non-numeric characters at the start of the hit.
                        hit = hit.replaceAll("^[^0-9]", "");
                    }
                    // Replace all non-numeric characters at the end of the hit.
                    hit = hit.replaceAll("[^0-9]$", "");

                    if (offset > 1) {
                        /*
                         * NOTE: our IP and phone number regex patterns look
                         * for boundary characters immediately before and
                         * after the keyword hit. After a match, the Java
                         * pattern matcher restarts at the first character not
                         * matched by the previous match. This effectively
                         * requires two boundary characters between each
                         * pattern match. To mitigate this we reset the offset
                         * one character back.
                         */
                        offset--;
                    }
                }

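                // For literal terms, the wildcard padding in the search
                // pattern can pick up boundary characters around the term;
                // strip them from both ends of the hit.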
                if (originalKeyword.searchTermIsLiteral()) {
                    hit = hit.replaceAll("^" + KeywordSearchList.BOUNDARY_CHARACTERS + "*", "");
                    hit = hit.replaceAll(KeywordSearchList.BOUNDARY_CHARACTERS + "*$", "");
                }

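                // Many hits are identical strings; interning them keeps a
                // single copy on the heap.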
                hit = hit.intern();

                // We will only create one KeywordHit instance per document
                // for a given hit.
                if (keywordsFoundInThisDocument.containsKey(hit)) {
                    continue;
                }
                keywordsFoundInThisDocument.put(hit, hit);

                if (artifactAttributeType == null) {
                    hits.add(new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
                } else {
                    switch (artifactAttributeType) {
                        case TSK_EMAIL:
                            /*
                             * Reduce false positives by eliminating email
                             * address hits that are either too short or are
                             * not for valid top level domains.
                             */
                            if (hit.length() >= MIN_EMAIL_ADDR_LENGTH
                                    && DomainValidator.getInstance(true).isValidTld(hit.substring(hit.lastIndexOf('.')))) {
                                hits.add(new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
                            }

                            break;
                        case TSK_CARD_NUMBER:
                            /*
                             * If searching for credit card account numbers,
                             * do extra validation on the term and discard it
                             * if it does not pass.
                             */
                            Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);

                            for (int rLength = hit.length(); rLength >= 12; rLength--) {
                                ccnMatcher.region(0, rLength);
                                if (ccnMatcher.find()) {
                                    final String group = ccnMatcher.group("ccn");
                                    if (CreditCardValidator.isValidCCN(group)) {
                                        hits.add(new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
                                    }
                                }
                            }

                            break;
                        default:
                            hits.add(new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
                            break;
                    }
                }
            }

        } catch (Throwable error) {
            /*
             * NOTE: Matcher.find() is known to throw StackOverflowError in
             * rare cases (see JIRA-2700). StackOverflowError is an error, not
             * an exception, and therefore needs to be caught as a Throwable.
             * When this occurs we re-throw the error as a TskCoreException so
             * that it is logged by the calling method, which can then move on
             * to the next chunk.
             */
            throw new TskCoreException("Failed to create keyword hits for chunk due to " + error.getMessage());
        }
        return hits;
    }

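    /**
     * Clears the unique keyword hits that were gathered for the given ingest
     * job.
     *
     * @param context The ingest job context.
     */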
    static void cleanup(IngestJobContext context) {
        Map<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>> jobMap = uniqueHitMap2.get(context.getJobId());
        if (jobMap != null) {
            jobMap.clear();
        }
    }

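    /**
     * Creates and posts a keyword hit artifact for each unique hit that was
     * gathered during the given ingest job. Only the first hit in each hit
     * list is used.
     *
     * @param context The ingest job context.
     *
     * @throws TskException
     */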
    static void makeArtifacts(IngestJobContext context) throws TskException {

        Map<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>> jobMap = uniqueHitMap2.get(context.getJobId());
        if (jobMap == null) {
            return;
        }

        for (Map.Entry<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>> mapBySource : jobMap.entrySet()) {
            Long sourceId = mapBySource.getKey();
            Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>> mapByKeyword = mapBySource.getValue();

            for (Map.Entry<Keyword, Map<Keyword, List<UniqueKeywordHit>>> item : mapByKeyword.entrySet()) {
                Keyword originalKeyword = item.getKey();
                Map<Keyword, List<UniqueKeywordHit>> map = item.getValue();

                List<BlackboardArtifact> hitArtifacts = new ArrayList<>();
                if (!map.isEmpty()) {
                    for (Map.Entry<Keyword, List<UniqueKeywordHit>> entry : map.entrySet()) {
                        Keyword hitKeyword = entry.getKey();
                        List<UniqueKeywordHit> hitList = entry.getValue();
                        // Only create one hit per document. The first hit in
                        // the list should be the first one that was found.
                        if (!hitList.isEmpty()) {
                            UniqueKeywordHit hit = hitList.get(0);
                            SleuthkitCase tskCase = Case.getCurrentCase().getSleuthkitCase();
                            Content content = tskCase.getContentById(hit.getContentID());
                            BlackboardArtifact artifact;
                            if (hit.isLiteral() && hit.isWholeWord()) {
                                artifact = LuceneQuery.createKeywordHitArtifact(content, originalKeyword, hitKeyword, hit, hit.getSnippet(), hitKeyword.getListName(), sourceId);
                            } else {
                                artifact = RegexQuery.createKeywordHitArtifact(content, originalKeyword, hitKeyword, hit, hit.getSnippet(), hitKeyword.getListName(), sourceId);
                            }
                            // createKeywordHitArtifact has the potential to
                            // return null when a CCN account is created.
                            if (artifact != null) {
                                hitArtifacts.add(artifact);
                            }
                        }
                    }

                    if (!hitArtifacts.isEmpty()) {
                        try {
                            SleuthkitCase tskCase = Case.getCurrentCaseThrows().getSleuthkitCase();
                            Blackboard blackboard = tskCase.getBlackboard();

                            blackboard.postArtifacts(hitArtifacts, "KeywordSearch", context.getJobId());
                            hitArtifacts.clear();
                        } catch (NoCurrentCaseException | Blackboard.BlackboardException ex) {
                            logger.log(Level.SEVERE, "Failed to post KWH artifact to blackboard.", ex); //NON-NLS
                        }
                    }

                    if (context.fileIngestIsCancelled()) {
                        return;
                    }
                }
            }
        }
    }

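    /**
     * Finds the exact-match (literal, whole word) hits for the given keyword
     * in the given text. Both the keyword and the text are run through the
     * same StandardAnalyzer so that the comparison is done on identically
     * normalized tokens.
     *
     * @param text            The text to search.
     * @param originalKeyword The keyword being searched for.
     * @param sourceID        The object ID of the source content.
     * @param chunkId         The ID of the chunk the text came from.
     * @param keywordListName The name of the keyword list.
     *
     * @return The list of unique hits, possibly empty.
     *
     * @throws IOException
     */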
    public List<UniqueKeywordHit> getExactMatchHits(String text, Keyword originalKeyword, long sourceID, int chunkId, String keywordListName) throws IOException {
        final HashMap<String, String> keywordsFoundInThisDocument = new HashMap<>();

        List<UniqueKeywordHit> hits = new ArrayList<>();
        Analyzer analyzer = new StandardAnalyzer();

        // Get the tokens of the keyword.
        List<String> keywordTokens = new ArrayList<>();
        try (TokenStream keywordstream = analyzer.tokenStream("field", originalKeyword.getSearchTerm())) {
            CharTermAttribute attr = keywordstream.addAttribute(CharTermAttribute.class);
            keywordstream.reset();
            while (keywordstream.incrementToken()) {
                keywordTokens.add(attr.toString());
            }
        }

        // A search term consisting entirely of characters the analyzer drops
        // (e.g., punctuation) yields no tokens; guard against that before
        // dereferencing the first token below.
        if (keywordTokens.isEmpty()) {
            return hits;
        }

        try (TokenStream stream = analyzer.tokenStream("field", text)) {
            CharTermAttribute attr = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                if (!attr.toString().equals(keywordTokens.get(0))) {
                    continue;
                }

                int startOffset = offset.startOffset();
                int endOffset = offset.endOffset();
                boolean match = true;

                for (int index = 1; index < keywordTokens.size(); index++) {
                    if (stream.incrementToken()) {
                        if (!attr.toString().equals(keywordTokens.get(index))) {
                            match = false;
                            break;
                        } else {
                            endOffset = offset.endOffset();
                        }
                    }
                }

                if (match) {
                    String hit = text.subSequence(startOffset, endOffset).toString();

                    // We will only create one KeywordHit instance per
                    // document for a given hit.
                    if (keywordsFoundInThisDocument.containsKey(hit)) {
                        continue;
                    }
                    keywordsFoundInThisDocument.put(hit, hit);

                    hits.add(new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(text, startOffset, endOffset, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getOriginalTerm()));
                }
            }
        }

        return hits;
    }

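    /**
     * Returns the map of unique keyword hits for the given ingest job and
     * source content, creating the intermediate maps if they do not exist
     * yet.
     *
     * @param jobId    The ingest job ID.
     * @param sourceID The object ID of the source content.
     *
     * @return The hit map for the given job and source.
     */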
    static private Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>> getMap(long jobId, long sourceID) {
        // computeIfAbsent makes the lookup-or-create atomic; several file
        // ingest threads may ask for the same job/source maps concurrently.
        return uniqueHitMap2
                .computeIfAbsent(jobId, id -> new ConcurrentHashMap<>())
                .computeIfAbsent(sourceID, id -> new ConcurrentHashMap<>());
    }

    // KeywordHit is not unique enough for finding duplicates; this class
    // extends KeywordHit to make truly unique hits.
    static class UniqueKeywordHit extends KeywordHit {

        private final String listName;
        private final boolean isLiteral;
        private final boolean isWholeWord;
        private final BlackboardAttribute.ATTRIBUTE_TYPE artifactAtrributeType;
        private final String originalSearchTerm;

        UniqueKeywordHit(int chunkId, long sourceID, String snippet, String hit, String listName, boolean isWholeWord, boolean isLiteral, BlackboardAttribute.ATTRIBUTE_TYPE artifactAtrributeType, String originalSearchTerm) {
            super(chunkId, sourceID, snippet, hit);

            this.listName = listName;
            this.isWholeWord = isWholeWord;
            this.isLiteral = isLiteral;
            this.artifactAtrributeType = artifactAtrributeType;
            this.originalSearchTerm = originalSearchTerm;
        }

        @Override
        public int compareTo(KeywordHit other) {
            return compare((UniqueKeywordHit) other);
        }

        private int compare(UniqueKeywordHit other) {
            return Comparator.comparing(UniqueKeywordHit::getSolrObjectId)
                    .thenComparing(UniqueKeywordHit::getChunkId)
                    .thenComparing(UniqueKeywordHit::getHit)
                    .thenComparing(UniqueKeywordHit::getSnippet)
                    .thenComparing(UniqueKeywordHit::isWholeWord)
                    .thenComparing(UniqueKeywordHit::isLiteral)
                    // The attribute type is null for plain keywords, so a
                    // null-safe comparator is required here.
                    .thenComparing(UniqueKeywordHit::getArtifactAtrributeType, Comparator.nullsFirst(Comparator.naturalOrder()))
                    .thenComparing(UniqueKeywordHit::getOriginalSearchTerm)
                    .thenComparing(UniqueKeywordHit::getListName)
                    .compare(this, other);
        }

        @Override
        public boolean equals(Object obj) {

            if (null == obj) {
                return false;
            }
            if (getClass() != obj.getClass()) {
                return false;
            }
            final UniqueKeywordHit other = (UniqueKeywordHit) obj;

            return getSnippet().equalsIgnoreCase(other.getSnippet())
                    && getSolrObjectId().equals(other.getSolrObjectId())
                    && getChunkId().equals(other.getChunkId())
                    && getHit().equalsIgnoreCase(other.getHit())
                    && listName.equalsIgnoreCase(other.getListName())
                    && isLiteral == other.isLiteral()
                    && isWholeWord == other.isWholeWord()
                    && originalSearchTerm.equalsIgnoreCase(other.getOriginalSearchTerm())
                    // Objects.equals is null-safe and symmetric for the
                    // nullable attribute type.
                    && Objects.equals(artifactAtrributeType, other.getArtifactAtrributeType());
        }

        @Override
        public int hashCode() {
            int hash = 3;
            hash = 67 * hash + super.hashCode();
            hash = 67 * hash + Objects.hashCode(this.listName);
            hash = 67 * hash + (this.isLiteral ? 1 : 0);
            hash = 67 * hash + (this.isWholeWord ? 1 : 0);
            hash = 67 * hash + Objects.hashCode(this.artifactAtrributeType);
            hash = 67 * hash + Objects.hashCode(this.originalSearchTerm);
            return hash;
        }

        String getListName() {
            return listName;
        }

        Boolean isLiteral() {
            return isLiteral;
        }

        Boolean isWholeWord() {
            return isWholeWord;
        }

        BlackboardAttribute.ATTRIBUTE_TYPE getArtifactAtrributeType() {
            return artifactAtrributeType;
        }

        String getOriginalSearchTerm() {
            return originalSearchTerm;
        }

    }
}
