19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.twelvemonkeys.lang.StringUtil;
22 import java.io.IOException;
23 import java.util.ArrayList;
24 import java.util.Comparator;
25 import java.util.HashMap;
26 import java.util.List;
28 import java.util.Objects;
29 import java.util.concurrent.ConcurrentHashMap;
30 import java.util.logging.Level;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
33 import org.apache.commons.validator.routines.DomainValidator;
34 import org.apache.lucene.analysis.Analyzer;
35 import org.apache.lucene.analysis.TokenStream;
36 import org.apache.lucene.analysis.standard.StandardAnalyzer;
37 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
38 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
// Performs keyword searching of text inline during ingest (rather than via a
// Solr query), accumulating unique keyword hits per ingest job and per source.
// NOTE(review): this capture of the file is elided; the stray leading numbers
// on code lines are artifacts of the capture, not program tokens.
53 final class InlineSearcher {
// The keyword lists this searcher instance runs against (resolved by name in the constructor).
55 private final List<KeywordList> keywordList;
// Hits shorter than this are not considered plausible email addresses (see createKeywordHits).
56 private static final int MIN_EMAIL_ADDR_LENGTH = 8;
57 private static final Logger logger = Logger.getLogger(InlineSearcher.class.getName());
// Ingest job context: supplies the job id and the cancellation checks used throughout.
59 private final IngestJobContext context;
// Keyed by source object id. NOTE(review): not referenced in the visible code — confirm it is still used.
61 static final Map<Long, List<UniqueKeywordHit>> uniqueHitMap =
new ConcurrentHashMap<>();
// job id -> source id -> original keyword -> matched keyword -> unique hits.
63 static final Map<Long, Map<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>>> uniqueHitMap2 =
new ConcurrentHashMap<>();
// Creates a searcher bound to the given ingest job, loading each named keyword
// list from the current XML keyword-list store. keywordListNames may be null,
// in which case no lists are loaded and searches will find nothing.
67 InlineSearcher(List<String> keywordListNames, IngestJobContext context) {
68 this.keywordList =
new ArrayList<>();
69 this.context = context;
71 if (keywordListNames != null) {
72 XmlKeywordSearchList loader = XmlKeywordSearchList.getCurrent();
73 for (String name : keywordListNames) {
// NOTE(review): getList(name) result is added without a null check — confirm the loader never returns null.
74 keywordList.add(loader.getList(name));
87 boolean searchChunk(Chunk chunk,
long sourceID,
int chunkId)
throws TskCoreException {
88 return searchString(chunk.getLowerCasedChunk(), sourceID, chunkId);
// Searches the given (already lower-cased) text for every keyword in this
// searcher's lists, recording de-duplicated hits into the per-job/per-source
// map. NOTE(review): lines are elided in this capture (e.g. the try that pairs
// with the catch below, else branches, closing braces), so the control flow
// shown here is incomplete.
99 boolean searchString(String text,
long sourceID,
int chunkId)
throws TskCoreException {
100 boolean hitFound =
false;
// original keyword -> (matched keyword -> hits) for this job/source pair.
101 Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>> hitByKeyword = getMap(context.getJobId(), sourceID);
102 for (KeywordList list : keywordList) {
103 List<Keyword> keywords = list.getKeywords();
104 for (Keyword originalKeyword : keywords) {
// Lazily create the per-keyword hit map on first hit for this keyword.
105 Map<Keyword, List<UniqueKeywordHit>> hitMap = hitByKeyword.get(originalKeyword);
106 if (hitMap == null) {
107 hitMap =
new HashMap<>();
108 hitByKeyword.put(originalKeyword, hitMap);
111 List<UniqueKeywordHit> keywordHits =
new ArrayList<>();
// Literal terms: cheap case-insensitive substring probe before the more
// expensive hit-creation pass. Non-literal terms: treated as a regex.
112 if (originalKeyword.searchTermIsLiteral()) {
113 if (StringUtil.containsIgnoreCase(text, originalKeyword.getSearchTerm())) {
114 keywordHits.addAll(createKeywordHits(text, originalKeyword, sourceID, chunkId, list.getName()));
117 String regex = originalKeyword.getSearchTerm();
121 Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
122 Matcher matcher = pattern.matcher(text);
124 if (matcher.find()) {
125 keywordHits.addAll(createKeywordHits(text, originalKeyword, sourceID, chunkId, list.getName()));
127 }
// Thrown by Pattern.compile for an invalid user-supplied regex; handler body elided in this capture.
catch (IllegalArgumentException ex) {
132 if (!keywordHits.isEmpty()) {
// Re-key each hit under a Keyword copy that carries the actual matched text.
134 for (UniqueKeywordHit hit : keywordHits) {
135 Keyword keywordCopy =
new Keyword(hit.getHit(),
136 originalKeyword.searchTermIsLiteral(),
137 originalKeyword.searchTermIsWholeWord(),
139 originalKeyword.getOriginalTerm());
141 List<UniqueKeywordHit> mapHitList = hitMap.get(keywordCopy);
142 if (mapHitList == null) {
143 mapHitList =
new ArrayList<>();
144 hitMap.put(keywordCopy, mapHitList);
// De-duplicate: only record hits not already present for this keyword.
147 if (!mapHitList.contains(hit)) {
// Stop early if the user cancelled ingest.
153 if (context.fileIngestIsCancelled()) {
// Builds UniqueKeywordHit objects for every occurrence of the keyword in the
// text, applying per-attribute-type cleanup/validation (phone, IP, email, card
// number). Whole-word literal terms are routed to the Lucene-tokenized exact
// matcher instead. NOTE(review): many interior lines (try openers, else
// branches, braces) are elided in this capture.
172 private List<UniqueKeywordHit> createKeywordHits(String text, Keyword originalKeyword,
long sourceID,
int chunkId, String keywordListName)
throws TskCoreException {
// Whole-word literal searches use analyzer-based exact matching.
174 if (originalKeyword.searchTermIsLiteral() && originalKeyword.searchTermIsWholeWord()) {
176 return getExactMatchHits(text, originalKeyword, sourceID, chunkId, keywordListName);
177 }
catch (IOException ex) {
178 throw new TskCoreException(
"Failed to create exactMatch hits", ex);
// Used as a set: records hit strings already emitted for this text so each
// distinct hit is reported once per chunk.
182 final HashMap<String, String> keywordsFoundInThisDocument =
new HashMap<>();
184 List<UniqueKeywordHit> hits =
new ArrayList<>();
185 String keywordString = originalKeyword.getSearchTerm();
// A trailing ".*" means the user explicitly asked for a suffix wildcard, so
// the phone/IP trimming below is skipped for such terms.
187 boolean queryStringContainsWildcardSuffix = originalKeyword.getSearchTerm().endsWith(
".*");
189 String searchPattern;
190 if (originalKeyword.searchTermIsLiteral()) {
// Literal substring search: quote the term and let it absorb adjacent
// word/dot/apostrophe characters so the whole token is captured.
204 searchPattern =
"[\\w[\\.']]*" + java.util.regex.Pattern.quote(keywordString.toLowerCase()) +
"[\\w[\\.']]*";
207 searchPattern = keywordString;
210 final java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(searchPattern, Pattern.CASE_INSENSITIVE);
213 String content = text;
214 Matcher hitMatcher = pattern.matcher(content);
217 while (hitMatcher.find(offset)) {
219 String hit = hitMatcher.group().toLowerCase();
// Skip zero-length matches (possible with user-supplied regexes).
225 if (
"".equals(hit)) {
// Resume the next find() after this match.
229 offset = hitMatcher.end();
230 final BlackboardAttribute.ATTRIBUTE_TYPE artifactAttributeType = originalKeyword.getArtifactAttributeType();
// Trim non-numeric leading/trailing characters from phone/IP hits, unless the
// user explicitly used a wildcard suffix in the search term.
238 if (!queryStringContainsWildcardSuffix
239 && (artifactAttributeType == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PHONE_NUMBER
240 || artifactAttributeType == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_IP_ADDRESS)) {
241 if (artifactAttributeType == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_PHONE_NUMBER) {
// Phone numbers may legitimately start with "(".
243 hit = hit.replaceAll(
"^[^0-9\\(]",
"");
246 hit = hit.replaceAll(
"^[^0-9]",
"");
249 hit = hit.replaceAll(
"[^0-9]$",
"");
// Strip boundary characters picked up by the greedy literal pattern above.
273 if (originalKeyword.searchTermIsLiteral()) {
274 hit = hit.replaceAll(
"^" + KeywordSearchList.BOUNDARY_CHARACTERS +
"*",
"");
275 hit = hit.replaceAll(KeywordSearchList.BOUNDARY_CHARACTERS +
"*$",
"");
// De-duplicate per document, then validate/emit by attribute type.
290 if (keywordsFoundInThisDocument.containsKey(hit)) {
293 keywordsFoundInThisDocument.put(hit, hit);
295 if (artifactAttributeType == null) {
296 hits.add(
new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
298 switch (artifactAttributeType) {
// Email: require a minimum length and a valid TLD after the last dot.
305 if (hit.length() >= MIN_EMAIL_ADDR_LENGTH
306 && DomainValidator.getInstance(
true).isValidTld(hit.substring(hit.lastIndexOf(
'.')))) {
307 hits.add(
new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
311 case TSK_CARD_NUMBER:
// Card numbers: shrink the match region from the right until a Luhn-valid
// "ccn" group of at least 12 digits is found.
317 Matcher ccnMatcher = CREDIT_CARD_NUM_PATTERN.matcher(hit);
319 for (
int rLength = hit.length(); rLength >= 12; rLength--) {
320 ccnMatcher.region(0, rLength);
321 if (ccnMatcher.find()) {
322 final String group = ccnMatcher.group(
"ccn");
323 if (CreditCardValidator.isValidCCN(group)) {
324 hits.add(
new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
331 hits.add(
new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(content, hitMatcher, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getSearchTerm()));
337 }
// NOTE(review): catching Throwable is very broad, and the rethrow discards the
// original cause (only getMessage() is kept) — prefer
// new TskCoreException("...", error) and a narrower catch type.
catch (Throwable error) {
346 throw new TskCoreException(
"Failed to create keyword hits for chunk due to " + error.getMessage());
// Releases the accumulated hit state for the given ingest job once it is no
// longer needed. NOTE(review): the body of the null-check branch is elided in
// this capture — presumably it removes the job's entry from uniqueHitMap2.
356 static void cleanup(IngestJobContext context) {
357 Map<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>> jobMap = uniqueHitMap2.get(context.getJobId());
358 if (jobMap != null) {
// Creates and posts keyword-hit blackboard artifacts for every unique hit
// accumulated during the given ingest job. One representative hit (the first
// in each list) is used per matched keyword. NOTE(review): try openers, else
// branches and closing braces are elided in this capture.
369 static void makeArtifacts(IngestJobContext context)
throws TskException {
371 Map<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>> jobMap = uniqueHitMap2.get(context.getJobId());
// Nothing accumulated for this job.
372 if (jobMap == null) {
376 for (Map.Entry<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>> mapBySource : jobMap.entrySet()) {
377 Long sourceId = mapBySource.getKey();
378 Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>> mapByKeyword = mapBySource.getValue();
380 for (Map.Entry<Keyword, Map<Keyword, List<UniqueKeywordHit>>> item : mapByKeyword.entrySet()) {
381 Keyword originalKeyword = item.getKey();
382 Map<Keyword, List<UniqueKeywordHit>> map = item.getValue();
384 List<BlackboardArtifact> hitArtifacts =
new ArrayList<>();
385 if (!map.isEmpty()) {
386 for (Map.Entry<Keyword, List<UniqueKeywordHit>> entry : map.entrySet()) {
387 Keyword hitKeyword = entry.getKey();
388 List<UniqueKeywordHit> hitList = entry.getValue();
// Only the first hit of each list is turned into an artifact.
392 if (!hitList.isEmpty()) {
393 UniqueKeywordHit hit = hitList.get(0);
394 SleuthkitCase tskCase = Case.getCurrentCase().getSleuthkitCase();
395 Content content = tskCase.getContentById(hit.getContentID());
396 BlackboardArtifact artifact;
// Whole-word literal hits came from the Lucene path; others from the regex path.
397 if (hit.isLiteral() && hit.isWholeWord()) {
398 artifact = LuceneQuery.createKeywordHitArtifact(content, originalKeyword, hitKeyword, hit, hit.getSnippet(), hitKeyword.getListName(), sourceId);
400 artifact = RegexQuery.createKeywordHitArtifact(content, originalKeyword, hitKeyword, hit, hit.getSnippet(), hitKeyword.getListName(), sourceId);
// createKeywordHitArtifact may return null; skip those.
404 if (artifact != null) {
405 hitArtifacts.add(artifact);
// Post this keyword's artifacts in one batch, then reuse the list.
412 if (!hitArtifacts.isEmpty()) {
414 SleuthkitCase tskCase = Case.getCurrentCaseThrows().getSleuthkitCase();
415 Blackboard blackboard = tskCase.getBlackboard();
417 blackboard.postArtifacts(hitArtifacts,
"KeywordSearch", context.getJobId());
418 hitArtifacts.clear();
419 }
// Posting failures are logged but do not abort artifact creation for other keywords.
catch (NoCurrentCaseException | Blackboard.BlackboardException ex) {
420 logger.log(Level.SEVERE,
"Failed to post KWH artifact to blackboard.", ex);
// Stop early if the user cancelled ingest.
424 if (context.fileIngestIsCancelled()) {
// Finds whole-word matches of the keyword in the text by tokenizing both the
// search term and the text with Lucene's StandardAnalyzer and comparing token
// sequences, so "word boundaries" follow the analyzer's rules rather than a
// regex. NOTE(review): loop interiors and closing braces are elided in this
// capture; the analyzer does not appear to be closed in the visible code —
// confirm it is released in the elided portion.
444 public List<UniqueKeywordHit> getExactMatchHits(String text, Keyword originalKeyword,
long sourceID,
int chunkId, String keywordListName)
throws IOException {
// Used as a set: de-duplicates hit strings within this text.
445 final HashMap<String, String> keywordsFoundInThisDocument =
new HashMap<>();
447 List<UniqueKeywordHit> hits =
new ArrayList<>();
448 Analyzer analyzer =
new StandardAnalyzer();
// Tokenize the search term once; multi-token terms must match consecutively.
451 List<String> keywordTokens =
new ArrayList<>();
452 try (TokenStream keywordstream = analyzer.tokenStream(
"field", originalKeyword.getSearchTerm())) {
453 CharTermAttribute attr = keywordstream.addAttribute(CharTermAttribute.class);
454 keywordstream.reset();
455 while (keywordstream.incrementToken()) {
456 keywordTokens.add(attr.toString());
// Stream the text's tokens, looking for a run matching keywordTokens.
460 try (TokenStream stream = analyzer.tokenStream(
"field", text)) {
461 CharTermAttribute attr = stream.addAttribute(CharTermAttribute.class);
462 OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
464 while (stream.incrementToken()) {
// Not the start of a potential match; keep scanning.
465 if (!attr.toString().equals(keywordTokens.get(0))) {
// Character offsets of the candidate match within the original text.
469 int startOffset = offset.startOffset();
470 int endOffset = offset.endOffset();
471 boolean match =
true;
// Verify the remaining keyword tokens follow consecutively, extending endOffset.
473 for (
int index = 1; index < keywordTokens.size(); index++) {
474 if (stream.incrementToken()) {
475 if (!attr.toString().equals(keywordTokens.get(index))) {
479 endOffset = offset.endOffset();
// Recover the matched text verbatim from the original string.
485 String hit = text.subSequence(startOffset, endOffset).toString();
489 if (keywordsFoundInThisDocument.containsKey(hit)) {
492 keywordsFoundInThisDocument.put(hit, hit);
494 hits.add(
new UniqueKeywordHit(chunkId, sourceID, KeywordSearchUtil.makeSnippet(text, startOffset, endOffset, hit), hit, keywordListName, originalKeyword.searchTermIsWholeWord(), originalKeyword.searchTermIsLiteral(), originalKeyword.getArtifactAttributeType(), originalKeyword.getOriginalTerm()));
510 static private Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>> getMap(
long jobId,
long sourceID) {
511 Map<Long, Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>>> jobMap = uniqueHitMap2.get(jobId);
512 if (jobMap == null) {
513 jobMap =
new ConcurrentHashMap<>();
514 uniqueHitMap2.put(jobId, jobMap);
517 Map<Keyword, Map<Keyword, List<UniqueKeywordHit>>> sourceMap = jobMap.get(sourceID);
518 if (sourceMap == null) {
519 sourceMap =
new ConcurrentHashMap<>();
520 jobMap.put(sourceID, sourceMap);
// A KeywordHit extended with enough identity (list name, literal/whole-word
// flags, attribute type, original search term) to de-duplicate and compare
// hits across chunks and sources.
528 static class UniqueKeywordHit
extends KeywordHit {
// Name of the keyword list the hit's search term belongs to.
530 private final String listName;
// Whether the search term was a literal (vs. a regex).
531 private final boolean isLiteral;
// Whether the search term required a whole-word match.
532 private final boolean isWholeWord;
// May be null: not every keyword carries an artifact attribute type (see the null check in createKeywordHits).
533 private final BlackboardAttribute.ATTRIBUTE_TYPE artifactAtrributeType;
534 private final String originalSearchTerm;
// Builds a hit; chunkId/sourceID/snippet/hit are stored by the KeywordHit
// superclass, the remaining identity fields are kept here.
// artifactAtrributeType may be null.
536 UniqueKeywordHit(
int chunkId,
long sourceID, String snippet, String hit, String listName,
boolean isWholeWord,
boolean isLiteral, BlackboardAttribute.ATTRIBUTE_TYPE artifactAtrributeType, String originalSearchTerm) {
537 super(chunkId, sourceID, snippet, hit);
539 this.listName = listName;
540 this.isWholeWord = isWholeWord;
541 this.isLiteral = isLiteral;
542 this.artifactAtrributeType = artifactAtrributeType;
543 this.originalSearchTerm = originalSearchTerm;
547 public int compareTo(KeywordHit other) {
548 return compare((UniqueKeywordHit) other);
551 private int compare(UniqueKeywordHit other) {
552 return Comparator.comparing(UniqueKeywordHit::getSolrObjectId)
553 .thenComparing(UniqueKeywordHit::getChunkId)
554 .thenComparing(UniqueKeywordHit::getHit)
555 .thenComparing(UniqueKeywordHit::getSnippet)
556 .thenComparing(UniqueKeywordHit::isWholeWord)
557 .thenComparing(UniqueKeywordHit::isLiteral)
558 .thenComparing(UniqueKeywordHit::getArtifactAtrributeType)
559 .thenComparing(UniqueKeywordHit::getOriginalSearchTerm)
560 .thenComparing(UniqueKeywordHit::getListName)
561 .compare(
this, other);
565 public boolean equals(Object obj) {
570 if (getClass() != obj.getClass()) {
573 final UniqueKeywordHit other = (UniqueKeywordHit) obj;
575 return getSnippet().equalsIgnoreCase(other.getSnippet())
576 && getSolrObjectId().equals(other.getSolrObjectId())
577 && getChunkId().equals(other.getChunkId())
578 && getHit().equalsIgnoreCase(other.getHit())
579 && listName.equalsIgnoreCase(other.getListName())
580 && isLiteral == other.isLiteral()
581 && isWholeWord == other.isWholeWord()
582 && originalSearchTerm.equalsIgnoreCase(other.getOriginalSearchTerm())
583 && (artifactAtrributeType != null ? artifactAtrributeType.equals(other.getArtifactAtrributeType()) :
true);
587 public int hashCode() {
589 hash = 67 * hash + super.hashCode();
590 hash = 67 * hash + Objects.hashCode(this.listName);
591 hash = 67 * hash + (this.isLiteral ? 1 : 0);
592 hash = 67 * hash + (this.isWholeWord ? 1 : 0);
593 hash = 67 * hash + Objects.hashCode(this.artifactAtrributeType);
594 hash = 67 * hash + Objects.hashCode(this.originalSearchTerm);
// Simple accessors. NOTE(review): several return statements and closing braces
// are elided in this capture.
598 String getListName() {
602 Boolean isLiteral() {
606 Boolean isWholeWord() {
// Nullable: not every keyword carries an artifact attribute type.
610 BlackboardAttribute.ATTRIBUTE_TYPE getArtifactAtrributeType() {
611 return artifactAtrributeType;
614 String getOriginalSearchTerm() {
615 return originalSearchTerm;