Autopsy 4.21.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
LanguageSpecificContentQueryHelper.java
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2019 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.Version;
import org.sleuthkit.datamodel.TskException;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

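/**
 * Helper for keyword search queries over language specific content fields
 * (currently Japanese text) in addition to the default TEXT field.
 */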
final class LanguageSpecificContentQueryHelper {

    private LanguageSpecificContentQueryHelper() {}

    private static final List<Server.Schema> QUERY_FIELDS = new ArrayList<>();
    private static final List<Server.Schema> LANGUAGE_SPECIFIC_CONTENT_FIELDS
            = Collections.singletonList(Server.Schema.CONTENT_JA);
    private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);

    static {
        QUERY_FIELDS.add(Server.Schema.TEXT);
        QUERY_FIELDS.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS);
    }

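    /**
     * Holds the Solr documents and highlighting results collected for a
     * query, with mini chunks kept separately from regular chunks.
     */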
    static class QueryResults {
        List<SolrDocument> chunks = new ArrayList<>();
        Map</* ID */ String, SolrDocument> miniChunks = new HashMap<>();
        // objectId_chunk -> "text" -> List of previews
        Map<String, Map<String, List<String>>> highlighting = new HashMap<>();
    }

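    /**
     * Expands the given query string so that it searches all query fields,
     * i.e. the default TEXT field plus every language specific field.
     *
     * For example, assuming CONTENT_JA is the only language specific field,
     * expandQueryString("apple") would produce "text:apple OR content_ja:apple"
     * (the field names shown here are illustrative; the actual names come
     * from the Server.Schema enum).
     */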
    static String expandQueryString(final String queryStr) {
        List<String> fieldQueries = new ArrayList<>();
        fieldQueries.add(Server.Schema.TEXT.toString() + ":" + queryStr);
        fieldQueries.addAll(LANGUAGE_SPECIFIC_CONTENT_FIELDS.stream().map(field -> field.toString() + ":" + queryStr).collect(Collectors.toList()));
        return String.join(" OR ", fieldQueries);
    }

    static List<Server.Schema> getQueryFields() {
        return QUERY_FIELDS;
    }

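    /**
     * Adds the given Solr document to the results, routing it to miniChunks
     * or chunks depending on whether its document ID is a mini chunk ID.
     */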
    static void updateQueryResults(QueryResults results, SolrDocument document) {
        String id = (String) document.getFieldValue(Server.Schema.ID.toString());
        if (MiniChunkHelper.isMiniChunkID(id)) {
            results.miniChunks.put(MiniChunkHelper.getBaseChunkID(id), document);
        } else {
            results.chunks.add(document);
        }
    }

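    /**
     * Returns the highlight snippets of the first language specific field
     * present in the given highlighting map, or an empty Optional if none of
     * the language specific fields produced a highlight.
     */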
    static Optional<List<String>> getHighlights(Map<String, List<String>> highlight) {
        for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
            if (highlight.containsKey(field.toString())) {
                return Optional.of(highlight.get(field.toString()));
            }
        }
        return Optional.empty();
    }

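    /**
     * Merges the hits found in language specific fields into the given list
     * of matches: a hit from queryResults replaces a match with the same Solr
     * document ID, and the remaining language specific hits are appended.
     */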
    static List<KeywordHit> mergeKeywordHits(List<KeywordHit> matches, Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException {
        Map<String, KeywordHit> map = findMatches(originalKeyword, queryResults).stream().collect(Collectors.toMap(KeywordHit::getSolrDocumentId, x -> x));
        List<KeywordHit> merged = new ArrayList<>();

        // first, replace any KeywordHit in matches that has a language specific counterpart
        for (KeywordHit match : matches) {
            String key = match.getSolrDocumentId();
            if (map.containsKey(key)) {
                merged.add(map.get(key));
                map.remove(key);
            } else {
                merged.add(match);
            }
        }
        // second, add the rest of the KeywordHits from queryResults
        merged.addAll(map.values());

        return merged;
    }

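    /**
     * Adds a termfreq pseudo-field to the query so that each returned
     * document carries the number of occurrences of the keyword. The keyword
     * is first sent to Solr to be tokenized per language specific field.
     */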
    static void configureTermfreqQuery(SolrQuery query, String keyword) throws KeywordSearchModuleException, NoOpenCoreException {
        // make a request to Solr to parse the query
        QueryTermHelper.Result queryParserResult = QueryTermHelper.parse(keyword, LANGUAGE_SPECIFIC_CONTENT_FIELDS);
        query.addField(buildTermfreqQuery(keyword, queryParserResult));
    }

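    /**
     * Builds a Solr function query that sums the term frequencies of all
     * parsed terms over the language specific fields.
     *
     * For example, if the parser mapped the content_ja field to the terms
     * "forensic" and "browser" (hypothetical values), the result would be:
     * termfreq:sum(termfreq("content_ja","forensic"),termfreq("content_ja","browser"))
     */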
    static String buildTermfreqQuery(String keyword, QueryTermHelper.Result result) {
        List<String> termfreqs = new ArrayList<>();
        for (Map.Entry<String, List<String>> e : result.fieldTermsMap.entrySet()) {
            String field = e.getKey();
            for (String term : e.getValue()) {
                termfreqs.add(String.format("termfreq(\"%s\",\"%s\")", field, KeywordSearchUtil.escapeLuceneQuery(term)));
            }
        }

        // sum over all language specific query fields;
        // at most one of these fields can be non-zero for a given document.
        return String.format("termfreq:sum(%s)", String.join(",", termfreqs));
    }

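    /**
     * Queries Solr for the termfreq of a keyword within the single chunk
     * identified by contentID. Note that only the first keyword in the set is
     * used to configure the termfreq field. Returns 0 when nothing matches.
     */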
    static int queryChunkTermfreq(Set<String> keywords, String contentID) throws KeywordSearchModuleException, NoOpenCoreException {
        SolrQuery q = new SolrQuery();
        q.setShowDebugInfo(DEBUG);

        final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentID);
        final String highlightQuery = keywords.stream()
                .map(s -> LanguageSpecificContentQueryHelper.expandQueryString(
                        KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
                .collect(Collectors.joining(" "));

        q.addFilterQuery(filterQuery);
        q.setQuery(highlightQuery);
        LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());

        QueryResponse response = KeywordSearch.getServer().query(q, SolrRequest.METHOD.POST);
        SolrDocumentList results = response.getResults();
        if (results.isEmpty()) {
            return 0;
        }

        SolrDocument document = results.get(0);
        return ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
    }

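    /**
     * Finds the index of the nth occurrence (0-based) of the given pattern in
     * the string, or -1 if there is no such occurrence. For example,
     * findNthIndexOf("abcabc", "abc", 1) returns 3.
     */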
    static int findNthIndexOf(String s, String pattern, int n) {
        int found = 0;
        int idx = -1;
        int len = s.length();
        while (idx < len && found <= n) {
            idx = s.indexOf(pattern, idx + 1);
            if (idx == -1) {
                break;
            }
            found++;
        }

        return idx;
    }

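    /**
     * Creates KeywordHits for chunks that contain a hit in their base
     * (non-overlapping) region. A chunk whose hit count does not exceed the
     * hit count of its mini chunk only matched in the overlapped region, so
     * it is skipped; the same hit will be found in the next chunk instead.
     */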
    private static List<KeywordHit> findMatches(Keyword originalKeyword, QueryResults queryResults) throws KeywordSearchModuleException {
        List<KeywordHit> matches = new ArrayList<>();
        for (SolrDocument document : queryResults.chunks) {
            String docId = (String) document.getFieldValue(Server.Schema.ID.toString());

            try {
                int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
                SolrDocument miniChunk = queryResults.miniChunks.get(docId);
                if (miniChunk == null) {
                    // the last chunk has no mini chunk because there is no region overlapping the next one
                    matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
                } else {
                    int hitCountInMiniChunk = ((Float) miniChunk.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
                    if (hitCountInMiniChunk < hitCountInChunk) {
                        // there is at least one hit in the base chunk
                        matches.add(createKeywordHit(originalKeyword, queryResults.highlighting, docId));
                    }
                }
            } catch (TskException ex) {
                throw new KeywordSearchModuleException(ex);
            }
        }
        return matches;
    }

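    /**
     * Creates a KeywordHit for the given document, attaching the first
     * highlight snippet when snippets are enabled in the settings.
     */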
    private static KeywordHit createKeywordHit(Keyword originalKeyword, Map<String, Map<String, List<String>>> highlightResponse, String docId) throws TskException {
        /**
         * Get the first snippet from the document if keyword search is
         * configured to use snippets.
         */
        String snippet = "";
        if (KeywordSearchSettings.getShowSnippets()) {
            List<String> snippetList = getHighlightFieldValue(highlightResponse.get(docId)).orElse(null);
            // list is null if there was no snippet
            if (snippetList != null) {
                snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
            }
        }

        return new KeywordHit(docId, snippet, originalKeyword.getSearchTerm());
    }

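    /**
     * Returns the highlight values of the first language specific field found
     * in the given map, or an empty Optional if none is present. Private
     * counterpart of getHighlights, used by createKeywordHit.
     */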
    private static Optional<List<String>> getHighlightFieldValue(Map<String, List<String>> highlight) {
        for (Server.Schema field : LANGUAGE_SPECIFIC_CONTENT_FIELDS) {
            if (highlight.containsKey(field.toString())) {
                return Optional.of(highlight.get(field.toString()));
            }
        }
        return Optional.empty();
    }
}
