Autopsy  4.19.3
Graphical digital forensics platform for The Sleuth Kit and other tools.
HighlightedText.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2018 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.collect.Iterators;
22 import com.google.common.collect.Range;
23 import com.google.common.collect.TreeRangeSet;
24 import java.util.Arrays;
25 import java.util.Collection;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Set;
31 import java.util.TreeMap;
32 import java.util.logging.Level;
33 import java.util.stream.Collectors;
34 import javax.annotation.concurrent.GuardedBy;
35 import org.apache.commons.text.StringEscapeUtils;
36 import org.apache.commons.lang.StringUtils;
37 import org.apache.commons.lang3.math.NumberUtils;
38 import org.apache.solr.client.solrj.SolrQuery;
39 import org.apache.solr.client.solrj.SolrRequest.METHOD;
40 import org.apache.solr.client.solrj.response.QueryResponse;
41 import org.apache.solr.common.SolrDocument;
42 import org.apache.solr.common.SolrDocumentList;
43 import org.openide.util.NbBundle;
47 import org.sleuthkit.datamodel.BlackboardArtifact;
48 import org.sleuthkit.datamodel.BlackboardAttribute;
49 import org.sleuthkit.datamodel.TskCoreException;
50 
55 class HighlightedText implements ExtractedText {
56 
57  private static final Logger logger = Logger.getLogger(HighlightedText.class.getName());
58 
59  private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
60 
61  private static final BlackboardAttribute.Type TSK_KEYWORD_SEARCH_TYPE = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE);
62  private static final BlackboardAttribute.Type TSK_KEYWORD = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD);
63  static private final BlackboardAttribute.Type TSK_ASSOCIATED_ARTIFACT = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT);
64  static private final BlackboardAttribute.Type TSK_KEYWORD_REGEXP = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP);
65 
66  private static final String HIGHLIGHT_PRE = "<span style='background:yellow'>"; //NON-NLS
67  private static final String HIGHLIGHT_POST = "</span>"; //NON-NLS
68  private static final String ANCHOR_PREFIX = HighlightedText.class.getName() + "_"; //NON-NLS
69 
70  final private Server solrServer = KeywordSearch.getServer();
71 
72  private final long solrObjectId;
73  /*
74  * The keywords to highlight
75  */
76  private final Set<String> keywords = new HashSet<>();
77 
78  private int numberPages;
79  private Integer currentPage = 0;
80 
81  @GuardedBy("this")
82  private boolean isPageInfoLoaded = false;
83 
84  /*
85  * map from page/chunk to number of hits. value is 0 if not yet known.
86  */
87  private final TreeMap<Integer, Integer> numberOfHitsPerPage = new TreeMap<>();
88  /*
89  * set of pages, used for iterating back and forth. Only stores pages with
90  * hits
91  */
92  private final Set<Integer> pages = numberOfHitsPerPage.keySet();
93  /*
94  * map from page/chunk number to current hit on that page.
95  */
96  private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();
97 
98  private QueryResults hits = null; //original hits that may get passed in
99  private BlackboardArtifact artifact;
100  private KeywordSearch.QueryType qt;
101  private boolean isLiteral;
102 
/**
 * Creates a highlighted-text source for a Solr object from query results
 * that were computed beforehand.
 *
 * @param solrObjectId id of the Solr object whose text is shown
 * @param hits         query results used later to locate the pages with hits
 */
HighlightedText(long solrObjectId, QueryResults hits) {
    this.hits = hits;
    this.solrObjectId = solrObjectId;
}
118 
127  HighlightedText(BlackboardArtifact artifact) throws TskCoreException {
128  this.artifact = artifact;
129  BlackboardAttribute attribute = artifact.getAttribute(TSK_ASSOCIATED_ARTIFACT);
130  if (attribute != null) {
131  this.solrObjectId = attribute.getValueLong();
132  } else {
133  this.solrObjectId = artifact.getObjectID();
134  }
135 
136  }
137 
142  synchronized private void loadPageInfo() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
143  if (isPageInfoLoaded) {
144  return;
145  }
146 
147  this.numberPages = solrServer.queryNumFileChunks(this.solrObjectId);
148 
149  if (artifact != null) {
150  loadPageInfoFromArtifact();
151  } else if (numberPages != 0) {
152  // if the file has chunks, get pages with hits, sorted
153  loadPageInfoFromHits();
154  } else {
155  //non-artifact, no chunks, everything is easy.
156  this.numberPages = 1;
157  this.currentPage = 1;
158  numberOfHitsPerPage.put(1, 0);
159  currentHitPerPage.put(1, 0);
160  isPageInfoLoaded = true;
161  }
162  }
163 
170  synchronized private void loadPageInfoFromArtifact() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
171  final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString();
172  this.keywords.add(keyword);
173 
174  //get the QueryType (if available)
175  final BlackboardAttribute queryTypeAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE);
176  qt = (queryTypeAttribute != null)
177  ? KeywordSearch.QueryType.values()[queryTypeAttribute.getValueInt()] : null;
178 
179  Keyword keywordQuery = null;
180  switch (qt) {
181  case LITERAL:
182  case SUBSTRING:
183  keywordQuery = new Keyword(keyword, true, true);
184  break;
185  case REGEX:
186  String regexp = artifact.getAttribute(TSK_KEYWORD_REGEXP).getValueString();
187  keywordQuery = new Keyword(regexp, false, false);
188  break;
189  }
190  KeywordSearchQuery chunksQuery = KeywordSearchUtil.getQueryForKeyword(keywordQuery, new KeywordList(Arrays.asList(keywordQuery)));
191  // Run a query to figure out which chunks for the current object have
192  // hits for this keyword.
193 
194  chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.solrObjectId));
195 
196  hits = chunksQuery.performQuery();
197  loadPageInfoFromHits();
198  }
199 
203  synchronized private void loadPageInfoFromHits() {
204  isLiteral = hits.getQuery().isLiteral();
205 
212  for (Keyword k : hits.getKeywords()) {
213  for (KeywordHit hit : hits.getResults(k)) {
214  int chunkID = hit.getChunkId();
215  if (artifact != null) {
216  if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
217  String hit1 = hit.getHit();
218  if (keywords.stream().anyMatch(hit1::contains)) {
219  numberOfHitsPerPage.put(chunkID, 0); //unknown number of matches in the page
220  currentHitPerPage.put(chunkID, 0); //set current hit to 0th
221 
222  }
223  }
224  } else {
225  if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
226 
227  numberOfHitsPerPage.put(chunkID, 0); //unknown number of matches in the page
228  currentHitPerPage.put(chunkID, 0); //set current hit to 0th
229 
230  if (StringUtils.isNotBlank(hit.getHit())) {
231  this.keywords.add(hit.getHit());
232  }
233  }
234  }
235  }
236  }
237 
238  //set page to first page having highlights
239  this.currentPage = pages.stream().findFirst().orElse(1);
240 
241  isPageInfoLoaded = true;
242  }
243 
252  static private String constructEscapedSolrQuery(String query) {
253  return LuceneQuery.HIGHLIGHT_FIELD + ":" + "\"" + KeywordSearchUtil.escapeLuceneQuery(query) + "\"";
254  }
255 
256  private int getIndexOfCurrentPage() {
257  return Iterators.indexOf(pages.iterator(), this.currentPage::equals);
258  }
259 
260  @Override
261  public int getNumberPages() {
262  //return number of pages that have hits
263  return this.numberPages;
264  }
265 
266  @Override
267  public int getCurrentPage() {
268  return this.currentPage;
269  }
270 
271  @Override
272  public boolean hasNextPage() {
273  return getIndexOfCurrentPage() < pages.size() - 1;
274  }
275 
276  @Override
277  public boolean hasPreviousPage() {
278  return getIndexOfCurrentPage() > 0;
279  }
280 
281  @Override
282  public int nextPage() {
283  if (hasNextPage()) {
284  currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() + 1);
285  return currentPage;
286  } else {
287  throw new IllegalStateException("No next page.");
288  }
289  }
290 
291  @Override
292  public int previousPage() {
293  if (hasPreviousPage()) {
294  currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() - 1);
295  return currentPage;
296  } else {
297  throw new IllegalStateException("No previous page.");
298  }
299  }
300 
301  @Override
302  public boolean hasNextItem() {
303  if (!this.currentHitPerPage.containsKey(currentPage)) {
304  return false;
305  }
306  return this.currentHitPerPage.get(currentPage) < this.numberOfHitsPerPage.get(currentPage);
307  }
308 
309  @Override
310  public boolean hasPreviousItem() {
311  if (!this.currentHitPerPage.containsKey(currentPage)) {
312  return false;
313  }
314  return this.currentHitPerPage.get(currentPage) > 1;
315  }
316 
317  @Override
318  public int nextItem() {
319  if (!hasNextItem()) {
320  throw new IllegalStateException("No next item.");
321  }
322  int cur = currentHitPerPage.get(currentPage) + 1;
323  currentHitPerPage.put(currentPage, cur);
324  return cur;
325  }
326 
327  @Override
328  public int previousItem() {
329  if (!hasPreviousItem()) {
330  throw new IllegalStateException("No previous item.");
331  }
332  int cur = currentHitPerPage.get(currentPage) - 1;
333  currentHitPerPage.put(currentPage, cur);
334  return cur;
335  }
336 
337  @Override
338  public int currentItem() {
339  if (!this.currentHitPerPage.containsKey(currentPage)) {
340  return 0;
341  }
342  return currentHitPerPage.get(currentPage);
343  }
344 
345  @Override
346  public String getText() {
347  String chunkID = "";
348  String highlightField = "";
349  try {
350  double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
351 
352  loadPageInfo(); //inits once
353  SolrQuery q = new SolrQuery();
354  q.setShowDebugInfo(DEBUG); //debug
355 
356  String contentIdStr = Long.toString(this.solrObjectId);
357  if (numberPages != 0) {
358  chunkID = Integer.toString(this.currentPage);
359  contentIdStr += "0".equals(chunkID) ? "" : "_" + chunkID;
360  }
361  final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
362 
363  highlightField = LuceneQuery.HIGHLIGHT_FIELD;
364  if (isLiteral) {
365  if (2.2 <= indexSchemaVersion) {
366  //if the query is literal try to get solr to do the highlighting
367  final String highlightQuery = keywords.stream().map(s ->
368  LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
369  .collect(Collectors.joining(" OR "));
370  q.setQuery(highlightQuery);
371  for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
372  q.addField(field.toString());
373  q.addHighlightField(field.toString());
374  }
375  q.addField(Server.Schema.LANGUAGE.toString());
376  // in case of single term literal query there is only 1 term
377  LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
378  q.addFilterQuery(filterQuery);
379  q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
380  } else {
381  //if the query is literal try to get solr to do the highlighting
382  final String highlightQuery = keywords.stream()
383  .map(HighlightedText::constructEscapedSolrQuery)
384  .collect(Collectors.joining(" "));
385 
386  q.setQuery(highlightQuery);
387  q.addField(highlightField);
388  q.addFilterQuery(filterQuery);
389  q.addHighlightField(highlightField);
390  q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
391  }
392 
393  //tune the highlighter
394  if (shouldUseOriginalHighlighter(filterQuery)) {
395  // use original highlighter
396  q.setParam("hl.useFastVectorHighlighter", "off");
397  q.setParam("hl.simple.pre", HIGHLIGHT_PRE);
398  q.setParam("hl.simple.post", HIGHLIGHT_POST);
399  } else {
400  q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
401  q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
402  q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
403  q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
404  }
405 
406  //docs says makes sense for the original Highlighter only, but not really
407  q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
408  } else {
409  /*
410  * if the query is not literal just pull back the text. We will
411  * do the highlighting in autopsy.
412  */
413  q.setQuery(filterQuery);
414  q.addField(highlightField);
415  }
416 
417  QueryResponse response = solrServer.query(q, METHOD.POST);
418 
419  // There should never be more than one document since there will
420  // either be a single chunk containing hits or we narrow our
421  // query down to the current page/chunk.
422  if (response.getResults().size() > 1) {
423  logger.log(Level.WARNING, "Unexpected number of results for Solr highlighting query: {0}", q); //NON-NLS
424  }
425  String highlightedContent;
426  Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
427 
428  if (responseHighlight == null) {
429  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
430  } else {
431  Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
432 
433  if (responseHighlightID == null) {
434  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
435  } else {
436  SolrDocument document = response.getResults().get(0);
437  Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
438  if (2.2 <= indexSchemaVersion && language != null) {
439  List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
440  if (contentHighlights == null) {
441  highlightedContent = "";
442  } else {
443  int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunkHelper.getChunkIdString(contentIdStr));
444  String s = contentHighlights.get(0).trim();
445  // If there is a mini-chunk, trim the content not to show highlighted text in it.
446  if (0 < hitCountInMiniChunk) {
447  int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
448  int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
449  s,
450  HIGHLIGHT_PRE,
451  // trim after the last hit in chunk
452  hitCountInChunk - hitCountInMiniChunk);
453  if (idx != -1) {
454  highlightedContent = s.substring(0, idx);
455  } else {
456  highlightedContent = s;
457  }
458  } else {
459  highlightedContent = s;
460  }
461  }
462  } else {
463  List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
464  if (contentHighlights == null) {
465  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
466  } else {
467  // extracted content (minus highlight tags) is HTML-escaped
468  highlightedContent = contentHighlights.get(0).trim();
469  }
470  }
471  }
472  }
473  highlightedContent = insertAnchors(highlightedContent);
474 
475  return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
476  } catch (TskCoreException | KeywordSearchModuleException | NoOpenCoreException ex) {
477  logger.log(Level.SEVERE, "Error getting highlighted text for Solr doc id " + solrObjectId + ", chunkID " + chunkID + ", highlight query: " + highlightField, ex); //NON-NLS
478  return Bundle.ExtractedText_errorMessage_errorGettingText();
479  }
480  }
481 
482  @Override
483  public String toString() {
484  return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.toString");
485  }
486 
487  @Override
488  public boolean isSearchable() {
489  return true;
490  }
491 
492  @Override
493  public String getAnchorPrefix() {
494  return ANCHOR_PREFIX;
495  }
496 
497  @Override
498  public int getNumberHits() {
499  if (!this.numberOfHitsPerPage.containsKey(this.currentPage)) {
500  return 0;
501  }
502  return this.numberOfHitsPerPage.get(this.currentPage);
503 
504  }
505 
520  static String attemptManualHighlighting(SolrDocumentList solrDocumentList, String highlightField, Collection<String> keywords) {
521  if (solrDocumentList.isEmpty()) {
522  return Bundle.ExtractedText_errorMessage_errorGettingText();
523  }
524 
525  // It doesn't make sense for there to be more than a single document in
526  // the list since this class presents a single page (document) of highlighted
527  // content at a time. Hence we can just use get(0).
528  String text = solrDocumentList.get(0).getOrDefault(highlightField, "").toString();
529 
530  // Escape any HTML content that may be in the text. This is needed in
531  // order to correctly display the text in the content viewer.
532  // Must be done before highlighting tags are added. If we were to
533  // perform HTML escaping after adding the highlighting tags we would
534  // not see highlighted text in the content viewer.
535  text = StringEscapeUtils.escapeHtml4(text);
536 
537  TreeRangeSet<Integer> highlights = TreeRangeSet.create();
538 
539  //for each keyword find the locations of hits and record them in the RangeSet
540  for (String keyword : keywords) {
541  //we also need to escape the keyword so that it matches the escaped text
542  final String escapedKeyword = StringEscapeUtils.escapeHtml4(keyword);
543  int searchOffset = 0;
544  int hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
545  while (hitOffset != -1) {
546  // Advance the search offset past the keyword.
547  searchOffset = hitOffset + escapedKeyword.length();
548 
549  //record the location of the hit, possibly merging it with other hits
550  highlights.add(Range.closedOpen(hitOffset, searchOffset));
551 
552  //look for next hit
553  hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
554  }
555  }
556 
557  StringBuilder highlightedText = new StringBuilder(text);
558  int totalHighLightLengthInserted = 0;
559  //for each range to be highlighted...
560  for (Range<Integer> highlightRange : highlights.asRanges()) {
561  int hStart = highlightRange.lowerEndpoint();
562  int hEnd = highlightRange.upperEndpoint();
563 
564  //insert the pre and post tag, adjusting indices for previously added tags
565  highlightedText.insert(hStart + totalHighLightLengthInserted, HIGHLIGHT_PRE);
566  totalHighLightLengthInserted += HIGHLIGHT_PRE.length();
567  highlightedText.insert(hEnd + totalHighLightLengthInserted, HIGHLIGHT_POST);
568  totalHighLightLengthInserted += HIGHLIGHT_POST.length();
569  }
570 
571  return highlightedText.toString();
572  }
573 
582  private String insertAnchors(String searchableContent) {
583  StringBuilder buf = new StringBuilder(searchableContent);
584  final String searchToken = HIGHLIGHT_PRE;
585  final int indexSearchTokLen = searchToken.length();
586  final String insertPre = "<a name='" + ANCHOR_PREFIX; //NON-NLS
587  final String insertPost = "'></a>"; //NON-NLS
588  int count = 0;
589  int searchOffset = 0;
590  int index = buf.indexOf(searchToken, searchOffset);
591  while (index >= 0) {
592  String insertString = insertPre + Integer.toString(count + 1) + insertPost;
593  int insertStringLen = insertString.length();
594  buf.insert(index, insertString);
595  searchOffset = index + indexSearchTokLen + insertStringLen; //next offset past this anchor
596  ++count;
597  index = buf.indexOf(searchToken, searchOffset);
598  }
599 
600  //store total hits for this page, now that we know it
601  this.numberOfHitsPerPage.put(this.currentPage, count);
602  if (this.currentItem() == 0 && this.hasNextItem()) {
603  this.nextItem();
604  }
605 
606  return buf.toString();
607  }
608 
624  private boolean shouldUseOriginalHighlighter(String filterQuery) throws NoOpenCoreException, KeywordSearchModuleException {
625  final SolrQuery q = new SolrQuery();
626  q.setQuery("*:*");
627  q.addFilterQuery(filterQuery);
628  q.setFields(Server.Schema.LANGUAGE.toString());
629 
630  QueryResponse response = solrServer.query(q, METHOD.POST);
631  SolrDocumentList solrDocuments = response.getResults();
632 
633  if (!solrDocuments.isEmpty()) {
634  SolrDocument solrDocument = solrDocuments.get(0);
635  if (solrDocument != null) {
636  Object languageField = solrDocument.getFieldValue(Server.Schema.LANGUAGE.toString());
637  if (languageField != null) {
638  return languageField.equals("ja");
639  }
640  }
641  }
642  return false;
643  }
644 }

Copyright © 2012-2022 Basis Technology. Generated on: Tue Jun 27 2023
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.