Autopsy  4.4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
HighlightedText.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2017 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.collect.Iterators;
22 import com.google.common.collect.Range;
23 import com.google.common.collect.RangeSet;
24 import com.google.common.collect.TreeRangeSet;
25 import java.util.Arrays;
26 import java.util.Collection;
27 import java.util.HashMap;
28 import java.util.HashSet;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.Set;
32 import java.util.TreeMap;
33 import java.util.logging.Level;
34 import java.util.stream.Collectors;
35 import javax.annotation.concurrent.GuardedBy;
36 import org.apache.commons.lang.StringEscapeUtils;
37 import org.apache.commons.lang.StringUtils;
38 import org.apache.commons.lang3.math.NumberUtils;
39 import org.apache.solr.client.solrj.SolrQuery;
40 import org.apache.solr.client.solrj.SolrRequest.METHOD;
41 import org.apache.solr.client.solrj.response.QueryResponse;
42 import org.apache.solr.common.SolrDocumentList;
43 import org.openide.util.NbBundle;
44 import org.openide.util.NbBundle.Messages;
48 import org.sleuthkit.datamodel.BlackboardArtifact;
49 import org.sleuthkit.datamodel.BlackboardAttribute;
50 import org.sleuthkit.datamodel.TskCoreException;
51 
56 class HighlightedText implements IndexedText {
57 
58  private static final Logger logger = Logger.getLogger(HighlightedText.class.getName());
59 
60  private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
61 
62  private static final BlackboardAttribute.Type TSK_KEYWORD_SEARCH_TYPE = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE);
63  private static final BlackboardAttribute.Type TSK_KEYWORD = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD);
64  static private final BlackboardAttribute.Type TSK_ASSOCIATED_ARTIFACT = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT);
65  static private final BlackboardAttribute.Type TSK_KEYWORD_REGEXP = new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP);
66 
67  private static final String HIGHLIGHT_PRE = "<span style='background:yellow'>"; //NON-NLS
68  private static final String HIGHLIGHT_POST = "</span>"; //NON-NLS
69  private static final String ANCHOR_PREFIX = HighlightedText.class.getName() + "_"; //NON-NLS
70 
71  final private Server solrServer = KeywordSearch.getServer();
72 
73  private final long objectId;
74  /*
75  * The keywords to highlight
76  */
77  private final Set<String> keywords = new HashSet<>();
78 
79  private int numberPages;
80  private Integer currentPage = 0;
81 
82  @GuardedBy("this")
83  private boolean isPageInfoLoaded = false;
84 
85  /*
86  * map from page/chunk to number of hits. value is 0 if not yet known.
87  */
88  private final TreeMap<Integer, Integer> numberOfHitsPerPage = new TreeMap<>();
89  /*
90  * set of pages, used for iterating back and forth. Only stores pages with
91  * hits
92  */
93  private final Set<Integer> pages = numberOfHitsPerPage.keySet();
94  /*
95  * map from page/chunk number to current hit on that page.
96  */
97  private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();
98 
99  private QueryResults hits = null; //original hits that may get passed in
100  private BlackboardArtifact artifact;
101  private KeywordSearch.QueryType qt;
102  private boolean isLiteral;
103 
/**
 * Creates highlighted text for an object from the results of a query that
 * has already been run.
 *
 * @param objectId id of the object whose indexed text will be displayed
 * @param hits     query results used to work out which pages have hits
 */
HighlightedText(long objectId, QueryResults hits) {
    this.hits = hits;
    this.objectId = objectId;
}
119 
128  HighlightedText(BlackboardArtifact artifact) throws TskCoreException {
129  this.artifact = artifact;
130  BlackboardAttribute attribute = artifact.getAttribute(TSK_ASSOCIATED_ARTIFACT);
131  if (attribute != null) {
132  this.objectId = attribute.getValueLong();
133  } else {
134  this.objectId = artifact.getObjectID();
135  }
136 
137  }
138 
143  @Messages({"HighlightedText.query.exception.msg=Could not perform the query to get chunk info and get highlights:"})
144  synchronized private void loadPageInfo() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
145  if (isPageInfoLoaded) {
146  return;
147  }
148 
149  this.numberPages = solrServer.queryNumFileChunks(this.objectId);
150 
151  if (artifact != null) {
152  loadPageInfoFromArtifact();
153  } else if (numberPages != 0) {
154  // if the file has chunks, get pages with hits, sorted
155  loadPageInfoFromHits();
156  } else {
157  //non-artifact, no chunks, everything is easy.
158  this.numberPages = 1;
159  this.currentPage = 1;
160  numberOfHitsPerPage.put(1, 0);
161  pages.add(1);
162  currentHitPerPage.put(1, 0);
163  isPageInfoLoaded = true;
164  }
165  }
166 
173  synchronized private void loadPageInfoFromArtifact() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
174  final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString();
175  this.keywords.add(keyword);
176 
177  //get the QueryType (if available)
178  final BlackboardAttribute queryTypeAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE);
179  qt = (queryTypeAttribute != null)
180  ? KeywordSearch.QueryType.values()[queryTypeAttribute.getValueInt()] : null;
181 
182  Keyword keywordQuery = null;
183  switch (qt) {
184  case LITERAL:
185  case SUBSTRING:
186  keywordQuery = new Keyword(keyword, true, true);
187  break;
188  case REGEX:
189  String regexp = artifact.getAttribute(TSK_KEYWORD_REGEXP).getValueString();
190  keywordQuery = new Keyword(regexp, false, false);
191  break;
192  }
193  KeywordSearchQuery chunksQuery = KeywordSearchUtil.getQueryForKeyword(keywordQuery, new KeywordList(Arrays.asList(keywordQuery)));
194  // Run a query to figure out which chunks for the current object have
195  // hits for this keyword.
196 
197  chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId));
198 
199  hits = chunksQuery.performQuery();
200  loadPageInfoFromHits();
201  }
202 
206  synchronized private void loadPageInfoFromHits() {
207  isLiteral = hits.getQuery().isLiteral();
208  //organize the hits by page, filter as needed
209  for (Keyword k : hits.getKeywords()) {
210  for (KeywordHit hit : hits.getResults(k)) {
211  int chunkID = hit.getChunkId();
212  if (artifact != null) {
213  if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) {
214  String hit1 = hit.getHit();
215  if (keywords.stream().anyMatch(hit1::contains)) {
216  numberOfHitsPerPage.put(chunkID, 0); //unknown number of matches in the page
217  currentHitPerPage.put(chunkID, 0); //set current hit to 0th
218 
219  }
220  }
221  } else {
222  if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) {
223 
224  numberOfHitsPerPage.put(chunkID, 0); //unknown number of matches in the page
225  currentHitPerPage.put(chunkID, 0); //set current hit to 0th
226 
227  if (StringUtils.isNotBlank(hit.getHit())) {
228  this.keywords.add(hit.getHit());
229  }
230  }
231  }
232  }
233  }
234 
235  //set page to first page having highlights
236  this.currentPage = pages.stream().findFirst().orElse(1);
237 
238  isPageInfoLoaded = true;
239  }
240 
249  static private String constructEscapedSolrQuery(String query) {
250  return LuceneQuery.HIGHLIGHT_FIELD + ":" + "\"" + KeywordSearchUtil.escapeLuceneQuery(query) + "\"";
251  }
252 
253  private int getIndexOfCurrentPage() {
254  return Iterators.indexOf(pages.iterator(), this.currentPage::equals);
255  }
256 
257  @Override
258  public int getNumberPages() {
259  //return number of pages that have hits
260  return this.numberPages;
261  }
262 
263  @Override
264  public int getCurrentPage() {
265  return this.currentPage;
266  }
267 
268  @Override
269  public boolean hasNextPage() {
270  return getIndexOfCurrentPage() < pages.size() - 1;
271  }
272 
273  @Override
274  public boolean hasPreviousPage() {
275  return getIndexOfCurrentPage() > 0;
276  }
277 
278  @Override
279  public int nextPage() {
280  if (hasNextPage()) {
281  currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() + 1);
282  return currentPage;
283  } else {
284  throw new IllegalStateException("No next page.");
285  }
286  }
287 
288  @Override
289  public int previousPage() {
290  if (hasPreviousPage()) {
291  currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() - 1);
292  return currentPage;
293  } else {
294  throw new IllegalStateException("No previous page.");
295  }
296  }
297 
298  @Override
299  public boolean hasNextItem() {
300  if (!this.currentHitPerPage.containsKey(currentPage)) {
301  return false;
302  }
303  return this.currentHitPerPage.get(currentPage) < this.numberOfHitsPerPage.get(currentPage);
304  }
305 
306  @Override
307  public boolean hasPreviousItem() {
308  if (!this.currentHitPerPage.containsKey(currentPage)) {
309  return false;
310  }
311  return this.currentHitPerPage.get(currentPage) > 1;
312  }
313 
314  @Override
315  public int nextItem() {
316  if (!hasNextItem()) {
317  throw new IllegalStateException("No next item.");
318  }
319  int cur = currentHitPerPage.get(currentPage) + 1;
320  currentHitPerPage.put(currentPage, cur);
321  return cur;
322  }
323 
324  @Override
325  public int previousItem() {
326  if (!hasPreviousItem()) {
327  throw new IllegalStateException("No previous item.");
328  }
329  int cur = currentHitPerPage.get(currentPage) - 1;
330  currentHitPerPage.put(currentPage, cur);
331  return cur;
332  }
333 
334  @Override
335  public int currentItem() {
336  if (!this.currentHitPerPage.containsKey(currentPage)) {
337  return 0;
338  }
339  return currentHitPerPage.get(currentPage);
340  }
341 
342  @Override
343  public String getText() {
344  String chunkID = "";
345  String highlightField = "";
346  try {
347  loadPageInfo(); //inits once
348  SolrQuery q = new SolrQuery();
349  q.setShowDebugInfo(DEBUG); //debug
350 
351  String contentIdStr = Long.toString(this.objectId);
352  if (numberPages != 0) {
353  chunkID = Integer.toString(this.currentPage);
354  contentIdStr += "0".equals(chunkID) ? "" : "_" + chunkID;
355  }
356  final String filterQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
357 
358  double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
359  //choose field to highlight based on isLiteral and Solr index schema version.
360  highlightField = (isLiteral || (indexSchemaVersion < 2.0))
361  ? LuceneQuery.HIGHLIGHT_FIELD
362  : Server.Schema.CONTENT_STR.toString();
363  if (isLiteral) {
364  //if the query is literal try to get solr to do the highlighting
365  final String highlightQuery = keywords.stream()
366  .map(HighlightedText::constructEscapedSolrQuery)
367  .collect(Collectors.joining(" "));
368 
369  q.setQuery(highlightQuery);
370  q.addField(highlightField);
371  q.addFilterQuery(filterQuery);
372  q.addHighlightField(highlightField);
373  q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH
374 
375  //tune the highlighter
376  q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
377  q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
378  q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
379  q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS
380 
381  //docs says makes sense for the original Highlighter only, but not really
382  q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
383  } else {
384  /*
385  * if the query is not literal just pull back the text. We will
386  * do the highlighting in autopsy.
387  */
388  q.setQuery(filterQuery);
389  q.addField(highlightField);
390  }
391 
392  QueryResponse response = solrServer.query(q, METHOD.POST);
393 
394  // There should never be more than one document since there will
395  // either be a single chunk containing hits or we narrow our
396  // query down to the current page/chunk.
397  if (response.getResults().size() > 1) {
398  logger.log(Level.WARNING, "Unexpected number of results for Solr highlighting query: {0}", q); //NON-NLS
399  }
400  String highlightedContent;
401  Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
402 
403  if (responseHighlight == null) {
404  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
405  } else {
406  Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
407 
408  if (responseHighlightID == null) {
409  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
410  } else {
411  List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
412  if (contentHighlights == null) {
413  highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
414  } else {
415  // extracted content (minus highlight tags) is HTML-escaped
416  highlightedContent = contentHighlights.get(0).trim();
417  }
418  }
419  }
420  highlightedContent = insertAnchors(highlightedContent);
421 
422  return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
423  } catch (TskCoreException | KeywordSearchModuleException | NoOpenCoreException ex) {
424  logger.log(Level.SEVERE, "Error getting highlighted text for Solr doc id " + objectId + ", chunkID " + chunkID + ", highlight query: " + highlightField, ex); //NON-NLS
425  return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.queryFailedMsg");
426  }
427  }
428 
429  @Override
430  public String toString() {
431  return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.toString");
432  }
433 
434  @Override
435  public boolean isSearchable() {
436  return true;
437  }
438 
439  @Override
440  public String getAnchorPrefix() {
441  return ANCHOR_PREFIX;
442  }
443 
444  @Override
445  public int getNumberHits() {
446  if (!this.numberOfHitsPerPage.containsKey(this.currentPage)) {
447  return 0;
448  }
449  return this.numberOfHitsPerPage.get(this.currentPage);
450 
451  }
452 
466  static String attemptManualHighlighting(SolrDocumentList solrDocumentList, String highlightField, Collection<String> keywords) {
467  if (solrDocumentList.isEmpty()) {
468  return NbBundle.getMessage(HighlightedText.class, "HighlightedMatchesSource.getMarkup.noMatchMsg");
469  }
470 
471  // It doesn't make sense for there to be more than a single document in
472  // the list since this class presents a single page (document) of highlighted
473  // content at a time. Hence we can just use get(0).
474  String text = solrDocumentList.get(0).getOrDefault(highlightField, "").toString();
475 
476  // Escape any HTML content that may be in the text. This is needed in
477  // order to correctly display the text in the content viewer.
478  // Must be done before highlighting tags are added. If we were to
479  // perform HTML escaping after adding the highlighting tags we would
480  // not see highlighted text in the content viewer.
481  text = StringEscapeUtils.escapeHtml(text);
482 
483  TreeRangeSet<Integer> highlights = TreeRangeSet.create();
484 
485  //for each keyword find the locations of hits and record them in the RangeSet
486  for (String keyword : keywords) {
487  //we also need to escape the keyword so that it matches the escaped text
488  final String escapedKeyword = StringEscapeUtils.escapeHtml(keyword);
489  int searchOffset = 0;
490  int hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
491  while (hitOffset != -1) {
492  // Advance the search offset past the keyword.
493  searchOffset = hitOffset + escapedKeyword.length();
494 
495  //record the location of the hit, possibly merging it with other hits
496  highlights.add(Range.closedOpen(hitOffset, searchOffset));
497 
498  //look for next hit
499  hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
500  }
501  }
502 
503  StringBuilder highlightedText = new StringBuilder(text);
504  int totalHighLightLengthInserted = 0;
505  //for each range to be highlighted...
506  for (Range<Integer> highlightRange : highlights.asRanges()) {
507  int hStart = highlightRange.lowerEndpoint();
508  int hEnd = highlightRange.upperEndpoint();
509 
510  //insert the pre and post tag, adjusting indices for previously added tags
511  highlightedText.insert(hStart + totalHighLightLengthInserted, HIGHLIGHT_PRE);
512  totalHighLightLengthInserted += HIGHLIGHT_PRE.length();
513  highlightedText.insert(hEnd + totalHighLightLengthInserted, HIGHLIGHT_POST);
514  totalHighLightLengthInserted += HIGHLIGHT_POST.length();
515  }
516 
517  return highlightedText.toString();
518  }
519 
528  private String insertAnchors(String searchableContent) {
529  StringBuilder buf = new StringBuilder(searchableContent);
530  final String searchToken = HIGHLIGHT_PRE;
531  final int indexSearchTokLen = searchToken.length();
532  final String insertPre = "<a name='" + ANCHOR_PREFIX; //NON-NLS
533  final String insertPost = "'></a>"; //NON-NLS
534  int count = 0;
535  int searchOffset = 0;
536  int index = buf.indexOf(searchToken, searchOffset);
537  while (index >= 0) {
538  String insertString = insertPre + Integer.toString(count + 1) + insertPost;
539  int insertStringLen = insertString.length();
540  buf.insert(index, insertString);
541  searchOffset = index + indexSearchTokLen + insertStringLen; //next offset past this anchor
542  ++count;
543  index = buf.indexOf(searchToken, searchOffset);
544  }
545 
546  //store total hits for this page, now that we know it
547  this.numberOfHitsPerPage.put(this.currentPage, count);
548  if (this.currentItem() == 0 && this.hasNextItem()) {
549  this.nextItem();
550  }
551 
552  return buf.toString();
553  }
554 
555 }

Copyright © 2012-2016 Basis Technology. Generated on: Fri Sep 29 2017
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.