19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.collect.Iterators;
22 import com.google.common.collect.Range;
23 import com.google.common.collect.TreeRangeSet;
24 import java.util.Arrays;
25 import java.util.Collection;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.List;
31 import java.util.TreeMap;
32 import java.util.logging.Level;
33 import java.util.stream.Collectors;
34 import javax.annotation.concurrent.GuardedBy;
35 import org.apache.commons.text.StringEscapeUtils;
36 import org.apache.commons.lang.StringUtils;
37 import org.apache.commons.lang3.math.NumberUtils;
38 import org.apache.solr.client.solrj.SolrQuery;
39 import org.apache.solr.client.solrj.SolrRequest.METHOD;
40 import org.apache.solr.client.solrj.response.QueryResponse;
41 import org.apache.solr.common.SolrDocument;
42 import org.apache.solr.common.SolrDocumentList;
43 import org.openide.util.NbBundle;
55 class HighlightedText
implements ExtractedText {
57 private static final Logger logger = Logger.getLogger(HighlightedText.class.getName());
59 private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
61 private static final BlackboardAttribute.Type TSK_KEYWORD_SEARCH_TYPE =
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_SEARCH_TYPE);
62 private static final BlackboardAttribute.Type TSK_KEYWORD =
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD);
63 static private final BlackboardAttribute.Type TSK_ASSOCIATED_ARTIFACT =
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT);
64 static private final BlackboardAttribute.Type TSK_KEYWORD_REGEXP =
new BlackboardAttribute.Type(BlackboardAttribute.ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP);
66 private static final String HIGHLIGHT_PRE =
"<span style='background:yellow'>";
67 private static final String HIGHLIGHT_POST =
"</span>";
68 private static final String ANCHOR_PREFIX = HighlightedText.class.getName() +
"_";
70 final private Server solrServer = KeywordSearch.getServer();
72 private final long solrObjectId;
76 private final Set<String> keywords =
new HashSet<>();
78 private int numberPages;
79 private Integer currentPage = 0;
82 private
boolean isPageInfoLoaded = false;
87 private final TreeMap<Integer, Integer> numberOfHitsPerPage = new TreeMap<>();
92 private final Set<Integer> pages = numberOfHitsPerPage.keySet();
96 private final HashMap<Integer, Integer> currentHitPerPage = new HashMap<>();
98 private QueryResults hits = null;
99 private BlackboardArtifact artifact;
100 private KeywordSearch.QueryType qt;
101 private
boolean isLiteral;
/**
 * Creates a highlighted-text source for the content with the given Solr
 * object id, based on an existing set of query results.
 *
 * @param solrObjectId The Solr object id of the content to display.
 * @param hits         The query results whose hits should be highlighted.
 */
HighlightedText(long solrObjectId, QueryResults hits) {
    this.solrObjectId = solrObjectId;
    // Keep the results: loadPageInfoFromHits() reads this.hits, and on this
    // construction path (no artifact) nothing else ever assigns it.
    this.hits = hits;
}
127 HighlightedText(BlackboardArtifact artifact)
throws TskCoreException {
128 this.artifact = artifact;
129 BlackboardAttribute attribute = artifact.getAttribute(TSK_ASSOCIATED_ARTIFACT);
130 if (attribute != null) {
131 this.solrObjectId = attribute.getValueLong();
133 this.solrObjectId = artifact.getObjectID();
142 synchronized private void loadPageInfo() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
143 if (isPageInfoLoaded) {
147 this.numberPages = solrServer.queryNumFileChunks(this.solrObjectId);
149 if (artifact != null) {
150 loadPageInfoFromArtifact();
151 }
else if (numberPages != 0) {
153 loadPageInfoFromHits();
156 this.numberPages = 1;
157 this.currentPage = 1;
158 numberOfHitsPerPage.put(1, 0);
159 currentHitPerPage.put(1, 0);
160 isPageInfoLoaded =
true;
// Loads page info for the artifact-based case: reads the keyword (and query
// type) from the artifact's attributes, runs a keyword query restricted to
// this Solr object, stores the results in 'hits' and delegates to
// loadPageInfoFromHits().
// NOTE(review): the statements that choose between the literal Keyword and
// the regexp Keyword below are not visible in this view; as shown, both
// assignments would run unconditionally. Confirm against the full file.
170 synchronized private void loadPageInfoFromArtifact() throws TskCoreException, KeywordSearchModuleException, NoOpenCoreException {
// The keyword recorded on the artifact is always one of the terms to highlight.
171 final String keyword = artifact.getAttribute(TSK_KEYWORD).getValueString();
172 this.keywords.add(keyword);
// The search-type attribute holds an index into QueryType.values(); qt is null when the attribute is absent.
175 final BlackboardAttribute queryTypeAttribute = artifact.getAttribute(TSK_KEYWORD_SEARCH_TYPE);
176 qt = (queryTypeAttribute != null)
177 ? KeywordSearch.QueryType.values()[queryTypeAttribute.getValueInt()] : null;
179 Keyword keywordQuery = null;
// Keyword built from the keyword text itself (both boolean flags true).
183 keywordQuery =
new Keyword(keyword,
true,
true);
// Keyword built from the TSK_KEYWORD_REGEXP attribute (both boolean flags false).
186 String regexp = artifact.getAttribute(TSK_KEYWORD_REGEXP).getValueString();
187 keywordQuery =
new Keyword(regexp,
false,
false);
190 KeywordSearchQuery chunksQuery = KeywordSearchUtil.getQueryForKeyword(keywordQuery,
new KeywordList(Arrays.asList(keywordQuery)));
// Restrict the query to the chunks of this Solr object.
194 chunksQuery.addFilter(
new KeywordQueryFilter(FilterType.CHUNK,
this.solrObjectId));
196 hits = chunksQuery.performQuery();
197 loadPageInfoFromHits();
// Builds the per-page bookkeeping from 'hits': every chunk of this Solr
// object that has a hit becomes a page with hit count 0 and current hit 0,
// then the first tracked page (or 1 if there is none) becomes the current
// page and the page info is marked as loaded.
// NOTE(review): closing braces and the 'else' separating the two
// "chunkID != 0" branches are not visible in this view.
203 synchronized private void loadPageInfoFromHits() {
204 isLiteral = hits.getQuery().isLiteral();
// Walk every hit of every keyword in the results.
212 for (Keyword k : hits.getKeywords()) {
213 for (KeywordHit hit : hits.getResults(k)) {
214 int chunkID = hit.getChunkId();
215 if (artifact != null) {
// Artifact case: only register chunks of this object whose hit text contains one of the known keywords.
216 if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
217 String hit1 = hit.getHit();
218 if (keywords.stream().anyMatch(hit1::contains)) {
// 0 is a placeholder count; insertAnchors() stores the real count when the page text is rendered.
219 numberOfHitsPerPage.put(chunkID, 0);
220 currentHitPerPage.put(chunkID, 0);
// Presumably the non-artifact case: register the chunk and collect the hit text as a keyword to highlight.
225 if (chunkID != 0 && this.solrObjectId == hit.getSolrObjectId()) {
227 numberOfHitsPerPage.put(chunkID, 0);
228 currentHitPerPage.put(chunkID, 0);
230 if (StringUtils.isNotBlank(hit.getHit())) {
231 this.keywords.add(hit.getHit());
// 'pages' is the key set of the sorted numberOfHitsPerPage map, so findFirst() yields the lowest page number.
239 this.currentPage = pages.stream().findFirst().orElse(1);
241 isPageInfoLoaded =
true;
252 static private String constructEscapedSolrQuery(String query) {
253 return LuceneQuery.HIGHLIGHT_FIELD +
":" +
"\"" + KeywordSearchUtil.escapeLuceneQuery(query) +
"\"";
256 private int getIndexOfCurrentPage() {
257 return Iterators.indexOf(pages.iterator(), this.currentPage::equals);
261 public int getNumberPages() {
263 return this.numberPages;
267 public int getCurrentPage() {
268 return this.currentPage;
272 public boolean hasNextPage() {
273 return getIndexOfCurrentPage() < pages.size() - 1;
277 public boolean hasPreviousPage() {
278 return getIndexOfCurrentPage() > 0;
282 public int nextPage() {
284 currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() + 1);
287 throw new IllegalStateException(
"No next page.");
292 public int previousPage() {
293 if (hasPreviousPage()) {
294 currentPage = Iterators.get(pages.iterator(), getIndexOfCurrentPage() - 1);
297 throw new IllegalStateException(
"No previous page.");
302 public boolean hasNextItem() {
303 if (!this.currentHitPerPage.containsKey(currentPage)) {
306 return this.currentHitPerPage.get(currentPage) < this.numberOfHitsPerPage.get(currentPage);
310 public boolean hasPreviousItem() {
311 if (!this.currentHitPerPage.containsKey(currentPage)) {
314 return this.currentHitPerPage.get(currentPage) > 1;
318 public int nextItem() {
319 if (!hasNextItem()) {
320 throw new IllegalStateException(
"No next item.");
322 int cur = currentHitPerPage.get(currentPage) + 1;
323 currentHitPerPage.put(currentPage, cur);
328 public int previousItem() {
329 if (!hasPreviousItem()) {
330 throw new IllegalStateException(
"No previous item.");
332 int cur = currentHitPerPage.get(currentPage) - 1;
333 currentHitPerPage.put(currentPage, cur);
338 public int currentItem() {
339 if (!this.currentHitPerPage.containsKey(currentPage)) {
342 return currentHitPerPage.get(currentPage);
// Produces the HTML for the current page: queries Solr for the page's
// document, lets Solr highlight the keywords where possible, falls back to
// attemptManualHighlighting() when Solr returns no highlighting, inserts an
// anchor before every hit and wraps the result in <html><pre>...</pre></html>.
// If one of the caught exceptions occurs, it is logged and a bundle error
// message is returned instead.
// NOTE(review): several lines are not visible in this view - the declaration
// of 'chunkID', the 'try' matching the 'catch' at the end, and the closing
// braces / 'else' lines between alternative branches. Confirm against the
// full file before changing control flow.
346 public String getText() {
348 String highlightField =
"";
// Schema version of the index; 2.2 is the threshold used below for the language-specific query path.
350 double indexSchemaVersion = NumberUtils.toDouble(solrServer.getIndexInfo().getSchemaVersion());
353 SolrQuery q =
new SolrQuery();
354 q.setShowDebugInfo(DEBUG);
// Document id: "<objectId>" alone, or "<objectId>_<page>" when the content has chunks and the page is not "0".
356 String contentIdStr = Long.toString(this.solrObjectId);
357 if (numberPages != 0) {
358 chunkID = Integer.toString(this.currentPage);
359 contentIdStr +=
"0".equals(chunkID) ?
"" :
"_" + chunkID;
// Filter that restricts the query to exactly this document id.
361 final String filterQuery = Server.Schema.ID.toString() +
":" + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
363 highlightField = LuceneQuery.HIGHLIGHT_FIELD;
365 if (2.2 <= indexSchemaVersion) {
// Newer schema: OR together the escaped, quoted and expanded keywords, and request/highlight every language-specific query field.
367 final String highlightQuery = keywords.stream().map(s ->
368 LanguageSpecificContentQueryHelper.expandQueryString(KeywordSearchUtil.quoteQuery(KeywordSearchUtil.escapeLuceneQuery(s))))
369 .collect(Collectors.joining(
" OR "));
370 q.setQuery(highlightQuery);
371 for (Server.Schema field : LanguageSpecificContentQueryHelper.getQueryFields()) {
372 q.addField(field.toString());
373 q.addHighlightField(field.toString());
375 q.addField(Server.Schema.LANGUAGE.toString());
// Only the first keyword is passed to the term-frequency configuration.
377 LanguageSpecificContentQueryHelper.configureTermfreqQuery(q, keywords.iterator().next());
378 q.addFilterQuery(filterQuery);
// Fragment size 0 - presumably asks Solr to return the whole field rather than snippets; confirm against the Solr highlighting docs.
379 q.setHighlightFragsize(0);
// Presumably the older-schema alternative: query and highlight the single highlight field with space-joined phrase queries.
382 final String highlightQuery = keywords.stream()
383 .map(HighlightedText::constructEscapedSolrQuery)
384 .collect(Collectors.joining(
" "));
386 q.setQuery(highlightQuery);
387 q.addField(highlightField);
388 q.addFilterQuery(filterQuery);
389 q.addHighlightField(highlightField);
390 q.setHighlightFragsize(0);
// Choose the highlighter implementation and register the pre/post markup under the parameter names used with it.
394 if (shouldUseOriginalHighlighter(filterQuery)) {
396 q.setParam(
"hl.useFastVectorHighlighter",
"off");
397 q.setParam(
"hl.simple.pre", HIGHLIGHT_PRE);
398 q.setParam(
"hl.simple.post", HIGHLIGHT_POST);
// Alternative: fast vector highlighter with the "single" fragment list builder.
400 q.setParam(
"hl.useFastVectorHighlighter",
"on");
401 q.setParam(
"hl.tag.pre", HIGHLIGHT_PRE);
402 q.setParam(
"hl.tag.post", HIGHLIGHT_POST);
403 q.setParam(
"hl.fragListBuilder",
"single");
407 q.setParam(
"hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED);
// Plain retrieval: query by document id and fetch the highlight field, with no highlighting requested.
// NOTE(review): the condition selecting this path over the highlighting setup above is not visible here.
413 q.setQuery(filterQuery);
414 q.addField(highlightField);
417 QueryResponse response = solrServer.query(q, METHOD.POST);
// The filter targets a single document id, so more than one result is unexpected; it is only logged.
422 if (response.getResults().size() > 1) {
423 logger.log(Level.WARNING,
"Unexpected number of results for Solr highlighting query: {0}", q);
425 String highlightedContent;
426 Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
// No highlighting section at all: highlight the returned text manually.
428 if (responseHighlight == null) {
429 highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
431 Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
// No highlighting entry for this document id: highlight manually as well.
433 if (responseHighlightID == null) {
434 highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
436 SolrDocument document = response.getResults().get(0);
437 Object language = document.getFieldValue(Server.Schema.LANGUAGE.toString());
438 if (2.2 <= indexSchemaVersion && language != null) {
// Language-specific path: take the highlights via the helper; when there are none the content is the empty string.
439 List<String> contentHighlights = LanguageSpecificContentQueryHelper.getHighlights(responseHighlightID).orElse(null);
440 if (contentHighlights == null) {
441 highlightedContent =
"";
443 int hitCountInMiniChunk = LanguageSpecificContentQueryHelper.queryChunkTermfreq(keywords, MiniChunkHelper.getChunkIdString(contentIdStr));
444 String s = contentHighlights.get(0).trim();
// When the mini-chunk contains hits, the highlighted text is cut at the index returned by findNthIndexOf (s.substring(0, idx)).
// NOTE(review): the leading arguments of the findNthIndexOf call are not visible in this view.
446 if (0 < hitCountInMiniChunk) {
447 int hitCountInChunk = ((Float) document.getFieldValue(Server.Schema.TERMFREQ.toString())).intValue();
448 int idx = LanguageSpecificContentQueryHelper.findNthIndexOf(
452 hitCountInChunk - hitCountInMiniChunk);
454 highlightedContent = s.substring(0, idx);
456 highlightedContent = s;
459 highlightedContent = s;
// Otherwise: read the highlights of the single highlight field, falling back to manual highlighting when absent.
463 List<String> contentHighlights = responseHighlightID.get(LuceneQuery.HIGHLIGHT_FIELD);
464 if (contentHighlights == null) {
465 highlightedContent = attemptManualHighlighting(response.getResults(), highlightField, keywords);
468 highlightedContent = contentHighlights.get(0).trim();
// Insert an anchor before each hit (this also records the hit count for the current page).
473 highlightedContent = insertAnchors(highlightedContent);
475 return "<html><pre>" + highlightedContent +
"</pre></html>";
476 }
catch (TskCoreException | KeywordSearchModuleException | NoOpenCoreException ex) {
477 logger.log(Level.SEVERE,
"Error getting highlighted text for Solr doc id " + solrObjectId +
", chunkID " + chunkID +
", highlight query: " + highlightField, ex);
478 return Bundle.ExtractedText_errorMessage_errorGettingText();
483 public String toString() {
484 return NbBundle.getMessage(this.getClass(),
"HighlightedMatchesSource.toString");
// Reports whether this text source is searchable.
// NOTE(review): only the signature is visible in this view; the method body
// (and therefore the returned value) is not shown - confirm in the full file.
488 public boolean isSearchable() {
493 public String getAnchorPrefix() {
494 return ANCHOR_PREFIX;
498 public int getNumberHits() {
499 if (!this.numberOfHitsPerPage.containsKey(
this.currentPage)) {
502 return this.numberOfHitsPerPage.get(this.currentPage);
520 static String attemptManualHighlighting(SolrDocumentList solrDocumentList, String highlightField, Collection<String> keywords) {
521 if (solrDocumentList.isEmpty()) {
522 return Bundle.ExtractedText_errorMessage_errorGettingText();
528 String text = solrDocumentList.get(0).getOrDefault(highlightField,
"").toString();
535 text = StringEscapeUtils.escapeHtml4(text);
537 TreeRangeSet<Integer> highlights = TreeRangeSet.create();
540 for (String keyword : keywords) {
542 final String escapedKeyword = StringEscapeUtils.escapeHtml4(keyword);
543 int searchOffset = 0;
544 int hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
545 while (hitOffset != -1) {
547 searchOffset = hitOffset + escapedKeyword.length();
550 highlights.add(Range.closedOpen(hitOffset, searchOffset));
553 hitOffset = StringUtils.indexOfIgnoreCase(text, escapedKeyword, searchOffset);
557 StringBuilder highlightedText =
new StringBuilder(text);
558 int totalHighLightLengthInserted = 0;
560 for (Range<Integer> highlightRange : highlights.asRanges()) {
561 int hStart = highlightRange.lowerEndpoint();
562 int hEnd = highlightRange.upperEndpoint();
565 highlightedText.insert(hStart + totalHighLightLengthInserted, HIGHLIGHT_PRE);
566 totalHighLightLengthInserted += HIGHLIGHT_PRE.length();
567 highlightedText.insert(hEnd + totalHighLightLengthInserted, HIGHLIGHT_POST);
568 totalHighLightLengthInserted += HIGHLIGHT_POST.length();
571 return highlightedText.toString();
// Inserts a named HTML anchor (<a name='<ANCHOR_PREFIX><n>'></a>) in front of
// each occurrence of the highlight opening tag in the given content, records
// the resulting count as the number of hits on the current page, and returns
// the modified content.
// NOTE(review): the declaration/increment of 'count', the loop header around
// the insert statements and the body of the final 'if' are not visible in
// this view - confirm against the full file.
582 private String insertAnchors(String searchableContent) {
583 StringBuilder buf =
new StringBuilder(searchableContent);
// Each hit is located by searching for the highlight opening tag.
584 final String searchToken = HIGHLIGHT_PRE;
585 final int indexSearchTokLen = searchToken.length();
586 final String insertPre =
"<a name='" + ANCHOR_PREFIX;
587 final String insertPost =
"'></a>";
589 int searchOffset = 0;
590 int index = buf.indexOf(searchToken, searchOffset);
// Anchor numbers are 1-based (count + 1).
592 String insertString = insertPre + Integer.toString(count + 1) + insertPost;
593 int insertStringLen = insertString.length();
594 buf.insert(index, insertString);
// Resume searching after the anchor just inserted and the highlight tag that follows it.
595 searchOffset = index + indexSearchTokLen + insertStringLen;
597 index = buf.indexOf(searchToken, searchOffset);
// Replace the placeholder hit count for this page with the number of anchors.
601 this.numberOfHitsPerPage.put(this.currentPage, count);
// No hit is selected yet (index 0) but the page has hits.
602 if (this.currentItem() == 0 && this.hasNextItem()) {
606 return buf.toString();
624 private boolean shouldUseOriginalHighlighter(String filterQuery)
throws NoOpenCoreException, KeywordSearchModuleException {
625 final SolrQuery q =
new SolrQuery();
627 q.addFilterQuery(filterQuery);
628 q.setFields(Server.Schema.LANGUAGE.toString());
630 QueryResponse response = solrServer.query(q, METHOD.POST);
631 SolrDocumentList solrDocuments = response.getResults();
633 if (!solrDocuments.isEmpty()) {
634 SolrDocument solrDocument = solrDocuments.get(0);
635 if (solrDocument != null) {
636 Object languageField = solrDocument.getFieldValue(Server.Schema.LANGUAGE.toString());
637 if (languageField != null) {
638 return languageField.equals(
"ja");