Autopsy  4.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
LuceneQuery.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2015 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.util.ArrayList;
22 import java.util.Collection;
23 import java.util.Collections;
24 import java.util.Comparator;
25 import java.util.List;
26 import java.util.Map;
27 import java.util.Set;
28 import java.util.TreeSet;
29 import java.util.logging.Level;
30 import org.apache.solr.client.solrj.SolrQuery;
31 import org.apache.solr.client.solrj.SolrRequest.METHOD;
32 import org.apache.solr.client.solrj.response.QueryResponse;
33 import org.apache.solr.common.SolrDocument;
34 import org.apache.solr.common.SolrDocumentList;
35 import org.openide.util.NbBundle;
41 import org.sleuthkit.datamodel.BlackboardArtifact;
42 import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
43 import org.sleuthkit.datamodel.BlackboardAttribute;
44 import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
45 import org.sleuthkit.datamodel.SleuthkitCase;
46 import org.sleuthkit.datamodel.TskException;
47 
52 class LuceneQuery implements KeywordSearchQuery {
53 
54  private static final Logger logger = Logger.getLogger(LuceneQuery.class.getName());
55  private final String keywordString; //original unescaped query
56  private String keywordStringEscaped;
57  private boolean isEscaped;
58  private Keyword keyword = null;
59  private KeywordList keywordList = null;
60  private final List<KeywordQueryFilter> filters = new ArrayList<>();
61  private String field = null;
62  private static final int MAX_RESULTS = 20000;
63  static final int SNIPPET_LENGTH = 50;
64  //can use different highlight schema fields for regex and literal search
65  static final String HIGHLIGHT_FIELD_LITERAL = Server.Schema.TEXT.toString();
66  static final String HIGHLIGHT_FIELD_REGEX = Server.Schema.TEXT.toString();
67  //TODO use content_ws stored="true" in solr schema for perfect highlight hits
68  //static final String HIGHLIGHT_FIELD_REGEX = Server.Schema.CONTENT_WS.toString()
69 
70  private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
71 
77  public LuceneQuery(KeywordList keywordList, Keyword keyword) {
78  this.keywordList = keywordList;
79  this.keyword = keyword;
80 
81  // @@@ BC: Long-term, we should try to get rid of this string and use only the
82  // keyword object. Refactoring did not make its way through this yet.
83  this.keywordString = keyword.getSearchTerm();
84  this.keywordStringEscaped = this.keywordString;
85  }
86 
87  @Override
88  public void addFilter(KeywordQueryFilter filter) {
89  this.filters.add(filter);
90  }
91 
92  @Override
93  public void setField(String field) {
94  this.field = field;
95  }
96 
97  @Override
98  public void setSubstringQuery() {
99  // Note that this is not a full substring search. Normally substring
100  // searches will be done with TermComponentQuery objects instead.
101  keywordStringEscaped = keywordStringEscaped + "*";
102  }
103 
104  @Override
105  public void escape() {
106  keywordStringEscaped = KeywordSearchUtil.escapeLuceneQuery(keywordString);
107  isEscaped = true;
108  }
109 
110  @Override
111  public boolean isEscaped() {
112  return isEscaped;
113  }
114 
115  @Override
116  public boolean isLiteral() {
117  return true;
118  }
119 
120  @Override
121  public String getEscapedQueryString() {
122  return this.keywordStringEscaped;
123  }
124 
125  @Override
126  public String getQueryString() {
127  return this.keywordString;
128  }
129 
130  @Override
131  public QueryResults performQuery() throws NoOpenCoreException {
132  QueryResults results = new QueryResults(this, keywordList);
133  //in case of single term literal query there is only 1 term
134  boolean showSnippets = KeywordSearchSettings.getShowSnippets();
135  results.addResult(new Keyword(keywordString, true), performLuceneQuery(showSnippets));
136 
137  return results;
138  }
139 
140  @Override
141  public boolean validate() {
142  return keywordString != null && !keywordString.equals("");
143  }
144 
145  @Override
146  public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(String termHit, KeywordHit hit, String snippet, String listName) {
147  final String MODULE_NAME = KeywordSearchModuleFactory.getModuleName();
148 
149  Collection<BlackboardAttribute> attributes = new ArrayList<>();
150  BlackboardArtifact bba;
151  KeywordCachedArtifact writeResult;
152  try {
153  bba = hit.getContent().newArtifact(ARTIFACT_TYPE.TSK_KEYWORD_HIT);
154  writeResult = new KeywordCachedArtifact(bba);
155  } catch (Exception e) {
156  logger.log(Level.WARNING, "Error adding bb artifact for keyword hit", e); //NON-NLS
157  return null;
158  }
159 
160  if (snippet != null) {
161  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW, MODULE_NAME, snippet));
162  }
163  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, termHit));
164  if ((listName != null) && (listName.equals("") == false)) {
165  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_SET_NAME, MODULE_NAME, listName));
166  }
167 
168  //bogus - workaround the dir tree table issue
169  //attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP.getTypeID(), MODULE_NAME, "", ""));
170  //selector
171  if (keyword != null) {
172  BlackboardAttribute.ATTRIBUTE_TYPE selType = keyword.getArtifactAttributeType();
173  if (selType != null) {
174  attributes.add(new BlackboardAttribute(selType, MODULE_NAME, termHit));
175  }
176  }
177 
178  if (hit.isArtifactHit()) {
179  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT, MODULE_NAME, hit.getArtifact().getArtifactID()));
180  }
181 
182  try {
183  bba.addAttributes(attributes); //write out to bb
184  writeResult.add(attributes);
185  return writeResult;
186  } catch (TskException e) {
187  logger.log(Level.WARNING, "Error adding bb attributes to artifact", e); //NON-NLS
188  }
189  return null;
190  }
191 
202  private List<KeywordHit> performLuceneQuery(boolean snippets) throws NoOpenCoreException {
203  List<KeywordHit> matches = new ArrayList<>();
204  boolean allMatchesFetched = false;
205  final Server solrServer = KeywordSearch.getServer();
206 
207  SolrQuery q = createAndConfigureSolrQuery(snippets);
208  QueryResponse response;
209  SolrDocumentList resultList;
210  Map<String, Map<String, List<String>>> highlightResponse;
211  Set<SolrDocument> uniqueSolrDocumentsWithHits;
212 
213  try {
214  response = solrServer.query(q, METHOD.POST);
215 
216  resultList = response.getResults();
217 
218  // objectId_chunk -> "text" -> List of previews
219  highlightResponse = response.getHighlighting();
220 
221  // get the unique set of files with hits
222  uniqueSolrDocumentsWithHits = filterOneHitPerDocument(resultList);
223  } catch (KeywordSearchModuleException ex) {
224  logger.log(Level.SEVERE, "Error executing Lucene Solr Query: " + keywordString, ex); //NON-NLS
225  MessageNotifyUtil.Notify.error(NbBundle.getMessage(Server.class, "Server.query.exception.msg", keywordString), ex.getCause().getMessage());
226  return matches;
227  }
228 
229  // cycle through results in sets of MAX_RESULTS
230  for (int start = 0; !allMatchesFetched; start = start + MAX_RESULTS) {
231  q.setStart(start);
232 
233  allMatchesFetched = start + MAX_RESULTS >= resultList.getNumFound();
234 
235  SleuthkitCase sleuthkitCase;
236  try {
237  sleuthkitCase = Case.getCurrentCase().getSleuthkitCase();
238  } catch (IllegalStateException ex) {
239  //no case open, must be just closed
240  return matches;
241  }
242 
243  for (SolrDocument resultDoc : uniqueSolrDocumentsWithHits) {
244  KeywordHit contentHit;
245  try {
246  contentHit = createKeywordtHit(resultDoc, highlightResponse, sleuthkitCase);
247  } catch (TskException ex) {
248  return matches;
249  }
250  matches.add(contentHit);
251  }
252  }
253  return matches;
254  }
255 
263  private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
264  SolrQuery q = new SolrQuery();
265  q.setShowDebugInfo(DEBUG); //debug
266  //set query, force quotes/grouping around all literal queries
267  final String groupedQuery = KeywordSearchUtil.quoteQuery(keywordStringEscaped);
268  String theQueryStr = groupedQuery;
269  if (field != null) {
270  //use the optional field
271  StringBuilder sb = new StringBuilder();
272  sb.append(field).append(":").append(groupedQuery);
273  theQueryStr = sb.toString();
274  }
275  q.setQuery(theQueryStr);
276  q.setRows(MAX_RESULTS);
277 
278  q.setFields(Server.Schema.ID.toString());
279 
280  for (KeywordQueryFilter filter : filters) {
281  q.addFilterQuery(filter.toString());
282  }
283 
284  if (snippets) {
285  q.addHighlightField(Server.Schema.TEXT.toString());
286  //q.setHighlightSimplePre("&laquo;"); //original highlighter only
287  //q.setHighlightSimplePost("&raquo;"); //original highlighter only
288  q.setHighlightSnippets(1);
289  q.setHighlightFragsize(SNIPPET_LENGTH);
290 
291  //tune the highlighter
292  q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
293  q.setParam("hl.tag.pre", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
294  q.setParam("hl.tag.post", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
295  q.setParam("hl.fragListBuilder", "simple"); //makes sense for FastVectorHighlighter only NON-NLS
296 
297  //Solr bug if fragCharSize is smaller than Query string, StringIndexOutOfBoundsException is thrown.
298  q.setParam("hl.fragCharSize", Integer.toString(theQueryStr.length())); //makes sense for FastVectorHighlighter only NON-NLS
299 
300  //docs says makes sense for the original Highlighter only, but not really
301  //analyze all content SLOW! consider lowering
302  q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
303  }
304 
305  return q;
306  }
307 
316  private Set<SolrDocument> filterOneHitPerDocument(SolrDocumentList resultList) {
317  // sort the list so that we consistently pick the same chunk each time.
318  // note this sort is doing a string comparison and not an integer comparison, so
319  // chunk 10 will be smaller than chunk 9.
320  Collections.sort(resultList, new Comparator<SolrDocument>() {
321  @Override
322  public int compare(SolrDocument left, SolrDocument right) {
323  // ID is in the form of ObjectId_Chunk
324  String leftID = left.getFieldValue(Server.Schema.ID.toString()).toString();
325  String rightID = right.getFieldValue(Server.Schema.ID.toString()).toString();
326  return leftID.compareTo(rightID);
327  }
328  });
329 
330  // NOTE: We could probably just iterate through the list and compare each ID with the
331  // previous ID to get the unique documents faster than using this set now that the list
332  // is sorted.
333  Set<SolrDocument> solrDocumentsWithMatches = new TreeSet<>(new SolrDocumentComparatorIgnoresChunkId());
334  solrDocumentsWithMatches.addAll(resultList);
335  return solrDocumentsWithMatches;
336  }
337 
338  private KeywordHit createKeywordtHit(SolrDocument solrDoc, Map<String, Map<String, List<String>>> highlightResponse, SleuthkitCase caseDb) throws TskException {
343  final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
344  String snippet = "";
345  if (KeywordSearchSettings.getShowSnippets()) {
346  List<String> snippetList = highlightResponse.get(docId).get(Server.Schema.TEXT.toString());
347  // list is null if there wasn't a snippet
348  if (snippetList != null) {
349  snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
350  }
351  }
352  return new KeywordHit(docId, snippet);
353  }
354 
369  public static String querySnippet(String query, long solrObjectId, boolean isRegex, boolean group) throws NoOpenCoreException {
370  return querySnippet(query, solrObjectId, 0, isRegex, group);
371  }
372 
388  public static String querySnippet(String query, long solrObjectId, int chunkID, boolean isRegex, boolean group) throws NoOpenCoreException {
389  Server solrServer = KeywordSearch.getServer();
390 
391  String highlightField;
392  if (isRegex) {
393  highlightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
394  } else {
395  highlightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL;
396  }
397 
398  SolrQuery q = new SolrQuery();
399 
400  String queryStr;
401 
402  if (isRegex) {
403  StringBuilder sb = new StringBuilder();
404  sb.append(highlightField).append(":");
405  if (group) {
406  sb.append("\"");
407  }
408  sb.append(query);
409  if (group) {
410  sb.append("\"");
411  }
412 
413  queryStr = sb.toString();
414  } else {
415  //simplify query/escaping and use default field
416  //always force grouping/quotes
417  queryStr = KeywordSearchUtil.quoteQuery(query);
418  }
419 
420  q.setQuery(queryStr);
421 
422  String contentIDStr;
423 
424  if (chunkID == 0) {
425  contentIDStr = Long.toString(solrObjectId);
426  } else {
427  contentIDStr = Server.getChunkIdString(solrObjectId, chunkID);
428  }
429 
430  String idQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIDStr);
431  q.setShowDebugInfo(DEBUG); //debug
432  q.addFilterQuery(idQuery);
433  q.addHighlightField(highlightField);
434  //q.setHighlightSimplePre("&laquo;"); //original highlighter only
435  //q.setHighlightSimplePost("&raquo;"); //original highlighter only
436  q.setHighlightSnippets(1);
437  q.setHighlightFragsize(SNIPPET_LENGTH);
438 
439  //tune the highlighter
440  q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
441  q.setParam("hl.tag.pre", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
442  q.setParam("hl.tag.post", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
443  q.setParam("hl.fragListBuilder", "simple"); //makes sense for FastVectorHighlighter only NON-NLS
444 
445  //Solr bug if fragCharSize is smaller than Query string, StringIndexOutOfBoundsException is thrown.
446  q.setParam("hl.fragCharSize", Integer.toString(queryStr.length())); //makes sense for FastVectorHighlighter only NON-NLS
447 
448  //docs says makes sense for the original Highlighter only, but not really
449  //analyze all content SLOW! consider lowering
450  q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
451 
452  try {
453  QueryResponse response = solrServer.query(q, METHOD.POST);
454  Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
455  Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIDStr);
456  if (responseHighlightID == null) {
457  return "";
458  }
459  List<String> contentHighlights = responseHighlightID.get(highlightField);
460  if (contentHighlights == null) {
461  return "";
462  } else {
463  // extracted content is HTML-escaped, but snippet goes in a plain text field
464  return EscapeUtil.unEscapeHtml(contentHighlights.get(0)).trim();
465  }
466  } catch (NoOpenCoreException ex) {
467  logger.log(Level.WARNING, "Error executing Lucene Solr Query: " + query, ex); //NON-NLS
468  throw ex;
469  } catch (KeywordSearchModuleException ex) {
470  logger.log(Level.WARNING, "Error executing Lucene Solr Query: " + query, ex); //NON-NLS
471  return "";
472  }
473  }
474 
475  @Override
476  public KeywordList getKeywordList() {
477  return keywordList;
478  }
479 
484  private class SolrDocumentComparatorIgnoresChunkId implements Comparator<SolrDocument> {
485 
486  @Override
487  public int compare(SolrDocument left, SolrDocument right) {
488  // ID is in the form of ObjectId_Chunk
489 
490  final String idName = Server.Schema.ID.toString();
491 
492  // get object id of left doc
493  String leftID = left.getFieldValue(idName).toString();
494  int index = leftID.indexOf(Server.CHUNK_ID_SEPARATOR);
495  if (index != -1) {
496  leftID = leftID.substring(0, index);
497  }
498 
499  // get object id of right doc
500  String rightID = right.getFieldValue(idName).toString();
501  index = rightID.indexOf(Server.CHUNK_ID_SEPARATOR);
502  if (index != -1) {
503  rightID = rightID.substring(0, index);
504  }
505 
506  Long leftLong = new Long(leftID);
507  Long rightLong = new Long(rightID);
508  return leftLong.compareTo(rightLong);
509  }
510  }
511 
512 }

Copyright © 2012-2016 Basis Technology. Generated on: Tue Oct 25 2016
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.