Autopsy  3.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
LuceneQuery.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2011-2015 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import java.util.ArrayList;
22 import java.util.Collection;
23 import java.util.Collections;
24 import java.util.Comparator;
25 import java.util.List;
26 import java.util.Map;
27 import java.util.Set;
28 import java.util.TreeSet;
29 import java.util.logging.Level;
31 import org.apache.solr.client.solrj.SolrQuery;
32 import org.apache.solr.client.solrj.SolrRequest.METHOD;
33 import org.apache.solr.client.solrj.response.QueryResponse;
34 import org.apache.solr.common.SolrDocument;
35 import org.apache.solr.common.SolrDocumentList;
46 
51 class LuceneQuery implements KeywordSearchQuery {
52 
53  private static final Logger logger = Logger.getLogger(LuceneQuery.class.getName());
54  private final String keywordString; //original unescaped query
55  private String keywordStringEscaped;
56  private boolean isEscaped;
57  private Keyword keywordQuery = null;
58  private KeywordList keywordList = null;
59  private final List<KeywordQueryFilter> filters = new ArrayList<>();
60  private String field = null;
61  private static final int MAX_RESULTS = 20000;
62  static final int SNIPPET_LENGTH = 50;
63  //can use different highlight schema fields for regex and literal search
64  static final String HIGHLIGHT_FIELD_LITERAL = Server.Schema.TEXT.toString();
65  static final String HIGHLIGHT_FIELD_REGEX = Server.Schema.TEXT.toString();
66  //TODO use content_ws stored="true" in solr schema for perfect highlight hits
67  //static final String HIGHLIGHT_FIELD_REGEX = Server.Schema.CONTENT_WS.toString()
68 
69  private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);
70 
76  public LuceneQuery(KeywordList keywordList, Keyword keywordQuery) {
77  this.keywordList = keywordList;
78  this.keywordQuery = keywordQuery;
79 
80  // @@@ BC: Long-term, we should try to get rid of this string and use only the
81  // keyword object. Refactoring did not make its way through this yet.
82  this.keywordString = keywordQuery.getQuery();
83  this.keywordStringEscaped = this.keywordString;
84  }
85 
86  @Override
87  public void addFilter(KeywordQueryFilter filter) {
88  this.filters.add(filter);
89  }
90 
91  @Override
92  public void setField(String field) {
93  this.field = field;
94  }
95 
96  @Override
97  public void setSubstringQuery() {
98  // Note that this is not a full substring search. Normally substring
99  // searches will be done with TermComponentQuery objects instead.
100  keywordStringEscaped = keywordStringEscaped + "*";
101  }
102 
103  @Override
104  public void escape() {
105  keywordStringEscaped = KeywordSearchUtil.escapeLuceneQuery(keywordString);
106  isEscaped = true;
107  }
108 
109  @Override
110  public boolean isEscaped() {
111  return isEscaped;
112  }
113 
114  @Override
115  public boolean isLiteral() {
116  return true;
117  }
118 
119  @Override
120  public String getEscapedQueryString() {
121  return this.keywordStringEscaped;
122  }
123 
124  @Override
125  public String getQueryString() {
126  return this.keywordString;
127  }
128 
129  @Override
130  public QueryResults performQuery() throws NoOpenCoreException {
131  QueryResults results = new QueryResults(this, keywordList);
132  //in case of single term literal query there is only 1 term
133  boolean showSnippets = KeywordSearchSettings.getShowSnippets();
134  results.addResult(new Keyword(keywordString, true), performLuceneQuery(showSnippets));
135 
136  return results;
137  }
138 
139  @Override
140  public boolean validate() {
141  return keywordString != null && !keywordString.equals("");
142  }
143 
144  @Override
145  public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(String termHit, KeywordHit hit, String snippet, String listName) {
146  final String MODULE_NAME = KeywordSearchModuleFactory.getModuleName();
147 
148  Collection<BlackboardAttribute> attributes = new ArrayList<>();
149  BlackboardArtifact bba;
150  KeywordCachedArtifact writeResult;
151  try {
152  bba = hit.getContent().newArtifact(ARTIFACT_TYPE.TSK_KEYWORD_HIT);
153  writeResult = new KeywordCachedArtifact(bba);
154  } catch (Exception e) {
155  logger.log(Level.WARNING, "Error adding bb artifact for keyword hit", e); //NON-NLS
156  return null;
157  }
158 
159  if (snippet != null) {
160  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW.getTypeID(), MODULE_NAME, snippet));
161  }
162  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD.getTypeID(), MODULE_NAME, termHit));
163  if ((listName != null) && (listName.equals("") == false)) {
164  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_SET_NAME.getTypeID(), MODULE_NAME, listName));
165  }
166 
167  //bogus - workaround the dir tree table issue
168  //attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP.getTypeID(), MODULE_NAME, "", ""));
169  //selector
170  if (keywordQuery != null) {
171  BlackboardAttribute.ATTRIBUTE_TYPE selType = keywordQuery.getType();
172  if (selType != null) {
173  attributes.add(new BlackboardAttribute(selType.getTypeID(), MODULE_NAME, termHit));
174  }
175  }
176 
177  if (hit.isArtifactHit()) {
178  attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT.getTypeID(), MODULE_NAME, hit.getArtifact().getArtifactID()));
179  }
180 
181  try {
182  bba.addAttributes(attributes); //write out to bb
183  writeResult.add(attributes);
184  return writeResult;
185  } catch (TskException e) {
186  logger.log(Level.WARNING, "Error adding bb attributes to artifact", e); //NON-NLS
187  }
188  return null;
189  }
190 
199  private List<KeywordHit> performLuceneQuery(boolean snippets) throws NoOpenCoreException {
200  List<KeywordHit> matches = new ArrayList<>();
201  boolean allMatchesFetched = false;
202  final Server solrServer = KeywordSearch.getServer();
203 
204  SolrQuery q = createAndConfigureSolrQuery(snippets);
205 
206  // cycle through results in sets of MAX_RESULTS
207  for (int start = 0; !allMatchesFetched; start = start + MAX_RESULTS) {
208  q.setStart(start);
209 
210  try {
211  QueryResponse response = solrServer.query(q, METHOD.POST);
212  SolrDocumentList resultList = response.getResults();
213 
214  // objectId_chunk -> "text" -> List of previews
215  Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();
216 
217  // get the unique set of files with hits
218  Set<SolrDocument> uniqueSolrDocumentsWithHits = filterOneHitPerDocument(resultList);
219 
220  allMatchesFetched = start + MAX_RESULTS >= resultList.getNumFound();
221 
222  SleuthkitCase sleuthkitCase;
223  try {
224  sleuthkitCase = Case.getCurrentCase().getSleuthkitCase();
225  } catch (IllegalStateException ex) {
226  //no case open, must be just closed
227  return matches;
228  }
229 
230  for (SolrDocument resultDoc : uniqueSolrDocumentsWithHits) {
231  KeywordHit contentHit;
232  try {
233  contentHit = createKeywordtHit(resultDoc, highlightResponse, sleuthkitCase);
234  } catch (TskException ex) {
235  return matches;
236  }
237  matches.add(contentHit);
238  }
239 
240  } catch (NoOpenCoreException ex) {
241  logger.log(Level.WARNING, "Error executing Lucene Solr Query: " + keywordString, ex); //NON-NLS
242  throw ex;
243  } catch (KeywordSearchModuleException ex) {
244  logger.log(Level.WARNING, "Error executing Lucene Solr Query: " + keywordString, ex); //NON-NLS
245  }
246 
247  }
248  return matches;
249  }
250 
257  private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
258  SolrQuery q = new SolrQuery();
259  q.setShowDebugInfo(DEBUG); //debug
260  //set query, force quotes/grouping around all literal queries
261  final String groupedQuery = KeywordSearchUtil.quoteQuery(keywordStringEscaped);
262  String theQueryStr = groupedQuery;
263  if (field != null) {
264  //use the optional field
265  StringBuilder sb = new StringBuilder();
266  sb.append(field).append(":").append(groupedQuery);
267  theQueryStr = sb.toString();
268  }
269  q.setQuery(theQueryStr);
270  q.setRows(MAX_RESULTS);
271 
272  q.setFields(Server.Schema.ID.toString());
273 
274  for (KeywordQueryFilter filter : filters) {
275  q.addFilterQuery(filter.toString());
276  }
277 
278  if (snippets) {
279  q.addHighlightField(Server.Schema.TEXT.toString());
280  //q.setHighlightSimplePre("&laquo;"); //original highlighter only
281  //q.setHighlightSimplePost("&raquo;"); //original highlighter only
282  q.setHighlightSnippets(1);
283  q.setHighlightFragsize(SNIPPET_LENGTH);
284 
285  //tune the highlighter
286  q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
287  q.setParam("hl.tag.pre", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
288  q.setParam("hl.tag.post", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
289  q.setParam("hl.fragListBuilder", "simple"); //makes sense for FastVectorHighlighter only NON-NLS
290 
291  //Solr bug if fragCharSize is smaller than Query string, StringIndexOutOfBoundsException is thrown.
292  q.setParam("hl.fragCharSize", Integer.toString(theQueryStr.length())); //makes sense for FastVectorHighlighter only NON-NLS
293 
294  //docs says makes sense for the original Highlighter only, but not really
295  //analyze all content SLOW! consider lowering
296  q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
297  }
298 
299  return q;
300  }
301 
309  private Set<SolrDocument> filterOneHitPerDocument(SolrDocumentList resultList) {
310  // sort the list so that we consistently pick the same chunk each time.
311  // note this sort is doing a string comparison and not an integer comparison, so
312  // chunk 10 will be smaller than chunk 9.
313  Collections.sort(resultList, new Comparator<SolrDocument>() {
314  @Override
315  public int compare(SolrDocument left, SolrDocument right) {
316  // ID is in the form of ObjectId_Chunk
317  String leftID = left.getFieldValue(Server.Schema.ID.toString()).toString();
318  String rightID = right.getFieldValue(Server.Schema.ID.toString()).toString();
319  return leftID.compareTo(rightID);
320  }
321  });
322 
323  // NOTE: We could probably just iterate through the list and compare each ID with the
324  // previous ID to get the unique documents faster than using this set now that the list
325  // is sorted.
326 
327  Set<SolrDocument> solrDocumentsWithMatches = new TreeSet<>(new SolrDocumentComparatorIgnoresChunkId());
328  solrDocumentsWithMatches.addAll(resultList);
329  return solrDocumentsWithMatches;
330  }
331 
332  private KeywordHit createKeywordtHit(SolrDocument solrDoc, Map<String, Map<String, List<String>>> highlightResponse, SleuthkitCase caseDb) throws TskException {
337  final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();
338  String snippet = "";
339  if (KeywordSearchSettings.getShowSnippets()) {
340  List<String> snippetList = highlightResponse.get(docId).get(Server.Schema.TEXT.toString());
341  // list is null if there wasn't a snippet
342  if (snippetList != null) {
343  snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
344  }
345  }
346  return new KeywordHit(docId, snippet);
347  }
348 
361  public static String querySnippet(String query, long solrObjectId, boolean isRegex, boolean group) throws NoOpenCoreException {
362  return querySnippet(query, solrObjectId, 0, isRegex, group);
363  }
364 
379  public static String querySnippet(String query, long solrObjectId, int chunkID, boolean isRegex, boolean group) throws NoOpenCoreException {
380  Server solrServer = KeywordSearch.getServer();
381 
382  String highlightField;
383  if (isRegex) {
384  highlightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
385  } else {
386  highlightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL;
387  }
388 
389  SolrQuery q = new SolrQuery();
390 
391  String queryStr;
392 
393  if (isRegex) {
394  StringBuilder sb = new StringBuilder();
395  sb.append(highlightField).append(":");
396  if (group) {
397  sb.append("\"");
398  }
399  sb.append(query);
400  if (group) {
401  sb.append("\"");
402  }
403 
404  queryStr = sb.toString();
405  } else {
406  //simplify query/escaping and use default field
407  //always force grouping/quotes
408  queryStr = KeywordSearchUtil.quoteQuery(query);
409  }
410 
411  q.setQuery(queryStr);
412 
413  String contentIDStr;
414 
415  if (chunkID == 0) {
416  contentIDStr = Long.toString(solrObjectId);
417  } else {
418  contentIDStr = Server.getChunkIdString(solrObjectId, chunkID);
419  }
420 
421  String idQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIDStr);
422  q.setShowDebugInfo(DEBUG); //debug
423  q.addFilterQuery(idQuery);
424  q.addHighlightField(highlightField);
425  //q.setHighlightSimplePre("&laquo;"); //original highlighter only
426  //q.setHighlightSimplePost("&raquo;"); //original highlighter only
427  q.setHighlightSnippets(1);
428  q.setHighlightFragsize(SNIPPET_LENGTH);
429 
430  //tune the highlighter
431  q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
432  q.setParam("hl.tag.pre", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
433  q.setParam("hl.tag.post", "&laquo;"); //makes sense for FastVectorHighlighter only NON-NLS
434  q.setParam("hl.fragListBuilder", "simple"); //makes sense for FastVectorHighlighter only NON-NLS
435 
436  //Solr bug if fragCharSize is smaller than Query string, StringIndexOutOfBoundsException is thrown.
437  q.setParam("hl.fragCharSize", Integer.toString(queryStr.length())); //makes sense for FastVectorHighlighter only NON-NLS
438 
439  //docs says makes sense for the original Highlighter only, but not really
440  //analyze all content SLOW! consider lowering
441  q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
442 
443  try {
444  QueryResponse response = solrServer.query(q, METHOD.POST);
445  Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
446  Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIDStr);
447  if (responseHighlightID == null) {
448  return "";
449  }
450  List<String> contentHighlights = responseHighlightID.get(highlightField);
451  if (contentHighlights == null) {
452  return "";
453  } else {
454  // extracted content is HTML-escaped, but snippet goes in a plain text field
455  return EscapeUtil.unEscapeHtml(contentHighlights.get(0)).trim();
456  }
457  } catch (NoOpenCoreException ex) {
458  logger.log(Level.WARNING, "Error executing Lucene Solr Query: " + query, ex); //NON-NLS
459  throw ex;
460  } catch (KeywordSearchModuleException ex) {
461  logger.log(Level.WARNING, "Error executing Lucene Solr Query: " + query, ex); //NON-NLS
462  return "";
463  }
464  }
465 
466  @Override
467  public KeywordList getKeywordList() {
468  return keywordList;
469  }
470 
475  private class SolrDocumentComparatorIgnoresChunkId implements Comparator<SolrDocument> {
476 
477  @Override
478  public int compare(SolrDocument left, SolrDocument right) {
479  // ID is in the form of ObjectId_Chunk
480 
481  final String idName = Server.Schema.ID.toString();
482 
483  // get object id of left doc
484  String leftID = left.getFieldValue(idName).toString();
485  int index = leftID.indexOf(Server.ID_CHUNK_SEP);
486  if (index != -1) {
487  leftID = leftID.substring(0, index);
488  }
489 
490  // get object id of right doc
491  String rightID = right.getFieldValue(idName).toString();
492  index = rightID.indexOf(Server.ID_CHUNK_SEP);
493  if (index != -1) {
494  rightID = rightID.substring(0, index);
495  }
496 
497  Integer leftInt = new Integer(leftID);
498  Integer rightInt = new Integer(rightID);
499  return leftInt.compareTo(rightInt);
500  }
501  }
502 
503 }

Copyright © 2012-2015 Basis Technology. Generated on: Mon Oct 19 2015
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.