Autopsy  4.20.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
FileReaderExtractedText.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2023 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.keywordsearch;
20 
21 import com.google.common.io.CharSource;
22 import java.io.BufferedReader;
23 import java.io.IOException;
24 import java.io.Reader;
25 import java.util.HashMap;
26 import java.util.Map;
27 import java.util.logging.Level;
28 import org.openide.util.NbBundle;
33 import org.sleuthkit.datamodel.AbstractFile;
34 
43 class FileReaderExtractedText implements ExtractedText {
44 
45  private int numPages = 0;
46  private int currentPage = 0;
47  private final AbstractFile abstractFile;
48  private Chunker chunker = null;
49  private static final Logger logger = Logger.getLogger(FileReaderExtractedText.class.getName());
50 
56  FileReaderExtractedText(AbstractFile file) throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
57  this.abstractFile = file;
58  this.numPages = -1; // We don't know how many pages there are until we reach end of the document
59 
60  TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
61 
62  Map<String, String> extractedMetadata = new HashMap<>();
63  Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata);
64 
65  //Get a reader for the content of the given source
66  BufferedReader reader = new BufferedReader(sourceReader);
67  this.chunker = new Chunker(reader);
68  }
69 
70  @Override
71  public int getCurrentPage() {
72  return this.currentPage;
73  }
74 
75  @Override
76  public boolean hasNextPage() {
77  if (chunker.hasNext()) {
78  return true;
79  }
80  return false;
81  }
82 
83  @Override
84  public boolean hasPreviousPage() {
85  return false;
86  }
87 
88  @Override
89  public int nextPage() {
90  if (!hasNextPage()) {
91  throw new IllegalStateException(
92  NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextPage.exception.msg"));
93  }
94  ++currentPage;
95  return currentPage;
96  }
97 
98  @Override
99  public int previousPage() {
100  if (!hasPreviousPage()) {
101  throw new IllegalStateException(
102  NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousPage.exception.msg"));
103  }
104  --currentPage;
105  return currentPage;
106  }
107 
108  @Override
109  public boolean hasNextItem() {
110  throw new UnsupportedOperationException(
111  NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasNextItem.exception.msg"));
112  }
113 
114  @Override
115  public boolean hasPreviousItem() {
116  throw new UnsupportedOperationException(
117  NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.hasPreviousItem.exception.msg"));
118  }
119 
120  @Override
121  public int nextItem() {
122  throw new UnsupportedOperationException(
123  NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.nextItem.exception.msg"));
124  }
125 
126  @Override
127  public int previousItem() {
128  throw new UnsupportedOperationException(
129  NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.previousItem.exception.msg"));
130  }
131 
132  @Override
133  public int currentItem() {
134  throw new UnsupportedOperationException(
135  NbBundle.getMessage(this.getClass(), "ExtractedContentViewer.currentItem.exception.msg"));
136  }
137 
138  @Override
139  public String getText() {
140  try {
141  return getContentText(currentPage);
142  } catch (Exception ex) {
143  logger.log(Level.SEVERE, "Couldn't get extracted text", ex); //NON-NLS
144  }
145  return Bundle.ExtractedText_errorMessage_errorGettingText();
146  }
147 
148  @NbBundle.Messages({
149  "ExtractedText.FileText=File Text"})
150  @Override
151  public String toString() {
152  return Bundle.ExtractedText_FileText();
153  }
154 
155  @Override
156  public boolean isSearchable() {
157  return false;
158  }
159 
160  @Override
161  public String getAnchorPrefix() {
162  return "";
163  }
164 
165  @Override
166  public int getNumberHits() {
167  return 0;
168  }
169 
170  @Override
171  public int getNumberPages() {
172  return numPages;
173  }
174 
182  private String getContentText(int currentPage) throws TextExtractor.InitReaderException, IOException, Exception {
183  String indexedText;
184  if (chunker.hasNext()) {
185  Chunker.Chunk chunk = chunker.next();
186  chunk.setChunkId(currentPage);
187 
188  if (chunker.hasException()) {
189  logger.log(Level.WARNING, "Error chunking content from " + abstractFile.getId() + ": " + abstractFile.getName(), chunker.getException());
190  throw chunker.getException();
191  }
192 
193  indexedText = chunk.toString();
194  } else {
195  return Bundle.ExtractedText_errorMessage_errorGettingText();
196  }
197 
198  indexedText = EscapeUtil.escapeHtml(indexedText).trim();
199  StringBuilder sb = new StringBuilder(indexedText.length() + 20);
200  sb.append("<pre>").append(indexedText).append("</pre>"); //NON-NLS
201  return sb.toString();
202  }
203 
204  private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile,
205  Map<String, String> extractedMetadata) throws TextExtractor.InitReaderException {
206 
207  Reader fileText = extractor.getReader();
208  Reader finalReader;
209  try {
210  Map<String, String> metadata = extractor.getMetadata();
211  if (!metadata.isEmpty()) {
212  // save the metadata map to use after this method is complete.
213  extractedMetadata.putAll(metadata);
214  }
215  CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
216  //Append the metadata to end of the file text
217  finalReader = CharSource.concat(new CharSource() {
218  //Wrap fileText reader for concatenation
219  @Override
220  public Reader openStream() throws IOException {
221  return fileText;
222  }
223  }, formattedMetadata).openStream();
224  } catch (IOException ex) {
225  logger.log(Level.WARNING, String.format("Could not format extracted metadata for file %s [id=%d]",
226  aFile.getName(), aFile.getId()), ex);
227  //Just send file text.
228  finalReader = fileText;
229  }
230  //divide into chunks
231  return finalReader;
232  }
233 
234 }

Copyright © 2012-2022 Basis Technology. Generated on: Tue Aug 1 2023
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.