19 package org.sleuthkit.autopsy.keywordsearch;
21 import com.google.common.io.CharSource;
22 import java.io.BufferedReader;
23 import java.io.IOException;
24 import java.io.Reader;
25 import java.util.HashMap;
27 import java.util.logging.Level;
28 import org.openide.util.NbBundle;
43 class FileReaderExtractedText
implements ExtractedText {
// Page count for this text source; initialised to 0 at declaration.
45 private int numPages = 0;
// Page currently shown; returned by getCurrentPage() and handed to getContentText() by getText().
46 private int currentPage = 0;
// File whose extracted text is displayed; assigned once in the constructor.
47 private final AbstractFile abstractFile;
// Supplies the text one chunk at a time via hasNext()/next(); created in the constructor.
48 private Chunker chunker = null;
// Logger shared by all instances of this class.
49 private static final Logger logger = Logger.getLogger(FileReaderExtractedText.class.getName());
/**
 * Creates a text source for the given file: looks up a text extractor for
 * the file, obtains a reader from it through getTikaOrTextExtractor(), and
 * wraps that reader in a buffered reader feeding a {@link Chunker}.
 *
 * @param file the file whose extracted text will be served
 *
 * @throws TextExtractorFactory.NoTextExtractorFound declared here;
 *         presumably raised by TextExtractorFactory.getExtractor() when no
 *         extractor supports the file - TODO confirm
 * @throws TextExtractor.InitReaderException declared by
 *         getTikaOrTextExtractor(), which is called here
 */
56 FileReaderExtractedText(AbstractFile file)
throws TextExtractorFactory.NoTextExtractorFound, TextExtractor.InitReaderException {
57 this.abstractFile = file;
// NOTE(review): the second argument is passed as null - confirm what the factory does with it.
60 TextExtractor extractor = TextExtractorFactory.getExtractor(abstractFile, null);
// Filled by getTikaOrTextExtractor() with any metadata the extractor reports.
// NOTE(review): the map does not appear to be read after that call - confirm it is still needed.
62 Map<String, String> extractedMetadata =
new HashMap<>();
63 Reader sourceReader = getTikaOrTextExtractor(extractor, abstractFile, extractedMetadata);
// Buffer the source reader before handing it to the chunker.
66 BufferedReader reader =
new BufferedReader(sourceReader);
67 this.chunker =
new Chunker(reader);
/**
 * Returns the page this text source is currently positioned on.
 *
 * @return the value of the currentPage field
 */
71 public int getCurrentPage() {
72 return this.currentPage;
/**
 * Reports whether another page is available, decided by asking the chunker
 * whether it has another chunk.
 *
 * @return presumably true exactly when chunker.hasNext() is true - TODO
 *         confirm against the return statements
 */
76 public boolean hasNextPage() {
77 if (chunker.hasNext()) {
/**
 * Reports whether a previous page is available. previousPage() consults this
 * method and throws IllegalStateException when it returns false.
 *
 * @return NOTE(review): TODO confirm the returned value against the
 *         implementation
 */
84 public boolean hasPreviousPage() {
/**
 * Moves to the next page (per the method name - TODO confirm).
 *
 * @return NOTE(review): presumably the new current page number - confirm
 *
 * @throws IllegalStateException carrying the localized message for key
 *         "ExtractedContentViewer.nextPage.exception.msg"; NOTE(review):
 *         presumably thrown only when hasNextPage() is false - confirm the
 *         guard condition
 */
89 public int nextPage() {
91 throw new IllegalStateException(
92 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.nextPage.exception.msg"));
/**
 * Moves to the previous page (per the method name - TODO confirm).
 *
 * @return NOTE(review): presumably the new current page number - confirm
 *
 * @throws IllegalStateException when hasPreviousPage() returns false; the
 *         message is looked up under key
 *         "ExtractedContentViewer.previousPage.exception.msg"
 */
99 public int previousPage() {
// Refuse to move back when there is no previous page.
100 if (!hasPreviousPage()) {
101 throw new IllegalStateException(
102 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.previousPage.exception.msg"));
/**
 * Not supported by this text source: the first statement unconditionally
 * throws.
 *
 * @return never returns normally
 *
 * @throws UnsupportedOperationException always, with the localized message
 *         for key "ExtractedContentViewer.hasNextItem.exception.msg"
 */
109 public boolean hasNextItem() {
110 throw new UnsupportedOperationException(
111 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.hasNextItem.exception.msg"));
/**
 * Not supported by this text source: the first statement unconditionally
 * throws.
 *
 * @return never returns normally
 *
 * @throws UnsupportedOperationException always, with the localized message
 *         for key "ExtractedContentViewer.hasPreviousItem.exception.msg"
 */
115 public boolean hasPreviousItem() {
116 throw new UnsupportedOperationException(
117 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.hasPreviousItem.exception.msg"));
/**
 * Not supported by this text source: the first statement unconditionally
 * throws.
 *
 * @return never returns normally
 *
 * @throws UnsupportedOperationException always, with the localized message
 *         for key "ExtractedContentViewer.nextItem.exception.msg"
 */
121 public int nextItem() {
122 throw new UnsupportedOperationException(
123 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.nextItem.exception.msg"));
/**
 * Not supported by this text source: the first statement unconditionally
 * throws.
 *
 * @return never returns normally
 *
 * @throws UnsupportedOperationException always, with the localized message
 *         for key "ExtractedContentViewer.previousItem.exception.msg"
 */
127 public int previousItem() {
128 throw new UnsupportedOperationException(
129 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.previousItem.exception.msg"));
/**
 * Not supported by this text source: the first statement unconditionally
 * throws.
 *
 * @return never returns normally
 *
 * @throws UnsupportedOperationException always, with the localized message
 *         for key "ExtractedContentViewer.currentItem.exception.msg"
 */
133 public int currentItem() {
134 throw new UnsupportedOperationException(
135 NbBundle.getMessage(
this.getClass(),
"ExtractedContentViewer.currentItem.exception.msg"));
/**
 * Returns the text to display for the current page by delegating to
 * getContentText(currentPage). Any exception from that call is caught here
 * rather than propagated.
 *
 * @return the result of getContentText(), or the localized
 *         error-getting-text message when that call throws
 */
139 public String getText() {
141 return getContentText(currentPage);
142 }
catch (Exception ex) {
// Record the failure, then fall back to the localized error message below.
143 logger.log(Level.SEVERE,
"Couldn't get extracted text", ex);
145 return Bundle.ExtractedText_errorMessage_errorGettingText();
149 "ExtractedText.FileText=File Text"})
/**
 * Returns the localized label for this text source, taken from the bundle
 * entry ExtractedText.FileText.
 *
 * @return the value of Bundle.ExtractedText_FileText()
 */
151 public String toString() {
152 return Bundle.ExtractedText_FileText();
/**
 * Presumably reports whether this text source can be searched - TODO confirm
 * against the implementation.
 *
 * @return NOTE(review): confirm the returned value
 */
156 public boolean isSearchable() {
/**
 * Presumably returns the prefix used for anchors in the rendered text - TODO
 * confirm against the implementation.
 *
 * @return NOTE(review): confirm the returned value
 */
161 public String getAnchorPrefix() {
/**
 * Presumably reports the number of search hits in this text source - TODO
 * confirm against the implementation.
 *
 * @return NOTE(review): confirm the returned value
 */
166 public int getNumberHits() {
/**
 * Presumably reports the number of pages, likely from the numPages field -
 * TODO confirm against the implementation.
 *
 * @return NOTE(review): confirm the returned value
 */
171 public int getNumberPages() {
/**
 * Takes the next chunk from the chunker and renders it as HTML-escaped,
 * trimmed text wrapped in a {@code <pre>} element.
 *
 * @param currentPage page number recorded on the chunk as its chunk id
 *
 * @return the escaped chunk text between {@code <pre>} and {@code </pre>},
 *         or the localized error-getting-text message (see the note on that
 *         return below)
 *
 * @throws TextExtractor.InitReaderException declared by this method
 * @throws IOException declared by this method
 * @throws Exception the exception recorded by the chunker, rethrown after
 *         it has been logged
 */
182 private String getContentText(
int currentPage)
throws TextExtractor.InitReaderException, IOException, Exception {
// Each call consumes the next chunk from the chunker.
184 if (chunker.hasNext()) {
185 Chunker.Chunk chunk = chunker.next();
186 chunk.setChunkId(currentPage);
// If the chunker recorded an error, log it with the file id and name, then rethrow it.
188 if (chunker.hasException()) {
189 logger.log(Level.WARNING,
"Error chunking content from " + abstractFile.getId() +
": " + abstractFile.getName(), chunker.getException());
190 throw chunker.getException();
193 indexedText = chunk.toString();
// NOTE(review): presumably the branch taken when the chunker has no further chunk - confirm.
195 return Bundle.ExtractedText_errorMessage_errorGettingText();
// Escape HTML in the chunk text, then trim surrounding whitespace.
198 indexedText = EscapeUtil.escapeHtml(indexedText).trim();
// Capacity is the text length plus 20 extra characters, enough for the two tags.
199 StringBuilder sb =
new StringBuilder(indexedText.length() + 20);
200 sb.append(
"<pre>").append(indexedText).append(
"</pre>");
201 return sb.toString();
204 private Reader getTikaOrTextExtractor(TextExtractor extractor, AbstractFile aFile,
205 Map<String, String> extractedMetadata)
throws TextExtractor.InitReaderException {
207 Reader fileText = extractor.getReader();
210 Map<String, String> metadata = extractor.getMetadata();
211 if (!metadata.isEmpty()) {
213 extractedMetadata.putAll(metadata);
215 CharSource formattedMetadata = KeywordSearchIngestModule.getMetaDataCharSource(metadata);
217 finalReader = CharSource.concat(
new CharSource() {
220 public Reader openStream() throws IOException {
223 }, formattedMetadata).openStream();
224 }
catch (IOException ex) {
225 logger.log(Level.WARNING, String.format(
"Could not format extracted metadata for file %s [id=%d]",
226 aFile.getName(), aFile.getId()), ex);
228 finalReader = fileText;