Autopsy  4.16.0
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringExtract.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.coreutils;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import static java.lang.Byte.toUnsignedInt;
24 import java.util.ArrayList;
25 import java.util.Arrays;
26 import java.util.List;
27 import java.util.Properties;
28 import java.util.StringTokenizer;
29 import java.util.logging.Level;
30 
31 import org.openide.util.NbBundle;
33 
43 public class StringExtract {
44 
45  private static final Logger logger = Logger.getLogger(StringExtract.class.getName());
49  public static final int MIN_CHARS_STRING = 4;
54  private List<SCRIPT> enabledScripts;
55  private boolean enableUTF8;
56  private boolean enableUTF16;
57 
58  //stored and reused results
62 
66  private static final List<SCRIPT> SUPPORTED_SCRIPTS
67  = Arrays.asList(
68  SCRIPT.LATIN_1, SCRIPT.LATIN_2, SCRIPT.ARABIC, SCRIPT.CYRILLIC, SCRIPT.HAN,
69  SCRIPT.HIRAGANA, SCRIPT.KATAKANA, SCRIPT.HANGUL,
70  SCRIPT.ARMENIAN, SCRIPT.BENGALI, SCRIPT.KHMER, SCRIPT.ETHIOPIC,
71  SCRIPT.GEORGIAN, SCRIPT.HEBREW, SCRIPT.LAO, SCRIPT.MONGOLIAN, SCRIPT.THAI, SCRIPT.TIBETAN);
72  //current total string buffer, reuse for performance
73  private final StringBuilder curString = new StringBuilder();
74 
/**
 * Creates an extractor with UTF-8 and UTF-16 extraction switched on and
 * every supported script enabled.
 *
 * @throws IllegalStateException if the Unicode script table could not be
 *                               obtained
 */
public StringExtract() {
    unicodeTable = StringExtractUnicodeTable.getInstance();

    if (unicodeTable == null) {
        throw new IllegalStateException(
                NbBundle.getMessage(StringExtract.class, "StringExtract.illegalStateException.cannotInit.msg"));
    }

    // Give enabledScripts a usable default; the script checks dereference it
    // unconditionally and would otherwise fail on a freshly built extractor.
    setEnabledScripts(SUPPORTED_SCRIPTS);
    enableUTF8 = true;
    enableUTF16 = true;
}
91 
92  public boolean isEnableUTF8() {
93  return enableUTF8;
94  }
95 
96  public void setEnableUTF8(boolean enableUTF8) {
97  this.enableUTF8 = enableUTF8;
98  }
99 
100  public boolean isEnableUTF16() {
101  return enableUTF16;
102  }
103 
104  public void setEnableUTF16(boolean enableUTF16) {
105  this.enableUTF16 = enableUTF16;
106  }
107 
113  public final void setEnabledScripts(List<SCRIPT> scripts) {
114  this.enabledScripts = scripts;
115  }
116 
122  public final void setEnabledScript(SCRIPT script) {
123  this.enabledScripts = new ArrayList<>();
124  this.enabledScripts.add(script);
125  }
126 
134  public static boolean isExtractionSupported(SCRIPT script) {
135  return SUPPORTED_SCRIPTS.contains(script);
136  }
137 
147  public boolean isExtractionEnabled(SCRIPT script) {
148  if (script.equals(SCRIPT.LATIN_1)) {
149  return enabledScripts.contains(SCRIPT.LATIN_1)
150  || enabledScripts.contains(SCRIPT.LATIN_2);
151  } else {
152  return enabledScripts.contains(script);
153  }
154 
155  }
156 
162  public boolean isExtractionLatinBasicOnly() {
163  return enabledScripts.size() == 1
164  && enabledScripts.get(0).equals(SCRIPT.LATIN_1);
165  }
166 
167  public static List<SCRIPT> getSupportedScripts() {
168  return SUPPORTED_SCRIPTS;
169  }
170 
181  public StringExtractResult extract(byte[] buff, int len, int offset) {
182  if (this.enableUTF16 == false && this.enableUTF8 == false) {
183  return new StringExtractResult();
184  }
185 
186  final int buffLen = buff.length;
187 
188  int processedBytes = 0;
189  int curOffset = offset;
190  int startOffset = offset;
191  int curStringLen = 0;
192 
193  //reset curString buffer
194  curString.delete(0, curString.length());
195 
196  //keep track of first byte offset that hasn't been processed
197  //(one byte past the last byte processed in by last extraction)
198  int firstUnprocessedOff = offset;
199 
200  while (curOffset < buffLen) {
201  //shortcut, skip processing empty bytes
202  if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
203  curOffset += 2;
204  continue;
205  }
206 
207  //extract using all methods and see which one wins
208  StringExtractResult resUTF16 = null;
209  boolean runUTF16 = false;
210  if (enableUTF16 && curOffset % 2 == 0) {
211  runUTF16 = true;
212  extractUTF16(buff, len, curOffset, true, resUTF16En1);
213  extractUTF16(buff, len, curOffset, false, resUTF16En2);
214  resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 : resUTF16En2;
215  }
216 
217  if (enableUTF8) {
218  extractUTF8(buff, len, curOffset, resUTF8);
219  }
220 
221  StringExtractResult resWin = null;
222  if (enableUTF8 && resUTF16 != null) {
223  resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 : resUTF8;
224  } else if (runUTF16) {
225  //Only let resUTF16 "win" if it was actually run.
226  resWin = resUTF16;
227  } else if (enableUTF8) {
228  resWin = resUTF8;
229  }
230 
231  if (resWin != null && resWin.numChars >= MIN_CHARS_STRING) {
232  //record string
233  if (startOffset == offset) {
234  //advance start offset where first string starts it hasn't been advanced
235  startOffset = resWin.offset;
236  }
237  curStringLen += resWin.numChars;
238  curString.append(resWin.textString);
239  curString.append("\n");
240  curStringLen += resWin.numChars + 1;
241 
242  //advance
243  curOffset += resWin.numBytes;
244  processedBytes += resWin.numBytes;
245  firstUnprocessedOff = resWin.offset + resWin.numBytes;
246  } else {
247  //if no encodings worked, advance byte
248  if (enableUTF8 == false) {
249  curOffset += 2;
250  } else {
251  ++curOffset;
252  }
253  }
254  }
255 
256  //build up the final result
258  res.numBytes = processedBytes;
259  res.numChars = curStringLen;
260  res.offset = startOffset;
261  res.textString = curString.toString();
262  res.firstUnprocessedOff = firstUnprocessedOff; //save that of the last winning result
263 
264  return res;
265  }
266 
267  private StringExtractResult extractUTF16(byte[] buff, int len, int offset, boolean endianSwap, final StringExtractResult res) {
268  res.reset();
269 
270  int curOffset = offset;
271 
272  final StringBuilder tempString = new StringBuilder();
273 
274  SCRIPT currentScript = SCRIPT.NONE;
275 
276  //while we have 2 byte chunks
277  while (curOffset < len - 1) {
278  int msb, lsb;
279 
280  if (endianSwap) {
281  msb = toUnsignedInt(buff[curOffset++]);
282  lsb = toUnsignedInt(buff[curOffset++]);
283  }
284  else {
285  lsb = toUnsignedInt(buff[curOffset++]);
286  msb = toUnsignedInt(buff[curOffset++]);
287  }
288 
289  //convert the byte sequence to 2 byte char
290  char byteVal = (char) msb;
291  byteVal = (char) (byteVal << 8);
292  byteVal += lsb;
293 
294  //skip if beyond range
295  if (byteVal > StringExtractUnicodeTable.UNICODE_TABLE_SIZE - 1) {
296  break;
297  }
298 
299  //lookup byteVal in the unicode table
300  SCRIPT scriptFound = unicodeTable.getScript(byteVal);
301 
302  if (scriptFound == SCRIPT.NONE) {
303  break;
304  }
305 
306  /*
307  * else if (scriptFound == SCRIPT.CONTROL) { //update bytes
308  * processed res.numBytes += 2; continue; } else if (inControl) {
309  * break;
310  }
311  */
312  final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
313  //allow generic and one of enabled scripts we locked in to
314  if (isGeneric
315  || isExtractionEnabled(scriptFound)) {
316 
317  if (currentScript == SCRIPT.NONE
318  && !isGeneric) {
319  //handle case when this is the first char in the string
320  //lock into the script
321  currentScript = scriptFound;
322  }
323  //check if we are within the same script we are locked on to, or COMMON
324  if (currentScript == scriptFound
325  || isGeneric) {
326  if (res.numChars == 0) {
327  //set the start offset of the string
328  res.offset = curOffset;
329  }
330  //update bytes processed
331  res.numBytes += 2;
332  //append the char
333  ++res.numChars;
334  tempString.append(byteVal);
335  } else {
336  //bail out
337  break;
338  }
339  } else {
340  //bail out
341  break;
342  }
343 
344  } //no more data
345 
346  res.textString = tempString.toString();
347 
348  return res;
349  }
350 
351  private StringExtractResult extractUTF8(byte[] buff, int len, int offset, final StringExtractResult res) {
352  res.reset();
353 
354  int curOffset = offset;
355  int curChar; //character being extracted
356  int chBytes; //num bytes consumed by current char (1 - 4)
357 
358  final StringBuilder tempString = new StringBuilder();
359 
360  SCRIPT currentScript = SCRIPT.NONE;
361 
362  //decode and extract a character
363  while (curOffset < len) {
364  // based on "valid UTF-8 byte sequences" in the Unicode 5.0 book
365  final int curByte = toUnsignedInt(buff[curOffset]);
366  if (curByte <= 0x7F) {
367  chBytes = 1;
368  curChar = curByte;
369  } else if (curByte <= 0xC1) {
370  break;
371  } else if (curByte <= 0xDF) {
372  if (len - curOffset < 2) {
373  break;
374  }
375  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
376  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
377  chBytes = 2;
378  curChar = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
379  } else {
380  break;
381  }
382  } else if (curByte == 0xE0) {
383  if (len - curOffset < 3) {
384  break;
385  }
386  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
387  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
388 
389  if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
390  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
391  chBytes = 3;
392  curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
393  } else {
394  break;
395  }
396  } else if (curByte <= 0xEC) {
397  if (len - curOffset < 3) {
398  break;
399  }
400  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
401  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
402  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
403  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
404  chBytes = 3;
405  curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
406  } else {
407  break;
408  }
409  } else if (curByte == 0xED) {
410  if (len - curOffset < 3) {
411  break;
412  }
413  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
414  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
415  if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
416  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
417  chBytes = 3;
418  curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
419  } else {
420  break;
421  }
422  } else if (curByte <= 0xEF) {
423  if (len - curOffset < 3) {
424  break;
425  }
426  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
427  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
428  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
429  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
430  chBytes = 3;
431  curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
432  } else {
433  break;
434  }
435  } else if (curByte == 0xF0) {
436  if (len - curOffset < 4) {
437  break;
438  }
439  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
440  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
441  final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
442  if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
443  && curByte_2 >= 0x80 && curByte_2 <= 0xBF
444  && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
445  chBytes = 4;
446  curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
447  } else {
448  break;
449  }
450  } else if (curByte <= 0xF3) {
451  if (len - curOffset < 4) {
452  break;
453  }
454  final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
455  final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
456  final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
457  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
458  && curByte_2 >= 0x80 && curByte_2 <= 0xBF
459  && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
460  chBytes = 4;
461  curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
462  } else {
463  break;
464  }
465  } else {
466  break;
467  }
468 
469  curOffset += chBytes;
470 
471  //skip if beyond range
472  if (curChar > StringExtractUnicodeTable.UNICODE_TABLE_SIZE - 1) {
473  break;
474  }
475 
476  //lookup byteVal in the unicode table
477  SCRIPT scriptFound = unicodeTable.getScript(curChar);
478 
479  if (scriptFound == SCRIPT.NONE) {
480  break;
481  }
482 
483  /*
484  * else if (scriptFound == SCRIPT.CONTROL) { //update bytes
485  * processed res.numBytes += chBytes; continue; } else if
486  * (inControl) { break;
487  }
488  */
489  final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
490  //allow generic and one of enabled scripts we locked in to
491  if (isGeneric
492  || isExtractionEnabled(scriptFound)) {
493 
494  if (currentScript == SCRIPT.NONE
495  && !isGeneric) {
496  //handle case when this is the first char in the string
497  //lock into the script
498  currentScript = scriptFound;
499  }
500  //check if we are within the same script we are locked on to, or COMMON
501  if (currentScript == scriptFound
502  || isGeneric) {
503  if (res.numChars == 0) {
504  //set the start byte offset of the string
505  res.offset = curOffset;
506  }
507  //update bytes processed
508  res.numBytes += chBytes;
509  //append the char
510  ++res.numChars;
511  tempString.append((char) curChar);
512  } else {
513  //bail out
514  break;
515  }
516  } else {
517  //bail out
518  break;
519  }
520 
521  } //no more data
522 
523  res.textString = tempString.toString();
524 
525  return res;
526  }
527 
528  /*
529  * Extract UTF8/16 ASCII characters from byte buffer - only works for Latin,
530  * but fast
531  *
532  * The definition of printable are: -- All of the letters, numbers, and
533  * punctuation. -- space and tab -- It does NOT include newlines or control
534  * chars. -- When looking for ASCII strings, they evaluate each byte and
535  * when they find four or more printable characters they get printed out
536  * with a newline in between each string. -- When looking for Unicode
537  * strings, they evaluate each two byte sequence and look for four or more
538  * printable characters…
539  *
540  * @param readBuf the bytes that the string read from @param len buffer
541  * length @param offset offset to start converting from
542  *
543  */
544  public static String extractASCII(byte[] readBuf, int len, int offset) {
545  final StringBuilder result = new StringBuilder();
546  StringBuilder temp = new StringBuilder();
547  int curLen = 0;
548 
549  final char NL = (char) 10; // ASCII char for new line
550  final String NLS = Character.toString(NL);
551  boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
552  for (int i = offset; i < len; i++) {
553  char curChar = (char) toUnsignedInt(readBuf[i]);
554  if (curChar == 0 && singleConsecZero == false) {
555  //preserve the current sequence if max consec. 1 zero char
556  singleConsecZero = true;
557  } else {
558  singleConsecZero = false;
559  }
560  //ignore non-printable ASCII chars
561  if (isPrintableAscii(curChar)) {
562  temp.append(curChar);
563  ++curLen;
564  } else if (!singleConsecZero) {
565  if (curLen >= MIN_CHARS_STRING) {
566  // add to the result and also add the new line at the end
567  result.append(temp);
568  result.append(NLS);
569  }
570  // reset the temp and curLen
571  temp = new StringBuilder();
572  curLen = 0;
573 
574  }
575  }
576 
577  result.append(temp);
578  return result.toString();
579  }
580 
588  public static boolean isPrintableAscii(char c) {
589  return (c >= 32 && c <= 126) || c == 9;
590  }
591 
/**
 * Mutable holder for what an extraction produced: the text, its length in
 * characters, the number of bytes it was decoded from, and two buffer
 * offsets. Results order by character count, largest first.
 */
public class StringExtractResult implements Comparable<StringExtractResult> {

    int offset; //start offset reported for the extracted text
    int numBytes; //number of bytes that decoded into text
    int numChars; //number of characters extracted
    int firstUnprocessedOff; //first offset not covered by the extraction
    String textString; //the extracted text, null after reset()

    /**
     * Clears every field: the counters and offsets go back to 0 and the
     * text to null.
     */
    void reset() {
        offset = 0;
        numBytes = 0;
        numChars = 0;
        firstUnprocessedOff = 0;
        textString = null;
    }

    /**
     * @return the first offset not covered by the extraction
     */
    public int getFirstUnprocessedOff() {
        return firstUnprocessedOff;
    }

    /**
     * @return the start offset reported for the extracted text
     */
    public int getStartOffset() {
        return offset;
    }

    /**
     * @return the number of bytes that decoded into text
     */
    public int getNumBytes() {
        return numBytes;
    }

    /**
     * @return the number of characters extracted
     */
    public int getTextLength() {
        return numChars;
    }

    /**
     * @return the extracted text; may be null if nothing has been stored
     */
    public String getText() {
        return textString;
    }

    /**
     * Orders results by character count in descending order, so the result
     * with the most characters sorts first.
     *
     * @param o result to compare against
     *
     * @return a negative value if this result has more characters than
     *         {@code o}, a positive value if it has fewer, 0 if equal
     */
    @Override
    public int compareTo(StringExtractResult o) {
        //result with highest num of characters is less than (wins)
        //TODO handle tie - pick language with smallest number of chars
        return Integer.compare(o.numChars, numChars);
    }
}
638 
645  public static class StringExtractUnicodeTable {
646 
/**
 * Implemented by script identifiers that can name the languages written
 * in them.
 */
public interface LanguageInfo {

    /**
     * @return the languages associated with this script, as display text
     */
    String getLanguages();
}
651 
655  public static enum SCRIPT implements LanguageInfo {
656 
657  NONE {
658  @Override
659  public String getLanguages() {
660  return toString();
661  }
662  },
663  COMMON {
664  @Override
665  public String getLanguages() {
666  return toString();
667  }
668  },
669  LATIN_1 {
670  @Override
671  public String toString() {
672  return "Latin - Basic"; //NON-NLS
673  }
674 
675  @Override
676  public String getLanguages() {
677  return "English"; //NON-NLS
678  }
679  },
680  GREEK {
681  @Override
682  public String toString() {
683  return "Greek"; //NON-NLS
684  }
685 
686  @Override
687  public String getLanguages() {
688  return toString();
689  }
690  },
691  CYRILLIC {
692  @Override
693  public String toString() {
694  return "Cyrillic"; //NON-NLS
695  }
696 
697  @Override
698  public String getLanguages() {
699  return "Russian, Bulgarian, Serbian, Moldovan"; //NON-NLS
700  }
701  },
702  ARMENIAN {
703  @Override
704  public String toString() {
705  return "Armenian"; //NON-NLS
706  }
707 
708  @Override
709  public String getLanguages() {
710  return toString();
711  }
712  },
713  HEBREW {
714  @Override
715  public String toString() {
716  return "Hebrew"; //NON-NLS
717  }
718 
719  @Override
720  public String getLanguages() {
721  return toString();
722  }
723  },
724  ARABIC {
725  @Override
726  public String toString() {
727  return "Arabic"; //NON-NLS
728  }
729 
730  @Override
731  public String getLanguages() {
732  return toString();
733  }
734  },
735  SYRIAC {
736  @Override
737  public String getLanguages() {
738  return toString();
739  }
740  },
741  THAANA {
742  @Override
743  public String getLanguages() {
744  return toString();
745  }
746  },
747  DEVANAGARI {
748  @Override
749  public String getLanguages() {
750  return toString();
751  }
752  },
753  BENGALI {
754  @Override
755  public String toString() {
756  return "Bengali"; //NON-NLS
757  }
758 
759  @Override
760  public String getLanguages() {
761  return toString();
762  }
763  },
764  GURMUKHI {
765  @Override
766  public String getLanguages() {
767  return toString();
768  }
769  },
770  GUJARATI {
771  @Override
772  public String getLanguages() {
773  return toString();
774  }
775  },
776  ORIYA {
777  @Override
778  public String getLanguages() {
779  return toString();
780  }
781  },
782  TAMIL {
783  @Override
784  public String getLanguages() {
785  return toString();
786  }
787  },
788  TELUGU {
789  @Override
790  public String getLanguages() {
791  return toString();
792  }
793  },
794  KANNADA {
795  @Override
796  public String getLanguages() {
797  return toString();
798  }
799  },
800  MALAYALAM {
801  @Override
802  public String getLanguages() {
803  return toString();
804  }
805  },
806  SINHALA {
807  @Override
808  public String getLanguages() {
809  return toString();
810  }
811  },
812  THAI {
813  @Override
814  public String toString() {
815  return "Thai"; //NON-NLS
816  }
817 
818  @Override
819  public String getLanguages() {
820  return toString();
821  }
822  },
823  LAO {
824  @Override
825  public String toString() {
826  return "Laotian"; //NON-NLS
827  }
828 
829  @Override
830  public String getLanguages() {
831  return toString();
832  }
833  },
834  TIBETAN {
835  @Override
836  public String toString() {
837  return "Tibetian"; //NON-NLS
838  }
839 
840  @Override
841  public String getLanguages() {
842  return toString();
843  }
844  },
845  MYANMAR {
846  @Override
847  public String getLanguages() {
848  return toString();
849  }
850  },
851  GEORGIAN {
852  @Override
853  public String toString() {
854  return "Georgian"; //NON-NLS
855  }
856 
857  @Override
858  public String getLanguages() {
859  return toString();
860  }
861  },
862  HANGUL {
863  @Override
864  public String toString() {
865  return "Hangul"; //NON-NLS
866  }
867 
868  @Override
869  public String getLanguages() {
870  return "Korean"; //NON-NLS
871  }
872  },
873  ETHIOPIC {
874  @Override
875  public String toString() {
876  return "Ethiopic"; //NON-NLS
877  }
878 
879  @Override
880  public String getLanguages() {
881  return toString();
882  }
883  },
884  CHEROKEE {
885  @Override
886  public String getLanguages() {
887  return toString();
888  }
889  },
890  CANADIAN_ABORIGINAL {
891  @Override
892  public String getLanguages() {
893  return toString();
894  }
895  },
896  OGHAM {
897  @Override
898  public String getLanguages() {
899  return toString();
900  }
901  },
902  RUNIC {
903  @Override
904  public String getLanguages() {
905  return toString();
906  }
907  },
908  KHMER {
909  @Override
910  public String toString() {
911  return "Khmer"; //NON-NLS
912  }
913 
914  @Override
915  public String getLanguages() {
916  return "Cambodian"; //NON-NLS
917  }
918  },
919  MONGOLIAN {
920  @Override
921  public String toString() {
922  return "Mongolian"; //NON-NLS
923  }
924 
925  @Override
926  public String getLanguages() {
927  return toString();
928  }
929  },
930  HIRAGANA {
931  @Override
932  public String toString() {
933  return "Hiragana"; //NON-NLS
934  }
935 
936  @Override
937  public String getLanguages() {
938  return "Japanese"; //NON-NLS
939  }
940  },
941  KATAKANA {
942  @Override
943  public String toString() {
944  return "Katakana"; //NON-NLS
945  }
946 
947  @Override
948  public String getLanguages() {
949  return "Japanese"; //NON-NLS
950  }
951  },
952  BOPOMOFO {
953  @Override
954  public String getLanguages() {
955  return toString();
956  }
957  },
958  HAN {
959  @Override
960  public String toString() {
961  return "Han"; //NON-NLS
962  }
963 
964  @Override
965  public String getLanguages() {
966  return "Chinese, Japanese, Korean"; //NON-NLS
967  }
968  },
969  YI {
970  @Override
971  public String getLanguages() {
972  return toString();
973  }
974  },
975  OLD_ITALIC {
976  @Override
977  public String getLanguages() {
978  return toString();
979  }
980  },
981  GOTHIC {
982  @Override
983  public String getLanguages() {
984  return toString();
985  }
986  },
987  DESERET {
988  @Override
989  public String getLanguages() {
990  return toString();
991  }
992  },
993  INHERITED {
994  @Override
995  public String getLanguages() {
996  return toString();
997  }
998  },
999  TAGALOG {
1000  @Override
1001  public String getLanguages() {
1002  return toString();
1003  }
1004  },
1005  HANUNOO {
1006  @Override
1007  public String getLanguages() {
1008  return toString();
1009  }
1010  },
1011  BUHID {
1012  @Override
1013  public String getLanguages() {
1014  return toString();
1015  }
1016  },
1017  TAGBANWA {
1018  @Override
1019  public String getLanguages() {
1020  return toString();
1021  }
1022  },
1023  LIMBU {
1024  @Override
1025  public String getLanguages() {
1026  return toString();
1027  }
1028  },
1029  TAI_LE {
1030  @Override
1031  public String getLanguages() {
1032  return toString();
1033  }
1034  },
1035  LINEAR_B {
1036  @Override
1037  public String getLanguages() {
1038  return toString();
1039  }
1040  },
1041  UGARITIC {
1042  @Override
1043  public String getLanguages() {
1044  return toString();
1045  }
1046  },
1047  SHAVIAN {
1048  @Override
1049  public String getLanguages() {
1050  return toString();
1051  }
1052  },
1053  OSMANYA {
1054  @Override
1055  public String getLanguages() {
1056  return toString();
1057  }
1058  },
1059  CYPRIOT {
1060  @Override
1061  public String getLanguages() {
1062  return toString();
1063  }
1064  },
1065  BRAILLE {
1066  @Override
1067  public String getLanguages() {
1068  return toString();
1069  }
1070  },
1071  BUGINESE {
1072  @Override
1073  public String getLanguages() {
1074  return toString();
1075  }
1076  },
1077  COPTIC {
1078  @Override
1079  public String getLanguages() {
1080  return toString();
1081  }
1082  },
1083  NEW_TAI_LUE {
1084  @Override
1085  public String getLanguages() {
1086  return toString();
1087  }
1088  },
1089  GLAGOLITIC {
1090  @Override
1091  public String getLanguages() {
1092  return toString();
1093  }
1094  },
1095  TIFINAGH {
1096  @Override
1097  public String getLanguages() {
1098  return toString();
1099  }
1100  },
1101  SYLOTI_NAGRI {
1102  @Override
1103  public String getLanguages() {
1104  return toString();
1105  }
1106  },
1107  OLD_PERSIAN {
1108  @Override
1109  public String getLanguages() {
1110  return toString();
1111  }
1112  },
1113  KHAROSHTHI {
1114  @Override
1115  public String getLanguages() {
1116  return toString();
1117  }
1118  },
1119  BALINESE {
1120  @Override
1121  public String getLanguages() {
1122  return toString();
1123  }
1124  },
1125  CUNEIFORM {
1126  @Override
1127  public String getLanguages() {
1128  return toString();
1129  }
1130  },
1131  PHOENICIAN {
1132  @Override
1133  public String getLanguages() {
1134  return toString();
1135  }
1136  },
1137  PHAGS_PA {
1138  @Override
1139  public String getLanguages() {
1140  return toString();
1141  }
1142  },
1143  NKO {
1144  @Override
1145  public String getLanguages() {
1146  return toString();
1147  }
1148  },
1149  CONTROL {
1150  @Override
1151  public String getLanguages() {
1152  return toString();
1153  }
1154  },
1155  LATIN_2 {
1156  @Override
1157  public String toString() {
1158  return "Latin - Extended"; //NON-NLS
1159  }
1160 
1161  @Override
1162  public String getLanguages() {
1163  return "European"; //NON-NLS
1164  }
1165  }
1166  };
1167  private static final SCRIPT[] SCRIPT_VALUES = SCRIPT.values();
1168  private static final String PROPERTY_FILE = "StringExtract.properties"; //NON-NLS
1172  private static final int UNICODE_TABLE_SIZE = 65536;
1176  private static final char[] UNICODE_TABLE = new char[UNICODE_TABLE_SIZE];
1177  private static StringExtractUnicodeTable instance = null; //the singleton instance
1178 
1185  public static synchronized StringExtractUnicodeTable getInstance() {
1186  if (instance == null) {
1187  instance = new StringExtractUnicodeTable();
1188  if (!instance.init()) {
1189  //error condition
1190  instance = null;
1191  }
1192 
1193  }
1194  return instance;
1195  }
1196 
1204  public SCRIPT getScript(int value) {
1205  char scriptVal = UNICODE_TABLE[value];
1206  return SCRIPT_VALUES[scriptVal];
1207  }
1208 
1217  public static boolean isGeneric(SCRIPT script) {
1218  return script == SCRIPT.COMMON; // || script == SCRIPT.LATIN_1;
1219  }
1220 
1221  public static int getUnicodeTableSize() {
1222  return UNICODE_TABLE_SIZE;
1223  }
1224 
1232  public static int getScriptValue(SCRIPT script) {
1233  return script.ordinal();
1234  }
1235 
1236  public static SCRIPT scriptForString(String scriptStringVal) {
1237  SCRIPT script = SCRIPT.valueOf(scriptStringVal);
1238  return script;
1239  }
1240 
1246  private boolean init() {
1247  Properties properties = new Properties();
1248  try {
1249  //properties.load(new FileInputStream("StringExtract.properties"));
1250  InputStream inputStream = StringExtract.class.getResourceAsStream(PROPERTY_FILE);
1251  properties.load(inputStream);
1252  String table = properties.getProperty("UnicodeTable");
1253  StringTokenizer st = new StringTokenizer(table, " ");
1254  int toks = st.countTokens();
1255  //logger.log(Level.INFO, "TABLE TOKS: " + toks);
1256  if (toks != UNICODE_TABLE_SIZE) {
1257  logger.log(Level.WARNING, "Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE, ", have: " + toks); //NON-NLS
1258  return false;
1259  }
1260 
1261  int tableIndex = 0;
1262  while (st.hasMoreTokens()) {
1263  String tok = st.nextToken();
1264  char code = (char) Integer.parseInt(tok);
1265  UNICODE_TABLE[tableIndex++] = code;
1266  }
1267 
1268  logger.log(Level.INFO, "initialized, unicode table loaded"); //NON-NLS
1269 
1270  } catch (IOException ex) {
1271  logger.log(Level.WARNING, "Could not load" + PROPERTY_FILE); //NON-NLS
1272  return false;
1273  }
1274 
1275  return true;
1276 
1277  }
1278  }
1279 }
static final List< SCRIPT > SUPPORTED_SCRIPTS
StringExtractResult extractUTF8(byte[] buff, int len, int offset, final StringExtractResult res)
static boolean isExtractionSupported(SCRIPT script)
StringExtractResult extract(byte[] buff, int len, int offset)
static synchronized StringExtractUnicodeTable getInstance()
final void setEnabledScripts(List< SCRIPT > scripts)
static String extractASCII(byte[] readBuf, int len, int offset)
final StringExtractUnicodeTable unicodeTable
synchronized static Logger getLogger(String name)
Definition: Logger.java:124
StringExtractResult extractUTF16(byte[] buff, int len, int offset, boolean endianSwap, final StringExtractResult res)

Copyright © 2012-2020 Basis Technology. Generated on: Tue Sep 22 2020
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.