Autopsy  3.1
Graphical digital forensics platform for The Sleuth Kit and other tools.
StringExtract.java
Go to the documentation of this file.
1 /*
2  * Autopsy Forensic Browser
3  *
4  * Copyright 2012 Basis Technology Corp.
5  * Contact: carrier <at> sleuthkit <dot> org
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.sleuthkit.autopsy.coreutils;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.ArrayList;
24 import java.util.Arrays;
25 import java.util.List;
26 import java.util.Properties;
27 import java.util.StringTokenizer;
28 import java.util.logging.Level;
29 
30 import org.openide.util.NbBundle;
33 
43 public class StringExtract {
44 
45  private static final Logger logger = Logger.getLogger(StringExtract.class.getName());
49  public static final int MIN_CHARS_STRING = 4;
54  private List<SCRIPT> enabledScripts;
55  private boolean enableUTF8;
56  private boolean enableUTF16;
57 
58  //stored and reused results
62 
66  private static final List<SCRIPT> SUPPORTED_SCRIPTS =
67  Arrays.asList(
68  SCRIPT.LATIN_1, SCRIPT.LATIN_2, SCRIPT.ARABIC, SCRIPT.CYRILLIC, SCRIPT.HAN,
69  SCRIPT.HIRAGANA, SCRIPT.KATAKANA, SCRIPT.HANGUL,
70  SCRIPT.ARMENIAN, SCRIPT.BENGALI, SCRIPT.KHMER, SCRIPT.ETHIOPIC,
71  SCRIPT.GEORGIAN, SCRIPT.HEBREW, SCRIPT.LAO, SCRIPT.MONGOLIAN, SCRIPT.THAI, SCRIPT.TIBETAN);
72  //current total string buffer, reuse for performance
73  private final StringBuilder curString = new StringBuilder();
74 
79  public StringExtract() {
80  unicodeTable = StringExtractUnicodeTable.getInstance();
81 
82  if (unicodeTable == null) {
83  throw new IllegalStateException(
84  NbBundle.getMessage(StringExtract.class, "StringExtract.illegalStateException.cannotInit.msg"));
85  }
86 
87  setEnabledScripts(SUPPORTED_SCRIPTS);
88  enableUTF8 = true;
89  enableUTF16 = true;
90  }
91 
92  public boolean isEnableUTF8() {
93  return enableUTF8;
94  }
95 
96  public void setEnableUTF8(boolean enableUTF8) {
97  this.enableUTF8 = enableUTF8;
98  }
99 
100  public boolean isEnableUTF16() {
101  return enableUTF16;
102  }
103 
104  public void setEnableUTF16(boolean enableUTF16) {
105  this.enableUTF16 = enableUTF16;
106  }
107 
108 
109 
115  public final void setEnabledScripts(List<SCRIPT> scripts) {
116  this.enabledScripts = scripts;
117  }
118 
119 
125  public final void setEnabledScript(SCRIPT script) {
126 
127  this.enabledScripts = new ArrayList<SCRIPT>();
128  this.enabledScripts.add(script);
129  }
130 
137  public static boolean isExtractionSupported(SCRIPT script) {
138  return SUPPORTED_SCRIPTS.contains(script);
139  }
140 
149  public boolean isExtractionEnabled(SCRIPT script) {
150  if (script.equals(SCRIPT.LATIN_1)) {
151  return enabledScripts.contains(SCRIPT.LATIN_1)
152  || enabledScripts.contains(SCRIPT.LATIN_2);
153  } else {
154  return enabledScripts.contains(script);
155  }
156 
157  }
158 
163  public boolean isExtractionLatinBasicOnly() {
164  if (enabledScripts.size() == 1
165  && enabledScripts.get(0).equals(SCRIPT.LATIN_1)) {
166  return true;
167  }
168  else {
169  return false;
170  }
171  }
172 
173  public static List<SCRIPT> getSupportedScripts() {
174  return SUPPORTED_SCRIPTS;
175  }
176 
186  public StringExtractResult extract(byte[] buff, int len, int offset) {
187  if (this.enableUTF16 == false && this.enableUTF8 == false) {
188  return new StringExtractResult();
189  }
190 
191  final int buffLen = buff.length;
192 
193  int processedBytes = 0;
194  int curOffset = offset;
195  int startOffset = offset;
196  int curStringLen = 0;
197 
198  //reset curString buffer
199  curString.delete(0, curString.length());
200 
201  //keep track of first byte offset that hasn't been processed
202  //(one byte past the last byte processed in by last extraction)
203  int firstUnprocessedOff = offset;
204 
205  while (curOffset < buffLen) {
206  //shortcut, skip processing empty bytes
207  if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
208  curOffset += 2;
209  continue;
210  }
211 
212  //extract using all methods and see which one wins
213  StringExtractResult resUTF16 = null;
214  boolean runUTF16 = false;
215  if (enableUTF16 && curOffset % 2 == 0) {
216  runUTF16 = true;
217  extractUTF16(buff, len, curOffset, true, resUTF16En1);
218  extractUTF16(buff, len, curOffset, false, resUTF16En2);
219  resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 : resUTF16En2;
220  }
221 
222  if (enableUTF8) {
223  extractUTF8(buff, len, curOffset, resUTF8);
224  }
225 
226  StringExtractResult resWin = null;
227  if (enableUTF8 && enableUTF16) {
228  resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 : resUTF8;
229  } else if (enableUTF16){
230  resWin = resUTF16;
231  }
232  else if (enableUTF8) {
233  resWin = resUTF8;
234  }
235 
236  if (resWin.numChars >= MIN_CHARS_STRING) {
237  //record string
238  if (startOffset == offset) {
239  //advance start offset where first string starts it hasn't been advanced
240  startOffset = resWin.offset;
241  }
242  curStringLen += resWin.numChars;
243  curString.append(resWin.textString);
244  curString.append("\n");
245  curStringLen += resWin.numChars + 1;
246 
247  //advance
248  curOffset += resWin.numBytes;
249  processedBytes += resWin.numBytes;
250  firstUnprocessedOff = resWin.offset + resWin.numBytes;
251  } else {
252  //if no encodings worked, advance byte
253  if (enableUTF8 == false) {
254  curOffset += 2;
255  } else {
256  ++curOffset;
257  }
258  }
259  }
260 
261  //build up the final result
263  res.numBytes = processedBytes;
264  res.numChars = curStringLen;
265  res.offset = startOffset;
266  res.textString = curString.toString();
267  res.firstUnprocessedOff = firstUnprocessedOff; //save that of the last winning result
268 
269  return res;
270  }
271 
272  private StringExtractResult extractUTF16(byte[] buff, int len, int offset, boolean endianSwap, final StringExtractResult res) {
273  res.reset();
274 
275  int curOffset = offset;
276 
277  final StringBuilder tempString = new StringBuilder();
278 
279  SCRIPT currentScript = SCRIPT.NONE;
280 
281  boolean inControl = false;
282 
283  //while we have 2 byte chunks
284  byte[] b = new byte[2];
285  while (curOffset < len - 1) {
286  b[0] = buff[curOffset++];
287  b[1] = buff[curOffset++];
288 
289  if (endianSwap) {
290  byte temp = b[0];
291  b[0] = b[1];
292  b[1] = temp;
293  }
294 
295  //convert the byte sequence to 2 byte char
296  //ByteBuffer bb = ByteBuffer.wrap(b);
297  //int byteVal = bb.getInt();
298  char byteVal = (char) b[1];
299  byteVal = (char) (byteVal << 8);
300  byteVal += b[0];
301 
302  //skip if beyond range
303  if (byteVal > StringExtractUnicodeTable.UNICODE_TABLE_SIZE - 1) {
304  break;
305  }
306 
307  //lookup byteVal in the unicode table
308  SCRIPT scriptFound = unicodeTable.getScript(byteVal);
309 
310  if (scriptFound == SCRIPT.NONE) {
311  break;
312  }
313 
314  /*
315  else if (scriptFound == SCRIPT.CONTROL) {
316  //update bytes processed
317  res.numBytes += 2;
318  continue;
319  } else if (inControl) {
320  break;
321  }*/
322 
323 
324  final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
325  //allow generic and one of enabled scripts we locked in to
326  if (isGeneric
327  || isExtractionEnabled(scriptFound)) {
328 
329  if (currentScript == SCRIPT.NONE
330  && !isGeneric) {
331  //handle case when this is the first char in the string
332  //lock into the script
333  currentScript = scriptFound;
334  }
335  //check if we are within the same script we are locked on to, or COMMON
336  if (currentScript == scriptFound
337  || isGeneric) {
338  if (res.numChars == 0) {
339  //set the start offset of the string
340  res.offset = curOffset;
341  }
342  //update bytes processed
343  res.numBytes += 2;
344  //append the char
345  ++res.numChars;
346  tempString.append(byteVal);
347  } else {
348  //bail out
349  break;
350  }
351  } else {
352  //bail out
353  break;
354  }
355 
356  } //no more data
357 
358  res.textString = tempString.toString();
359 
360  return res;
361  }
362 
363  private StringExtractResult extractUTF8(byte[] buff, int len, int offset, final StringExtractResult res) {
364  res.reset();
365 
366  int curOffset = offset;
367  int ch = 0; //character being extracted
368  int chBytes; //num bytes consumed by current char (1 - 4)
369 
370  final StringBuilder tempString = new StringBuilder();
371 
372  SCRIPT currentScript = SCRIPT.NONE;
373 
374  boolean inControl = false;
375 
376  //decode and extract a character
377  while (curOffset < len) {
378  // based on "valid UTF-8 byte sequences" in the Unicode 5.0 book
379  final int curByte = buff[curOffset] & 0xFF; //ensure we are not comparing signed bytes to ints
380  if (curByte <= 0x7F) {
381  chBytes = 1;
382  ch = curByte;
383  } else if (curByte <= 0xC1) {
384  break;
385  } else if (curByte <= 0xDF) {
386  if (len - curOffset < 2) {
387  break;
388  }
389  final int curByte_1 = buff[curOffset + 1] & 0xFF;
390  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
391  chBytes = 2;
392  ch = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
393  } else {
394  break;
395  }
396  } else if (curByte == 0xE0) {
397  if (len - curOffset < 3) {
398  break;
399  }
400  final int curByte_1 = buff[curOffset + 1] & 0xFF;
401  final int curByte_2 = buff[curOffset + 2] & 0xFF;
402 
403  if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
404  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
405  chBytes = 3;
406  ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
407  } else {
408  break;
409  }
410  } else if (curByte <= 0xEC) {
411  if (len - curOffset < 3) {
412  break;
413  }
414  final int curByte_1 = buff[curOffset + 1] & 0xFF;
415  final int curByte_2 = buff[curOffset + 2] & 0xFF;
416  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
417  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
418  chBytes = 3;
419  ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
420  } else {
421  break;
422  }
423  } else if (curByte == 0xED) {
424  if (len - curOffset < 3) {
425  break;
426  }
427  final int curByte_1 = buff[curOffset + 1] & 0xFF;
428  final int curByte_2 = buff[curOffset + 2] & 0xFF;
429  if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
430  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
431  chBytes = 3;
432  ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
433  } else {
434  break;
435  }
436  } else if (curByte <= 0xEF) {
437  if (len - curOffset < 3) {
438  break;
439  }
440  final int curByte_1 = buff[curOffset + 1] & 0xFF;
441  final int curByte_2 = buff[curOffset + 2] & 0xFF;
442  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
443  && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
444  chBytes = 3;
445  ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
446  } else {
447  break;
448  }
449  } else if (curByte == 0xF0) {
450  if (len - curOffset < 4) {
451  break;
452  }
453  final int curByte_1 = buff[curOffset + 1] & 0xFF;
454  final int curByte_2 = buff[curOffset + 2] & 0xFF;
455  final int curByte_3 = buff[curOffset + 3] & 0xFF;
456  if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
457  && curByte_2 >= 0x80 && curByte_2 <= 0xBF
458  && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
459  chBytes = 4;
460  ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
461  } else {
462  break;
463  }
464  } else if (curByte <= 0xF3) {
465  if (len - curOffset < 4) {
466  break;
467  }
468  final int curByte_1 = buff[curOffset + 1] & 0xFF;
469  final int curByte_2 = buff[curOffset + 2] & 0xFF;
470  final int curByte_3 = buff[curOffset + 3] & 0xFF;
471  if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
472  && curByte_2 >= 0x80 && curByte_2 <= 0xBF
473  && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
474  chBytes = 4;
475  ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
476  } else {
477  break;
478  }
479  } else {
480  break;
481  }
482 
483 
484  curOffset += chBytes;
485 
486  //skip if beyond range
488  break;
489  }
490 
491  //lookup byteVal in the unicode table
492  SCRIPT scriptFound = unicodeTable.getScript(ch);
493 
494  if (scriptFound == SCRIPT.NONE) {
495  break;
496  }
497 
498  /*else if (scriptFound == SCRIPT.CONTROL) {
499  //update bytes processed
500  res.numBytes += chBytes;
501  continue;
502  } else if (inControl) {
503  break;
504  }*/
505 
506  final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
507  //allow generic and one of enabled scripts we locked in to
508  if (isGeneric
509  || isExtractionEnabled(scriptFound)) {
510 
511  if (currentScript == SCRIPT.NONE
512  && !isGeneric) {
513  //handle case when this is the first char in the string
514  //lock into the script
515  currentScript = scriptFound;
516  }
517  //check if we are within the same script we are locked on to, or COMMON
518  if (currentScript == scriptFound
519  || isGeneric) {
520  if (res.numChars == 0) {
521  //set the start byte offset of the string
522  res.offset = curOffset;
523  }
524  //update bytes processed
525  res.numBytes += chBytes;
526  //append the char
527  ++res.numChars;
528  tempString.append((char) ch);
529  } else {
530  //bail out
531  break;
532  }
533  } else {
534  //bail out
535  break;
536  }
537 
538  } //no more data
539 
540  res.textString = tempString.toString();
541 
542  return res;
543  }
544 
545  /*
546  * Extract UTF8/16 ASCII characters from byte buffer - only works for Latin, but fast
547  *
548  * The definition of printable are:
549  * -- All of the letters, numbers, and punctuation.
550  * -- space and tab
551  * -- It does NOT include newlines or control chars.
552  * -- When looking for ASCII strings, they evaluate each byte and when they find four or more printable characters they get printed out with a newline in between each string.
553  * -- When looking for Unicode strings, they evaluate each two byte sequence and look for four or more printable characters…
554  *
555  * @param readBuf the bytes that the string read from
556  * @param len buffer length
557  * @param offset offset to start converting from
558  *
559  */
560  public static String extractASCII(byte[] readBuf, int len, int offset) {
561  final StringBuilder result = new StringBuilder();
562  StringBuilder temp = new StringBuilder();
563  int curLen = 0;
564 
565  final char NL = (char) 10; // ASCII char for new line
566  final String NLS = Character.toString(NL);
567  boolean singleConsecZero = false; //preserve the current sequence of chars if 1 consecutive zero char
568  for (int i = offset; i < len; i++) {
569  char curChar = (char) readBuf[i];
570  if (curChar == 0 && singleConsecZero == false) {
571  //preserve the current sequence if max consec. 1 zero char
572  singleConsecZero = true;
573  } else {
574  singleConsecZero = false;
575  }
576  //ignore non-printable ASCII chars
577  if (isPrintableAscii(curChar)) {
578  temp.append(curChar);
579  ++curLen;
580  } else if (!singleConsecZero) {
581  if (curLen >= MIN_CHARS_STRING) {
582  // add to the result and also add the new line at the end
583  result.append(temp);
584  result.append(NLS);
585  }
586  // reset the temp and curLen
587  temp = new StringBuilder();
588  curLen = 0;
589 
590  }
591  }
592 
593  result.append(temp);
594  return result.toString();
595  }
596 
603  public static boolean isPrintableAscii(char c) {
604  return (c >= 32 && c <= 126) || c == 9;
605  }
606 
607 
/**
 * Outcome of one extraction: the decoded text together with the offsets and
 * counts describing where in the buffer it came from. Fields are
 * package-private and written directly by the extraction code.
 */
public class StringExtractResult implements Comparable<StringExtractResult> {

    int offset;
    int numBytes;
    int numChars;
    int firstUnprocessedOff;
    String textString;

    /**
     * Clears every field so the object can be reused for another
     * extraction; the text becomes null.
     */
    void reset() {
        offset = 0;
        numBytes = 0;
        numChars = 0;
        firstUnprocessedOff = 0;
        textString = null;
    }

    /**
     * @return the stored first-unprocessed offset
     */
    public int getFirstUnprocessedOff() {
        return firstUnprocessedOff;
    }

    /**
     * @return the stored start offset
     */
    public int getStartOffset() {
        return offset;
    }

    /**
     * @return the stored byte count
     */
    public int getNumBytes() {
        return numBytes;
    }

    /**
     * @return the stored character count
     */
    public int getTextLength() {
        return numChars;
    }

    /**
     * @return the extracted text, or null after reset()
     */
    public String getText() {
        return textString;
    }

    /**
     * Orders results by descending character count.
     *
     * @param o result to compare against
     * @return a negative value if this result has more characters than
     *         {@code o}, a positive value if it has fewer, zero on a tie
     */
    @Override
    public int compareTo(StringExtractResult o) {
        //result with highest num of characters is less than (wins)
        //TODO handle tie - pick language with smallest number of chars
        return Integer.compare(o.numChars, numChars);
    }
}
655 
662  public static class StringExtractUnicodeTable {
663 
664  public interface LanguageInfo {
665 
666  String getLanguages();
667  }
668 
672  public static enum SCRIPT implements LanguageInfo {
673 
674  NONE {
675  @Override
676  public String getLanguages() {
677  return toString();
678  }
679  },
680  COMMON {
681  @Override
682  public String getLanguages() {
683  return toString();
684  }
685  },
686  LATIN_1 {
687  @Override
688  public String toString() {
689  return "Latin - Basic"; //NON-NLS
690  }
691 
692  @Override
693  public String getLanguages() {
694  return "English"; //NON-NLS
695  }
696  },
697  GREEK {
698  @Override
699  public String toString() {
700  return "Greek"; //NON-NLS
701  }
702 
703  @Override
704  public String getLanguages() {
705  return toString();
706  }
707  },
708  CYRILLIC {
709  @Override
710  public String toString() {
711  return "Cyrillic"; //NON-NLS
712  }
713 
714  @Override
715  public String getLanguages() {
716  return "Russian, Bulgarian, Serbian, Moldovan"; //NON-NLS
717  }
718  },
719  ARMENIAN {
720  @Override
721  public String toString() {
722  return "Armenian"; //NON-NLS
723  }
724 
725  @Override
726  public String getLanguages() {
727  return toString();
728  }
729  },
730  HEBREW {
731  @Override
732  public String toString() {
733  return "Hebrew"; //NON-NLS
734  }
735 
736  @Override
737  public String getLanguages() {
738  return toString();
739  }
740  },
741  ARABIC {
742  @Override
743  public String toString() {
744  return "Arabic"; //NON-NLS
745  }
746 
747  @Override
748  public String getLanguages() {
749  return toString();
750  }
751  },
752  SYRIAC {
753  @Override
754  public String getLanguages() {
755  return toString();
756  }
757  },
758  THAANA {
759  @Override
760  public String getLanguages() {
761  return toString();
762  }
763  },
764  DEVANAGARI {
765  @Override
766  public String getLanguages() {
767  return toString();
768  }
769  },
770  BENGALI {
771  @Override
772  public String toString() {
773  return "Bengali"; //NON-NLS
774  }
775 
776  @Override
777  public String getLanguages() {
778  return toString();
779  }
780  },
781  GURMUKHI {
782  @Override
783  public String getLanguages() {
784  return toString();
785  }
786  },
787  GUJARATI {
788  @Override
789  public String getLanguages() {
790  return toString();
791  }
792  },
793  ORIYA {
794  @Override
795  public String getLanguages() {
796  return toString();
797  }
798  },
799  TAMIL {
800  @Override
801  public String getLanguages() {
802  return toString();
803  }
804  },
805  TELUGU {
806  @Override
807  public String getLanguages() {
808  return toString();
809  }
810  },
811  KANNADA {
812  @Override
813  public String getLanguages() {
814  return toString();
815  }
816  },
817  MALAYALAM {
818  @Override
819  public String getLanguages() {
820  return toString();
821  }
822  },
823  SINHALA {
824  @Override
825  public String getLanguages() {
826  return toString();
827  }
828  },
829  THAI {
830  @Override
831  public String toString() {
832  return "Thai"; //NON-NLS
833  }
834 
835  @Override
836  public String getLanguages() {
837  return toString();
838  }
839  },
840  LAO {
841  @Override
842  public String toString() {
843  return "Laotian"; //NON-NLS
844  }
845 
846  @Override
847  public String getLanguages() {
848  return toString();
849  }
850  },
851  TIBETAN {
852  @Override
853  public String toString() {
854  return "Tibetian"; //NON-NLS
855  }
856 
857  @Override
858  public String getLanguages() {
859  return toString();
860  }
861  },
862  MYANMAR {
863  @Override
864  public String getLanguages() {
865  return toString();
866  }
867  },
868  GEORGIAN {
869  @Override
870  public String toString() {
871  return "Georgian"; //NON-NLS
872  }
873 
874  @Override
875  public String getLanguages() {
876  return toString();
877  }
878  },
879  HANGUL {
880  @Override
881  public String toString() {
882  return "Hangul"; //NON-NLS
883  }
884 
885  @Override
886  public String getLanguages() {
887  return "Korean"; //NON-NLS
888  }
889  },
890  ETHIOPIC {
891  @Override
892  public String toString() {
893  return "Ethiopic"; //NON-NLS
894  }
895 
896  @Override
897  public String getLanguages() {
898  return toString();
899  }
900  },
901  CHEROKEE {
902  @Override
903  public String getLanguages() {
904  return toString();
905  }
906  },
907  CANADIAN_ABORIGINAL {
908  @Override
909  public String getLanguages() {
910  return toString();
911  }
912  },
913  OGHAM {
914  @Override
915  public String getLanguages() {
916  return toString();
917  }
918  },
919  RUNIC {
920  @Override
921  public String getLanguages() {
922  return toString();
923  }
924  },
925  KHMER {
926  @Override
927  public String toString() {
928  return "Khmer"; //NON-NLS
929  }
930 
931  @Override
932  public String getLanguages() {
933  return "Cambodian"; //NON-NLS
934  }
935  },
936  MONGOLIAN {
937  @Override
938  public String toString() {
939  return "Mongolian"; //NON-NLS
940  }
941 
942  @Override
943  public String getLanguages() {
944  return toString();
945  }
946  },
947  HIRAGANA {
948  @Override
949  public String toString() {
950  return "Hiragana"; //NON-NLS
951  }
952 
953  @Override
954  public String getLanguages() {
955  return "Japanese"; //NON-NLS
956  }
957  },
958  KATAKANA {
959  @Override
960  public String toString() {
961  return "Katakana"; //NON-NLS
962  }
963 
964  @Override
965  public String getLanguages() {
966  return "Japanese"; //NON-NLS
967  }
968  },
969  BOPOMOFO {
970  @Override
971  public String getLanguages() {
972  return toString();
973  }
974  },
975  HAN {
976  @Override
977  public String toString() {
978  return "Han"; //NON-NLS
979  }
980 
981  @Override
982  public String getLanguages() {
983  return "Chinese, Japanese, Korean"; //NON-NLS
984  }
985  },
986  YI {
987  @Override
988  public String getLanguages() {
989  return toString();
990  }
991  },
992  OLD_ITALIC {
993  @Override
994  public String getLanguages() {
995  return toString();
996  }
997  },
998  GOTHIC {
999  @Override
1000  public String getLanguages() {
1001  return toString();
1002  }
1003  },
1004  DESERET {
1005  @Override
1006  public String getLanguages() {
1007  return toString();
1008  }
1009  },
1010  INHERITED {
1011  @Override
1012  public String getLanguages() {
1013  return toString();
1014  }
1015  },
1016  TAGALOG {
1017  @Override
1018  public String getLanguages() {
1019  return toString();
1020  }
1021  },
1022  HANUNOO {
1023  @Override
1024  public String getLanguages() {
1025  return toString();
1026  }
1027  },
1028  BUHID {
1029  @Override
1030  public String getLanguages() {
1031  return toString();
1032  }
1033  },
1034  TAGBANWA {
1035  @Override
1036  public String getLanguages() {
1037  return toString();
1038  }
1039  },
1040  LIMBU {
1041  @Override
1042  public String getLanguages() {
1043  return toString();
1044  }
1045  },
1046  TAI_LE {
1047  @Override
1048  public String getLanguages() {
1049  return toString();
1050  }
1051  },
1052  LINEAR_B {
1053  @Override
1054  public String getLanguages() {
1055  return toString();
1056  }
1057  },
1058  UGARITIC {
1059  @Override
1060  public String getLanguages() {
1061  return toString();
1062  }
1063  },
1064  SHAVIAN {
1065  @Override
1066  public String getLanguages() {
1067  return toString();
1068  }
1069  },
1070  OSMANYA {
1071  @Override
1072  public String getLanguages() {
1073  return toString();
1074  }
1075  },
1076  CYPRIOT {
1077  @Override
1078  public String getLanguages() {
1079  return toString();
1080  }
1081  },
1082  BRAILLE {
1083  @Override
1084  public String getLanguages() {
1085  return toString();
1086  }
1087  },
1088  BUGINESE {
1089  @Override
1090  public String getLanguages() {
1091  return toString();
1092  }
1093  },
1094  COPTIC {
1095  @Override
1096  public String getLanguages() {
1097  return toString();
1098  }
1099  },
1100  NEW_TAI_LUE {
1101  @Override
1102  public String getLanguages() {
1103  return toString();
1104  }
1105  },
1106  GLAGOLITIC {
1107  @Override
1108  public String getLanguages() {
1109  return toString();
1110  }
1111  },
1112  TIFINAGH {
1113  @Override
1114  public String getLanguages() {
1115  return toString();
1116  }
1117  },
1118  SYLOTI_NAGRI {
1119  @Override
1120  public String getLanguages() {
1121  return toString();
1122  }
1123  },
1124  OLD_PERSIAN {
1125  @Override
1126  public String getLanguages() {
1127  return toString();
1128  }
1129  },
1130  KHAROSHTHI {
1131  @Override
1132  public String getLanguages() {
1133  return toString();
1134  }
1135  },
1136  BALINESE {
1137  @Override
1138  public String getLanguages() {
1139  return toString();
1140  }
1141  },
1142  CUNEIFORM {
1143  @Override
1144  public String getLanguages() {
1145  return toString();
1146  }
1147  },
1148  PHOENICIAN {
1149  @Override
1150  public String getLanguages() {
1151  return toString();
1152  }
1153  },
1154  PHAGS_PA {
1155  @Override
1156  public String getLanguages() {
1157  return toString();
1158  }
1159  },
1160  NKO {
1161  @Override
1162  public String getLanguages() {
1163  return toString();
1164  }
1165  },
1166  CONTROL {
1167  @Override
1168  public String getLanguages() {
1169  return toString();
1170  }
1171  },
1172  LATIN_2 {
1173  @Override
1174  public String toString() {
1175  return "Latin - Extended"; //NON-NLS
1176  }
1177 
1178  @Override
1179  public String getLanguages() {
1180  return "European"; //NON-NLS
1181  }
1182  }
1183  };
1184  private static final SCRIPT[] SCRIPT_VALUES = SCRIPT.values();
1185  private static final String PROPERTY_FILE = "StringExtract.properties"; //NON-NLS
1189  private static final int UNICODE_TABLE_SIZE = 65536;
1193  private static final char[] unicodeTable = new char[UNICODE_TABLE_SIZE];
1194  private static StringExtractUnicodeTable instance = null; //the singleton instance
1195 
1202  public static synchronized StringExtractUnicodeTable getInstance() {
1203  if (instance == null) {
1204  instance = new StringExtractUnicodeTable();
1205  if (!instance.init()) {
1206  //error condition
1207  instance = null;
1208  }
1209 
1210  }
1211  return instance;
1212  }
1213 
1220  public SCRIPT getScript(int value) {
1221  char scriptVal = unicodeTable[value];
1222  return SCRIPT_VALUES[scriptVal];
1223  }
1224 
1232  public static boolean isGeneric(SCRIPT script) {
1233  return script == SCRIPT.COMMON; // || script == SCRIPT.LATIN_1;
1234  }
1235 
1236  public static int getUnicodeTableSize() {
1237  return UNICODE_TABLE_SIZE;
1238  }
1239 
1246  public static int getScriptValue(SCRIPT script) {
1247  return script.ordinal();
1248  }
1249 
1250  public static SCRIPT scriptForString(String scriptStringVal) {
1251  SCRIPT script = SCRIPT.valueOf(scriptStringVal);
1252  return script;
1253  }
1254 
1260  private boolean init() {
1261  Properties properties = new Properties();
1262  try {
1263  //properties.load(new FileInputStream("StringExtract.properties"));
1264  InputStream inputStream = StringExtract.class.getResourceAsStream(PROPERTY_FILE);
1265  properties.load(inputStream);
1266  String table = properties.getProperty("UnicodeTable");
1267  StringTokenizer st = new StringTokenizer(table, " ");
1268  int toks = st.countTokens();
1269  //logger.log(Level.INFO, "TABLE TOKS: " + toks);
1270  if (toks != UNICODE_TABLE_SIZE) {
1271  logger.log(Level.WARNING, "Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE, ", have: " + toks); //NON-NLS
1272  return false;
1273  }
1274 
1275  int tableIndex = 0;
1276  while (st.hasMoreTokens()) {
1277  String tok = st.nextToken();
1278  char code = (char) Integer.parseInt(tok);
1279  unicodeTable[tableIndex++] = code;
1280  }
1281 
1282  logger.log(Level.INFO, "initialized, unicode table loaded"); //NON-NLS
1283 
1284  } catch (IOException ex) {
1285  logger.log(Level.WARNING, "Could not load" + PROPERTY_FILE); //NON-NLS
1286  return false;
1287  }
1288 
1289  return true;
1290 
1291  }
1292  }
1293 }
static final List< SCRIPT > SUPPORTED_SCRIPTS
StringExtractResult extractUTF8(byte[] buff, int len, int offset, final StringExtractResult res)
static boolean isExtractionSupported(SCRIPT script)
StringExtractResult extract(byte[] buff, int len, int offset)
final void setEnabledScripts(List< SCRIPT > scripts)
static String extractASCII(byte[] readBuf, int len, int offset)
StringExtractResult extractUTF16(byte[] buff, int len, int offset, boolean endianSwap, final StringExtractResult res)
static Logger getLogger(String name)
Definition: Logger.java:131

Copyright © 2012-2015 Basis Technology. Generated on: Mon Oct 19 2015
This work is licensed under a Creative Commons Attribution-Share Alike 3.0 United States License.