19 package org.sleuthkit.autopsy.coreutils;
 
   21 import java.io.IOException;
 
   22 import java.io.InputStream;
 
   23 import java.util.ArrayList;
 
   24 import java.util.Arrays;
 
   25 import java.util.List;
 
   26 import java.util.Properties;
 
   27 import java.util.StringTokenizer;
 
   28 import java.util.logging.Level;
 
   30 import org.openide.util.NbBundle;
 
   73     private final StringBuilder 
curString = 
new StringBuilder();
 
   82         if (unicodeTable == null) {
 
   83             throw new IllegalStateException(
 
   84                     NbBundle.getMessage(
StringExtract.class, 
"StringExtract.illegalStateException.cannotInit.msg"));
 
  114         this.enabledScripts = scripts;
 
  124         this.enabledScripts = 
new ArrayList<SCRIPT>();
 
  125         this.enabledScripts.add(script);
 
  149         if (script.equals(
SCRIPT.LATIN_1)) {
 
  150             return enabledScripts.contains(
SCRIPT.LATIN_1)
 
  151                     || enabledScripts.contains(
SCRIPT.LATIN_2);
 
  153             return enabledScripts.contains(script);
 
  164         if (enabledScripts.size() == 1
 
  165                 && enabledScripts.get(0).equals(
SCRIPT.LATIN_1)) {
 
  187         if (this.enableUTF16 == 
false && this.enableUTF8 == 
false) {
 
  191         final int buffLen = buff.length;
 
  193         int processedBytes = 0;
 
  194         int curOffset = offset;
 
  195         int startOffset = offset;
 
  196         int curStringLen = 0;
 
  199         curString.delete(0, curString.length());
 
  203         int firstUnprocessedOff = offset;
 
  205         while (curOffset < buffLen) {
 
  207             if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
 
  214             boolean runUTF16 = 
false;
 
  215             if (enableUTF16 && curOffset % 2 == 0) {
 
  219                 resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 : 
resUTF16En2;
 
  227             if (enableUTF8 && enableUTF16) {
 
  228                 resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 : 
resUTF8;
 
  229             } 
else if (enableUTF16) {
 
  231             } 
else if (enableUTF8) {
 
  235             if (resWin.numChars >= MIN_CHARS_STRING) {
 
  237                 if (startOffset == offset) {
 
  239                     startOffset = resWin.offset;
 
  241                 curStringLen += resWin.numChars;
 
  242                 curString.append(resWin.textString);
 
  243                 curString.append(
"\n");
 
  244                 curStringLen += resWin.numChars + 1;
 
  247                 curOffset += resWin.numBytes;
 
  248                 processedBytes += resWin.numBytes;
 
  249                 firstUnprocessedOff = resWin.offset + resWin.numBytes;
 
  252                 if (enableUTF8 == 
false) {
 
  262         res.numBytes = processedBytes;
 
  263         res.numChars = curStringLen;
 
  264         res.offset = startOffset;
 
  265         res.textString = curString.toString();
 
  266         res.firstUnprocessedOff = firstUnprocessedOff; 
 
  274         int curOffset = offset;
 
  276         final StringBuilder tempString = 
new StringBuilder();
 
  280         boolean inControl = 
false;
 
  283         byte[] b = 
new byte[2];
 
  284         while (curOffset < len - 1) {
 
  285             b[0] = buff[curOffset++];
 
  286             b[1] = buff[curOffset++];
 
  297             char byteVal = (char) b[1];
 
  298             byteVal = (char) (byteVal << 8);
 
  309             if (scriptFound == 
SCRIPT.NONE) {
 
  324                 if (currentScript == 
SCRIPT.NONE
 
  328                     currentScript = scriptFound;
 
  331                 if (currentScript == scriptFound
 
  333                     if (res.numChars == 0) {
 
  335                         res.offset = curOffset;
 
  341                     tempString.append(byteVal);
 
  353         res.textString = tempString.toString();
 
  361         int curOffset = offset;
 
  365         final StringBuilder tempString = 
new StringBuilder();
 
  369         boolean inControl = 
false;
 
  372         while (curOffset < len) {
 
  374             final int curByte = buff[curOffset] & 0xFF; 
 
  375             if (curByte <= 0x7F) {
 
  378             } 
else if (curByte <= 0xC1) {
 
  380             } 
else if (curByte <= 0xDF) {
 
  381                 if (len - curOffset < 2) {
 
  384                 final int curByte_1 = buff[curOffset + 1] & 0xFF;
 
  385                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
 
  387                     ch = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
 
  391             } 
else if (curByte == 0xE0) {
 
  392                 if (len - curOffset < 3) {
 
  395                 final int curByte_1 = buff[curOffset + 1] & 0xFF;
 
  396                 final int curByte_2 = buff[curOffset + 2] & 0xFF;
 
  398                 if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
 
  399                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  401                     ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  405             } 
else if (curByte <= 0xEC) {
 
  406                 if (len - curOffset < 3) {
 
  409                 final int curByte_1 = buff[curOffset + 1] & 0xFF;
 
  410                 final int curByte_2 = buff[curOffset + 2] & 0xFF;
 
  411                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
 
  412                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  414                     ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  418             } 
else if (curByte == 0xED) {
 
  419                 if (len - curOffset < 3) {
 
  422                 final int curByte_1 = buff[curOffset + 1] & 0xFF;
 
  423                 final int curByte_2 = buff[curOffset + 2] & 0xFF;
 
  424                 if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
 
  425                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  427                     ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  431             } 
else if (curByte <= 0xEF) {
 
  432                 if (len - curOffset < 3) {
 
  435                 final int curByte_1 = buff[curOffset + 1] & 0xFF;
 
  436                 final int curByte_2 = buff[curOffset + 2] & 0xFF;
 
  437                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
 
  438                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
 
  440                     ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
 
  444             } 
else if (curByte == 0xF0) {
 
  445                 if (len - curOffset < 4) {
 
  448                 final int curByte_1 = buff[curOffset + 1] & 0xFF;
 
  449                 final int curByte_2 = buff[curOffset + 2] & 0xFF;
 
  450                 final int curByte_3 = buff[curOffset + 3] & 0xFF;
 
  451                 if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
 
  452                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF
 
  453                         && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
 
  455                     ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
 
  459             } 
else if (curByte <= 0xF3) {
 
  460                 if (len - curOffset < 4) {
 
  463                 final int curByte_1 = buff[curOffset + 1] & 0xFF;
 
  464                 final int curByte_2 = buff[curOffset + 2] & 0xFF;
 
  465                 final int curByte_3 = buff[curOffset + 3] & 0xFF;
 
  466                 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
 
  467                         && curByte_2 >= 0x80 && curByte_2 <= 0xBF
 
  468                         && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
 
  470                     ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
 
  478             curOffset += chBytes;
 
  488             if (scriptFound == 
SCRIPT.NONE) {
 
  503                 if (currentScript == 
SCRIPT.NONE
 
  507                     currentScript = scriptFound;
 
  510                 if (currentScript == scriptFound
 
  512                     if (res.numChars == 0) {
 
  514                         res.offset = curOffset;
 
  517                     res.numBytes += chBytes;
 
  520                     tempString.append((
char) ch);
 
  532         res.textString = tempString.toString();
 
  553     public static String 
extractASCII(byte[] readBuf, 
int len, 
int offset) {
 
  554         final StringBuilder result = 
new StringBuilder();
 
  555         StringBuilder temp = 
new StringBuilder();
 
  558         final char NL = (char) 10; 
 
  559         final String NLS = Character.toString(NL);
 
  560         boolean singleConsecZero = 
false; 
 
  561         for (
int i = offset; i < len; i++) {
 
  562             char curChar = (char) readBuf[i];
 
  563             if (curChar == 0 && singleConsecZero == 
false) {
 
  565                 singleConsecZero = 
true;
 
  567                 singleConsecZero = 
false;
 
  571                 temp.append(curChar);
 
  573             } 
else if (!singleConsecZero) {
 
  574                 if (curLen >= MIN_CHARS_STRING) {
 
  580                 temp = 
new StringBuilder();
 
  587         return result.toString();
 
  598         return (c >= 32 && c <= 126) || c == 9;
 
  609         int firstUnprocessedOff; 
 
  616             firstUnprocessedOff = 0;
 
  621             return firstUnprocessedOff;
 
  644             return o.numChars - numChars;
 
  668                         public String getLanguages() {
 
  674                         public String getLanguages() {
 
  680                         public String toString() {
 
  681                             return "Latin - Basic"; 
 
  685                         public String getLanguages() {
 
  691                         public String toString() {
 
  696                         public String getLanguages() {
 
  702                         public String toString() {
 
  707                         public String getLanguages() {
 
  708                             return "Russian, Bulgarian, Serbian, Moldovan"; 
 
  713                         public String toString() {
 
  718                         public String getLanguages() {
 
  724                         public String toString() {
 
  729                         public String getLanguages() {
 
  735                         public String toString() {
 
  740                         public String getLanguages() {
 
  746                         public String getLanguages() {
 
  752                         public String getLanguages() {
 
  758                         public String getLanguages() {
 
  764                         public String toString() {
 
  769                         public String getLanguages() {
 
  775                         public String getLanguages() {
 
  781                         public String getLanguages() {
 
  787                         public String getLanguages() {
 
  793                         public String getLanguages() {
 
  799                         public String getLanguages() {
 
  805                         public String getLanguages() {
 
  811                         public String getLanguages() {
 
  817                         public String getLanguages() {
 
  823                         public String toString() {
 
  828                         public String getLanguages() {
 
  834                         public String toString() {
 
  839                         public String getLanguages() {
 
  845                         public String toString() {
 
  850                         public String getLanguages() {
 
  856                         public String getLanguages() {
 
  862                         public String toString() {
 
  867                         public String getLanguages() {
 
  873                         public String toString() {
 
  878                         public String getLanguages() {
 
  884                         public String toString() {
 
  889                         public String getLanguages() {
 
  895                         public String getLanguages() {
 
  899             CANADIAN_ABORIGINAL {
 
  901                         public String getLanguages() {
 
  907                         public String getLanguages() {
 
  913                         public String getLanguages() {
 
  919                         public String toString() {
 
  924                         public String getLanguages() {
 
  930                         public String toString() {
 
  935                         public String getLanguages() {
 
  941                         public String toString() {
 
  946                         public String getLanguages() {
 
  952                         public String toString() {
 
  957                         public String getLanguages() {
 
  963                         public String getLanguages() {
 
  969                         public String toString() {
 
  974                         public String getLanguages() {
 
  975                             return "Chinese, Japanese, Korean"; 
 
  980                         public String getLanguages() {
 
  986                         public String getLanguages() {
 
  992                         public String getLanguages() {
 
  998                         public String getLanguages() {
 
 1004                         public String getLanguages() {
 
 1010                         public String getLanguages() {
 
 1016                         public String getLanguages() {
 
 1022                         public String getLanguages() {
 
 1028                         public String getLanguages() {
 
 1034                         public String getLanguages() {
 
 1040                         public String getLanguages() {
 
 1046                         public String getLanguages() {
 
 1052                         public String getLanguages() {
 
 1058                         public String getLanguages() {
 
 1064                         public String getLanguages() {
 
 1070                         public String getLanguages() {
 
 1076                         public String getLanguages() {
 
 1082                         public String getLanguages() {
 
 1088                         public String getLanguages() {
 
 1094                         public String getLanguages() {
 
 1100                         public String getLanguages() {
 
 1106                         public String getLanguages() {
 
 1112                         public String getLanguages() {
 
 1118                         public String getLanguages() {
 
 1124                         public String getLanguages() {
 
 1130                         public String getLanguages() {
 
 1136                         public String getLanguages() {
 
 1142                         public String getLanguages() {
 
 1148                         public String getLanguages() {
 
 1154                         public String getLanguages() {
 
 1160                         public String getLanguages() {
 
 1166                         public String toString() {
 
 1167                             return "Latin - Extended"; 
 
 1171                         public String getLanguages() {
 
 1195             if (instance == null) {
 
 1197                 if (!instance.
init()) {
 
 1214             char scriptVal = unicodeTable[value];
 
 1215             return SCRIPT_VALUES[scriptVal];
 
 1227             return script == 
SCRIPT.COMMON; 
 
 1242             return script.ordinal();
 
 1256             Properties properties = 
new Properties();
 
 1259                 InputStream inputStream = 
StringExtract.class.getResourceAsStream(PROPERTY_FILE);
 
 1260                 properties.load(inputStream);
 
 1261                 String table = properties.getProperty(
"UnicodeTable");
 
 1262                 StringTokenizer st = 
new StringTokenizer(table, 
" ");
 
 1263                 int toks = st.countTokens();
 
 1265                 if (toks != UNICODE_TABLE_SIZE) {
 
 1266                     logger.log(Level.WARNING, 
"Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE, 
", have: " + toks); 
 
 1271                 while (st.hasMoreTokens()) {
 
 1272                     String tok = st.nextToken();
 
 1273                     char code = (char) Integer.parseInt(tok);
 
 1274                     unicodeTable[tableIndex++] = code;
 
 1277                 logger.log(Level.INFO, 
"initialized, unicode table loaded"); 
 
 1279             } 
catch (IOException ex) {
 
 1280                 logger.log(Level.WARNING, 
"Could not load" + PROPERTY_FILE); 
 
synchronized static Logger getLogger(String name)