19 package org.sleuthkit.autopsy.coreutils;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import static java.lang.Byte.toUnsignedInt;
24 import java.util.ArrayList;
25 import java.util.Arrays;
26 import java.util.List;
27 import java.util.Properties;
28 import java.util.StringTokenizer;
29 import java.util.logging.Level;
31 import org.openide.util.NbBundle;
73 private final StringBuilder
curString =
new StringBuilder();
82 if (unicodeTable == null) {
83 throw new IllegalStateException(
84 NbBundle.getMessage(
StringExtract.class,
"StringExtract.illegalStateException.cannotInit.msg"));
114 this.enabledScripts = scripts;
123 this.enabledScripts =
new ArrayList<>();
124 this.enabledScripts.add(script);
148 if (script.equals(
SCRIPT.LATIN_1)) {
149 return enabledScripts.contains(
SCRIPT.LATIN_1)
150 || enabledScripts.contains(
SCRIPT.LATIN_2);
152 return enabledScripts.contains(script);
163 return enabledScripts.size() == 1
164 && enabledScripts.get(0).equals(
SCRIPT.LATIN_1);
182 if (this.enableUTF16 ==
false && this.enableUTF8 ==
false) {
186 final int buffLen = buff.length;
188 int processedBytes = 0;
189 int curOffset = offset;
190 int startOffset = offset;
191 int curStringLen = 0;
194 curString.delete(0, curString.length());
198 int firstUnprocessedOff = offset;
200 while (curOffset < buffLen) {
202 if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
209 boolean runUTF16 =
false;
210 if (enableUTF16 && curOffset % 2 == 0) {
214 resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 :
resUTF16En2;
222 if (enableUTF8 && resUTF16 != null) {
223 resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 :
resUTF8;
224 }
else if (runUTF16) {
227 }
else if (enableUTF8) {
231 if (resWin != null && resWin.numChars >= MIN_CHARS_STRING) {
233 if (startOffset == offset) {
235 startOffset = resWin.offset;
// Record the winning string followed by a newline separator.
// The running length is advanced exactly once per string: its characters
// plus one for the separator. The earlier code added resWin.numChars twice,
// which inflated the character count later reported through res.numChars.
curString.append(resWin.textString);
curString.append("\n");
curStringLen += resWin.numChars + 1;
243 curOffset += resWin.numBytes;
244 processedBytes += resWin.numBytes;
245 firstUnprocessedOff = resWin.offset + resWin.numBytes;
248 if (enableUTF8 ==
false) {
258 res.numBytes = processedBytes;
259 res.numChars = curStringLen;
260 res.offset = startOffset;
261 res.textString = curString.toString();
262 res.firstUnprocessedOff = firstUnprocessedOff;
270 int curOffset = offset;
272 final StringBuilder tempString =
new StringBuilder();
277 while (curOffset < len - 1) {
281 msb = toUnsignedInt(buff[curOffset++]);
282 lsb = toUnsignedInt(buff[curOffset++]);
285 lsb = toUnsignedInt(buff[curOffset++]);
286 msb = toUnsignedInt(buff[curOffset++]);
290 char byteVal = (char) msb;
291 byteVal = (char) (byteVal << 8);
302 if (scriptFound ==
SCRIPT.NONE) {
317 if (currentScript ==
SCRIPT.NONE
321 currentScript = scriptFound;
324 if (currentScript == scriptFound
326 if (res.numChars == 0) {
328 res.offset = curOffset;
334 tempString.append(byteVal);
346 res.textString = tempString.toString();
354 int curOffset = offset;
358 final StringBuilder tempString =
new StringBuilder();
363 while (curOffset < len) {
365 final int curByte = toUnsignedInt(buff[curOffset]);
366 if (curByte <= 0x7F) {
369 }
else if (curByte <= 0xC1) {
371 }
else if (curByte <= 0xDF) {
372 if (len - curOffset < 2) {
375 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
376 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
378 curChar = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
382 }
else if (curByte == 0xE0) {
383 if (len - curOffset < 3) {
386 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
387 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
389 if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
390 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
392 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
396 }
else if (curByte <= 0xEC) {
397 if (len - curOffset < 3) {
400 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
401 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
402 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
403 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
405 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
409 }
else if (curByte == 0xED) {
410 if (len - curOffset < 3) {
413 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
414 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
415 if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
416 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
418 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
422 }
else if (curByte <= 0xEF) {
423 if (len - curOffset < 3) {
426 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
427 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
428 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
429 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
431 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
435 }
else if (curByte == 0xF0) {
436 if (len - curOffset < 4) {
439 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
440 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
441 final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
442 if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
443 && curByte_2 >= 0x80 && curByte_2 <= 0xBF
444 && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
446 curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
450 }
else if (curByte <= 0xF3) {
451 if (len - curOffset < 4) {
454 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
455 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
456 final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
457 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
458 && curByte_2 >= 0x80 && curByte_2 <= 0xBF
459 && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
461 curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
469 curOffset += chBytes;
479 if (scriptFound ==
SCRIPT.NONE) {
494 if (currentScript ==
SCRIPT.NONE
498 currentScript = scriptFound;
501 if (currentScript == scriptFound
503 if (res.numChars == 0) {
505 res.offset = curOffset;
508 res.numBytes += chBytes;
511 tempString.append((
char) curChar);
523 res.textString = tempString.toString();
544 public static String
extractASCII(byte[] readBuf,
int len,
int offset) {
545 final StringBuilder result =
new StringBuilder();
546 StringBuilder temp =
new StringBuilder();
549 final char NL = (char) 10;
550 final String NLS = Character.toString(NL);
551 boolean singleConsecZero =
false;
552 for (
int i = offset; i < len; i++) {
553 char curChar = (char) toUnsignedInt(readBuf[i]);
554 if (curChar == 0 && singleConsecZero ==
false) {
556 singleConsecZero =
true;
558 singleConsecZero =
false;
562 temp.append(curChar);
564 }
else if (!singleConsecZero) {
565 if (curLen >= MIN_CHARS_STRING) {
571 temp =
new StringBuilder();
578 return result.toString();
589 return (c >= 32 && c <= 126) || c == 9;
600 int firstUnprocessedOff;
607 firstUnprocessedOff = 0;
612 return firstUnprocessedOff;
// Larger numChars sorts first (descending order). Integer.compare yields the
// same sign as the former subtraction but cannot overflow.
return Integer.compare(o.numChars, numChars);
659 public String getLanguages() {
665 public String getLanguages() {
671 public String toString() {
672 return "Latin - Basic";
676 public String getLanguages() {
682 public String toString() {
687 public String getLanguages() {
693 public String toString() {
698 public String getLanguages() {
699 return "Russian, Bulgarian, Serbian, Moldovan";
704 public String toString() {
709 public String getLanguages() {
715 public String toString() {
720 public String getLanguages() {
726 public String toString() {
731 public String getLanguages() {
737 public String getLanguages() {
743 public String getLanguages() {
749 public String getLanguages() {
755 public String toString() {
760 public String getLanguages() {
766 public String getLanguages() {
772 public String getLanguages() {
778 public String getLanguages() {
784 public String getLanguages() {
790 public String getLanguages() {
796 public String getLanguages() {
802 public String getLanguages() {
808 public String getLanguages() {
814 public String toString() {
819 public String getLanguages() {
825 public String toString() {
830 public String getLanguages() {
836 public String toString() {
841 public String getLanguages() {
847 public String getLanguages() {
853 public String toString() {
858 public String getLanguages() {
864 public String toString() {
869 public String getLanguages() {
875 public String toString() {
880 public String getLanguages() {
886 public String getLanguages() {
890 CANADIAN_ABORIGINAL {
892 public String getLanguages() {
898 public String getLanguages() {
904 public String getLanguages() {
910 public String toString() {
915 public String getLanguages() {
921 public String toString() {
926 public String getLanguages() {
932 public String toString() {
937 public String getLanguages() {
943 public String toString() {
948 public String getLanguages() {
954 public String getLanguages() {
960 public String toString() {
965 public String getLanguages() {
966 return "Chinese, Japanese, Korean";
971 public String getLanguages() {
977 public String getLanguages() {
983 public String getLanguages() {
989 public String getLanguages() {
995 public String getLanguages() {
1001 public String getLanguages() {
1007 public String getLanguages() {
1013 public String getLanguages() {
1019 public String getLanguages() {
1025 public String getLanguages() {
1031 public String getLanguages() {
1037 public String getLanguages() {
1043 public String getLanguages() {
1049 public String getLanguages() {
1055 public String getLanguages() {
1061 public String getLanguages() {
1067 public String getLanguages() {
1073 public String getLanguages() {
1079 public String getLanguages() {
1085 public String getLanguages() {
1091 public String getLanguages() {
1097 public String getLanguages() {
1103 public String getLanguages() {
1109 public String getLanguages() {
1115 public String getLanguages() {
1121 public String getLanguages() {
1127 public String getLanguages() {
1133 public String getLanguages() {
1139 public String getLanguages() {
1145 public String getLanguages() {
1151 public String getLanguages() {
1157 public String toString() {
1158 return "Latin - Extended";
1162 public String getLanguages() {
1186 if (instance == null) {
1188 if (!instance.
init()) {
1205 char scriptVal = UNICODE_TABLE[value];
1206 return SCRIPT_VALUES[scriptVal];
1218 return script ==
SCRIPT.COMMON;
1233 return script.ordinal();
1247 Properties properties =
new Properties();
1250 InputStream inputStream =
StringExtract.class.getResourceAsStream(PROPERTY_FILE);
1251 properties.load(inputStream);
1252 String table = properties.getProperty(
"UnicodeTable");
1253 StringTokenizer st =
new StringTokenizer(table,
" ");
1254 int toks = st.countTokens();
1256 if (toks != UNICODE_TABLE_SIZE) {
// Build the whole message as one string. Previously a comma separated the two
// halves, so ", have: ..." was passed as a log parameter; since the message
// has no {0} placeholder, the actual token count never reached the log.
logger.log(Level.WARNING,
        "Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE + ", have: " + toks);
1262 while (st.hasMoreTokens()) {
1263 String tok = st.nextToken();
1264 char code = (char) Integer.parseInt(tok);
1265 UNICODE_TABLE[tableIndex++] = code;
1268 logger.log(Level.INFO,
"initialized, unicode table loaded");
1270 }
catch (IOException ex) {
// Separate the resource name from the message text and attach the caught
// exception so the cause of the load failure is preserved in the log.
logger.log(Level.WARNING, "Could not load " + PROPERTY_FILE, ex);
synchronized static Logger getLogger(String name)