19 package org.sleuthkit.autopsy.coreutils;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import static java.lang.Byte.toUnsignedInt;
24 import java.util.ArrayList;
25 import java.util.Arrays;
26 import java.util.List;
27 import java.util.Properties;
28 import java.util.StringTokenizer;
29 import java.util.logging.Level;
31 import org.openide.util.NbBundle;
73 private final StringBuilder
curString =
new StringBuilder();
82 if (unicodeTable == null) {
83 throw new IllegalStateException(
84 NbBundle.getMessage(
StringExtract.class,
"StringExtract.illegalStateException.cannotInit.msg"));
114 this.enabledScripts = scripts;
123 this.enabledScripts =
new ArrayList<>();
124 this.enabledScripts.add(script);
148 if (script.equals(
SCRIPT.LATIN_1)) {
149 return enabledScripts.contains(
SCRIPT.LATIN_1)
150 || enabledScripts.contains(
SCRIPT.LATIN_2);
152 return enabledScripts.contains(script);
163 return enabledScripts.size() == 1
164 && enabledScripts.get(0).equals(
SCRIPT.LATIN_1);
182 if (this.enableUTF16 ==
false && this.enableUTF8 ==
false) {
186 final int buffLen = buff.length;
188 int processedBytes = 0;
189 int curOffset = offset;
190 int startOffset = offset;
191 int curStringLen = 0;
194 curString.delete(0, curString.length());
198 int firstUnprocessedOff = offset;
200 while (curOffset < buffLen) {
202 if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
209 boolean runUTF16 =
false;
210 if (enableUTF16 && curOffset % 2 == 0) {
214 resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 :
resUTF16En2;
222 if (enableUTF8 && resUTF16 != null) {
223 resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 :
resUTF8;
224 }
else if (enableUTF16) {
226 }
else if (enableUTF8) {
230 if (resWin != null && resWin.numChars >= MIN_CHARS_STRING) {
232 if (startOffset == offset) {
234 startOffset = resWin.offset;
236 curStringLen += resWin.numChars;
237 curString.append(resWin.textString);
238 curString.append(
"\n");
239 curStringLen += resWin.numChars + 1;
242 curOffset += resWin.numBytes;
243 processedBytes += resWin.numBytes;
244 firstUnprocessedOff = resWin.offset + resWin.numBytes;
247 if (enableUTF8 ==
false) {
257 res.numBytes = processedBytes;
258 res.numChars = curStringLen;
259 res.offset = startOffset;
260 res.textString = curString.toString();
261 res.firstUnprocessedOff = firstUnprocessedOff;
269 int curOffset = offset;
271 final StringBuilder tempString =
new StringBuilder();
276 while (curOffset < len - 1) {
280 msb = toUnsignedInt(buff[curOffset++]);
281 lsb = toUnsignedInt(buff[curOffset++]);
284 lsb = toUnsignedInt(buff[curOffset++]);
285 msb = toUnsignedInt(buff[curOffset++]);
289 char byteVal = (char) msb;
290 byteVal = (char) (byteVal << 8);
301 if (scriptFound ==
SCRIPT.NONE) {
316 if (currentScript ==
SCRIPT.NONE
320 currentScript = scriptFound;
323 if (currentScript == scriptFound
325 if (res.numChars == 0) {
327 res.offset = curOffset;
333 tempString.append(byteVal);
345 res.textString = tempString.toString();
353 int curOffset = offset;
357 final StringBuilder tempString =
new StringBuilder();
362 while (curOffset < len) {
364 final int curByte = toUnsignedInt(buff[curOffset]);
365 if (curByte <= 0x7F) {
368 }
else if (curByte <= 0xC1) {
370 }
else if (curByte <= 0xDF) {
371 if (len - curOffset < 2) {
374 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
375 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
377 curChar = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
381 }
else if (curByte == 0xE0) {
382 if (len - curOffset < 3) {
385 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
386 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
388 if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
389 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
391 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
395 }
else if (curByte <= 0xEC) {
396 if (len - curOffset < 3) {
399 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
400 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
401 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
402 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
404 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
408 }
else if (curByte == 0xED) {
409 if (len - curOffset < 3) {
412 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
413 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
414 if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
415 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
417 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
421 }
else if (curByte <= 0xEF) {
422 if (len - curOffset < 3) {
425 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
426 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
427 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
428 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
430 curChar = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
434 }
else if (curByte == 0xF0) {
435 if (len - curOffset < 4) {
438 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
439 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
440 final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
441 if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
442 && curByte_2 >= 0x80 && curByte_2 <= 0xBF
443 && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
445 curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
449 }
else if (curByte <= 0xF3) {
450 if (len - curOffset < 4) {
453 final int curByte_1 = toUnsignedInt(buff[curOffset + 1]);
454 final int curByte_2 = toUnsignedInt(buff[curOffset + 2]);
455 final int curByte_3 = toUnsignedInt(buff[curOffset + 3]);
456 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
457 && curByte_2 >= 0x80 && curByte_2 <= 0xBF
458 && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
460 curChar = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
468 curOffset += chBytes;
478 if (scriptFound ==
SCRIPT.NONE) {
493 if (currentScript ==
SCRIPT.NONE
497 currentScript = scriptFound;
500 if (currentScript == scriptFound
502 if (res.numChars == 0) {
504 res.offset = curOffset;
507 res.numBytes += chBytes;
510 tempString.append((
char) curChar);
522 res.textString = tempString.toString();
543 public static String
extractASCII(byte[] readBuf,
int len,
int offset) {
544 final StringBuilder result =
new StringBuilder();
545 StringBuilder temp =
new StringBuilder();
548 final char NL = (char) 10;
549 final String NLS = Character.toString(NL);
550 boolean singleConsecZero =
false;
551 for (
int i = offset; i < len; i++) {
552 char curChar = (char) toUnsignedInt(readBuf[i]);
553 if (curChar == 0 && singleConsecZero ==
false) {
555 singleConsecZero =
true;
557 singleConsecZero =
false;
561 temp.append(curChar);
563 }
else if (!singleConsecZero) {
564 if (curLen >= MIN_CHARS_STRING) {
570 temp =
new StringBuilder();
577 return result.toString();
588 return (c >= 32 && c <= 126) || c == 9;
599 int firstUnprocessedOff;
606 firstUnprocessedOff = 0;
611 return firstUnprocessedOff;
634 return o.numChars - numChars;
658 public String getLanguages() {
664 public String getLanguages() {
670 public String toString() {
671 return "Latin - Basic";
675 public String getLanguages() {
681 public String toString() {
686 public String getLanguages() {
692 public String toString() {
697 public String getLanguages() {
698 return "Russian, Bulgarian, Serbian, Moldovan";
703 public String toString() {
708 public String getLanguages() {
714 public String toString() {
719 public String getLanguages() {
725 public String toString() {
730 public String getLanguages() {
736 public String getLanguages() {
742 public String getLanguages() {
748 public String getLanguages() {
754 public String toString() {
759 public String getLanguages() {
765 public String getLanguages() {
771 public String getLanguages() {
777 public String getLanguages() {
783 public String getLanguages() {
789 public String getLanguages() {
795 public String getLanguages() {
801 public String getLanguages() {
807 public String getLanguages() {
813 public String toString() {
818 public String getLanguages() {
824 public String toString() {
829 public String getLanguages() {
835 public String toString() {
840 public String getLanguages() {
846 public String getLanguages() {
852 public String toString() {
857 public String getLanguages() {
863 public String toString() {
868 public String getLanguages() {
874 public String toString() {
879 public String getLanguages() {
885 public String getLanguages() {
889 CANADIAN_ABORIGINAL {
891 public String getLanguages() {
897 public String getLanguages() {
903 public String getLanguages() {
909 public String toString() {
914 public String getLanguages() {
920 public String toString() {
925 public String getLanguages() {
931 public String toString() {
936 public String getLanguages() {
942 public String toString() {
947 public String getLanguages() {
953 public String getLanguages() {
959 public String toString() {
964 public String getLanguages() {
965 return "Chinese, Japanese, Korean";
970 public String getLanguages() {
976 public String getLanguages() {
982 public String getLanguages() {
988 public String getLanguages() {
994 public String getLanguages() {
1000 public String getLanguages() {
1006 public String getLanguages() {
1012 public String getLanguages() {
1018 public String getLanguages() {
1024 public String getLanguages() {
1030 public String getLanguages() {
1036 public String getLanguages() {
1042 public String getLanguages() {
1048 public String getLanguages() {
1054 public String getLanguages() {
1060 public String getLanguages() {
1066 public String getLanguages() {
1072 public String getLanguages() {
1078 public String getLanguages() {
1084 public String getLanguages() {
1090 public String getLanguages() {
1096 public String getLanguages() {
1102 public String getLanguages() {
1108 public String getLanguages() {
1114 public String getLanguages() {
1120 public String getLanguages() {
1126 public String getLanguages() {
1132 public String getLanguages() {
1138 public String getLanguages() {
1144 public String getLanguages() {
1150 public String getLanguages() {
1156 public String toString() {
1157 return "Latin - Extended";
1161 public String getLanguages() {
1185 if (instance == null) {
1187 if (!instance.
init()) {
1204 char scriptVal = UNICODE_TABLE[value];
1205 return SCRIPT_VALUES[scriptVal];
1217 return script ==
SCRIPT.COMMON;
1232 return script.ordinal();
1246 Properties properties =
new Properties();
1249 InputStream inputStream =
StringExtract.class.getResourceAsStream(PROPERTY_FILE);
1250 properties.load(inputStream);
1251 String table = properties.getProperty(
"UnicodeTable");
1252 StringTokenizer st =
new StringTokenizer(table,
" ");
1253 int toks = st.countTokens();
1255 if (toks != UNICODE_TABLE_SIZE) {
1256 logger.log(Level.WARNING,
"Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE,
", have: " + toks);
1261 while (st.hasMoreTokens()) {
1262 String tok = st.nextToken();
1263 char code = (char) Integer.parseInt(tok);
1264 UNICODE_TABLE[tableIndex++] = code;
1267 logger.log(Level.INFO,
"initialized, unicode table loaded");
1269 }
catch (IOException ex) {
1270 logger.log(Level.WARNING,
"Could not load" + PROPERTY_FILE);
synchronized static Logger getLogger(String name)