19 package org.sleuthkit.autopsy.coreutils;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.ArrayList;
24 import java.util.Arrays;
25 import java.util.List;
26 import java.util.Properties;
27 import java.util.StringTokenizer;
28 import java.util.logging.Level;
30 import org.openide.util.NbBundle;
73 private final StringBuilder
curString =
new StringBuilder();
82 if (unicodeTable == null) {
83 throw new IllegalStateException(
84 NbBundle.getMessage(
StringExtract.class,
"StringExtract.illegalStateException.cannotInit.msg"));
116 this.enabledScripts = scripts;
127 this.enabledScripts =
new ArrayList<SCRIPT>();
128 this.enabledScripts.add(script);
138 return SUPPORTED_SCRIPTS.contains(script);
150 if (script.equals(
SCRIPT.LATIN_1)) {
151 return enabledScripts.contains(
SCRIPT.LATIN_1)
152 || enabledScripts.contains(
SCRIPT.LATIN_2);
154 return enabledScripts.contains(script);
164 if (enabledScripts.size() == 1
165 && enabledScripts.get(0).equals(
SCRIPT.LATIN_1)) {
187 if (this.enableUTF16 ==
false && this.enableUTF8 ==
false) {
191 final int buffLen = buff.length;
193 int processedBytes = 0;
194 int curOffset = offset;
195 int startOffset = offset;
196 int curStringLen = 0;
199 curString.delete(0, curString.length());
203 int firstUnprocessedOff = offset;
205 while (curOffset < buffLen) {
207 if (buff[curOffset] == 0 && curOffset + 1 < buffLen && buff[curOffset + 1] == 0) {
214 boolean runUTF16 =
false;
215 if (enableUTF16 && curOffset % 2 == 0) {
219 resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 :
resUTF16En2;
227 if (enableUTF8 && enableUTF16) {
228 resWin = runUTF16 && resUTF16.numChars > resUTF8.numChars ? resUTF16 :
resUTF8;
229 }
else if (enableUTF16){
232 else if (enableUTF8) {
236 if (resWin.numChars >= MIN_CHARS_STRING) {
238 if (startOffset == offset) {
240 startOffset = resWin.offset;
242 curStringLen += resWin.numChars;
243 curString.append(resWin.textString);
244 curString.append(
"\n");
245 curStringLen += resWin.numChars + 1;
248 curOffset += resWin.numBytes;
249 processedBytes += resWin.numBytes;
250 firstUnprocessedOff = resWin.offset + resWin.numBytes;
253 if (enableUTF8 ==
false) {
263 res.numBytes = processedBytes;
264 res.numChars = curStringLen;
265 res.offset = startOffset;
266 res.textString = curString.toString();
267 res.firstUnprocessedOff = firstUnprocessedOff;
275 int curOffset = offset;
277 final StringBuilder tempString =
new StringBuilder();
281 boolean inControl =
false;
284 byte[] b =
new byte[2];
285 while (curOffset < len - 1) {
286 b[0] = buff[curOffset++];
287 b[1] = buff[curOffset++];
298 char byteVal = (char) b[1];
299 byteVal = (char) (byteVal << 8);
310 if (scriptFound ==
SCRIPT.NONE) {
329 if (currentScript ==
SCRIPT.NONE
333 currentScript = scriptFound;
336 if (currentScript == scriptFound
338 if (res.numChars == 0) {
340 res.offset = curOffset;
346 tempString.append(byteVal);
358 res.textString = tempString.toString();
366 int curOffset = offset;
370 final StringBuilder tempString =
new StringBuilder();
374 boolean inControl =
false;
377 while (curOffset < len) {
379 final int curByte = buff[curOffset] & 0xFF;
380 if (curByte <= 0x7F) {
383 }
else if (curByte <= 0xC1) {
385 }
else if (curByte <= 0xDF) {
386 if (len - curOffset < 2) {
389 final int curByte_1 = buff[curOffset + 1] & 0xFF;
390 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
392 ch = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
396 }
else if (curByte == 0xE0) {
397 if (len - curOffset < 3) {
400 final int curByte_1 = buff[curOffset + 1] & 0xFF;
401 final int curByte_2 = buff[curOffset + 2] & 0xFF;
403 if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
404 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
406 ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
410 }
else if (curByte <= 0xEC) {
411 if (len - curOffset < 3) {
414 final int curByte_1 = buff[curOffset + 1] & 0xFF;
415 final int curByte_2 = buff[curOffset + 2] & 0xFF;
416 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
417 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
419 ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
423 }
else if (curByte == 0xED) {
424 if (len - curOffset < 3) {
427 final int curByte_1 = buff[curOffset + 1] & 0xFF;
428 final int curByte_2 = buff[curOffset + 2] & 0xFF;
429 if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
430 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
432 ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
436 }
else if (curByte <= 0xEF) {
437 if (len - curOffset < 3) {
440 final int curByte_1 = buff[curOffset + 1] & 0xFF;
441 final int curByte_2 = buff[curOffset + 2] & 0xFF;
442 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
443 && curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
445 ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
449 }
else if (curByte == 0xF0) {
450 if (len - curOffset < 4) {
453 final int curByte_1 = buff[curOffset + 1] & 0xFF;
454 final int curByte_2 = buff[curOffset + 2] & 0xFF;
455 final int curByte_3 = buff[curOffset + 3] & 0xFF;
456 if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
457 && curByte_2 >= 0x80 && curByte_2 <= 0xBF
458 && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
460 ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
464 }
else if (curByte <= 0xF3) {
465 if (len - curOffset < 4) {
468 final int curByte_1 = buff[curOffset + 1] & 0xFF;
469 final int curByte_2 = buff[curOffset + 2] & 0xFF;
470 final int curByte_3 = buff[curOffset + 3] & 0xFF;
471 if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
472 && curByte_2 >= 0x80 && curByte_2 <= 0xBF
473 && curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
475 ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
484 curOffset += chBytes;
494 if (scriptFound ==
SCRIPT.NONE) {
511 if (currentScript ==
SCRIPT.NONE
515 currentScript = scriptFound;
518 if (currentScript == scriptFound
520 if (res.numChars == 0) {
522 res.offset = curOffset;
525 res.numBytes += chBytes;
528 tempString.append((
char) ch);
540 res.textString = tempString.toString();
560 public static String
extractASCII(byte[] readBuf,
int len,
int offset) {
561 final StringBuilder result =
new StringBuilder();
562 StringBuilder temp =
new StringBuilder();
565 final char NL = (char) 10;
566 final String NLS = Character.toString(NL);
567 boolean singleConsecZero =
false;
568 for (
int i = offset; i < len; i++) {
569 char curChar = (char) readBuf[i];
570 if (curChar == 0 && singleConsecZero ==
false) {
572 singleConsecZero =
true;
574 singleConsecZero =
false;
578 temp.append(curChar);
580 }
else if (!singleConsecZero) {
581 if (curLen >= MIN_CHARS_STRING) {
587 temp =
new StringBuilder();
594 return result.toString();
604 return (c >= 32 && c <= 126) || c == 9;
616 int firstUnprocessedOff;
624 firstUnprocessedOff = 0;
629 return firstUnprocessedOff;
652 return o.numChars - numChars;
676 public String getLanguages() {
682 public String getLanguages() {
688 public String toString() {
689 return "Latin - Basic";
693 public String getLanguages() {
699 public String toString() {
704 public String getLanguages() {
710 public String toString() {
715 public String getLanguages() {
716 return "Russian, Bulgarian, Serbian, Moldovan";
721 public String toString() {
726 public String getLanguages() {
732 public String toString() {
737 public String getLanguages() {
743 public String toString() {
748 public String getLanguages() {
754 public String getLanguages() {
760 public String getLanguages() {
766 public String getLanguages() {
772 public String toString() {
777 public String getLanguages() {
783 public String getLanguages() {
789 public String getLanguages() {
795 public String getLanguages() {
801 public String getLanguages() {
807 public String getLanguages() {
813 public String getLanguages() {
819 public String getLanguages() {
825 public String getLanguages() {
831 public String toString() {
836 public String getLanguages() {
842 public String toString() {
847 public String getLanguages() {
853 public String toString() {
858 public String getLanguages() {
864 public String getLanguages() {
870 public String toString() {
875 public String getLanguages() {
881 public String toString() {
886 public String getLanguages() {
892 public String toString() {
897 public String getLanguages() {
903 public String getLanguages() {
907 CANADIAN_ABORIGINAL {
909 public String getLanguages() {
915 public String getLanguages() {
921 public String getLanguages() {
927 public String toString() {
932 public String getLanguages() {
938 public String toString() {
943 public String getLanguages() {
949 public String toString() {
954 public String getLanguages() {
960 public String toString() {
965 public String getLanguages() {
971 public String getLanguages() {
977 public String toString() {
982 public String getLanguages() {
983 return "Chinese, Japanese, Korean";
988 public String getLanguages() {
994 public String getLanguages() {
1000 public String getLanguages() {
1006 public String getLanguages() {
1012 public String getLanguages() {
1018 public String getLanguages() {
1024 public String getLanguages() {
1030 public String getLanguages() {
1036 public String getLanguages() {
1042 public String getLanguages() {
1048 public String getLanguages() {
1054 public String getLanguages() {
1060 public String getLanguages() {
1066 public String getLanguages() {
1072 public String getLanguages() {
1078 public String getLanguages() {
1084 public String getLanguages() {
1090 public String getLanguages() {
1096 public String getLanguages() {
1102 public String getLanguages() {
1108 public String getLanguages() {
1114 public String getLanguages() {
1120 public String getLanguages() {
1126 public String getLanguages() {
1132 public String getLanguages() {
1138 public String getLanguages() {
1144 public String getLanguages() {
1150 public String getLanguages() {
1156 public String getLanguages() {
1162 public String getLanguages() {
1168 public String getLanguages() {
1174 public String toString() {
1175 return "Latin - Extended";
1179 public String getLanguages() {
1203 if (instance == null) {
1205 if (!instance.
init()) {
1221 char scriptVal = unicodeTable[value];
1222 return SCRIPT_VALUES[scriptVal];
1233 return script ==
SCRIPT.COMMON;
1247 return script.ordinal();
1261 Properties properties =
new Properties();
1264 InputStream inputStream =
StringExtract.class.getResourceAsStream(PROPERTY_FILE);
1265 properties.load(inputStream);
1266 String table = properties.getProperty(
"UnicodeTable");
1267 StringTokenizer st =
new StringTokenizer(table,
" ");
1268 int toks = st.countTokens();
1270 if (toks != UNICODE_TABLE_SIZE) {
1271 logger.log(Level.WARNING,
"Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE,
", have: " + toks);
1276 while (st.hasMoreTokens()) {
1277 String tok = st.nextToken();
1278 char code = (char) Integer.parseInt(tok);
1279 unicodeTable[tableIndex++] = code;
1282 logger.log(Level.INFO,
"initialized, unicode table loaded");
1284 }
catch (IOException ex) {
1285 logger.log(Level.WARNING,
"Could not load" + PROPERTY_FILE);
static Logger getLogger(String name)