UTF-8 string prediction

This commit is contained in:
sirjonasxx 2021-01-21 04:32:12 +01:00
parent fbef4ad4ab
commit e4c5a941ee

View File

@ -64,6 +64,21 @@ public class StringChecker extends TypeChecker<String> {
}; };
for (int i = 0; i < s.length(); i++) { for (int i = 0; i < s.length(); i++) {
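// Detect UTF-8 multi-byte sequences: a 2-, 3- or 4-byte lead byte followed by the matching number of 10xxxxxx continuation bytes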
if ((asBytes[i] & 0b11100000) == 0b11000000 && i < s.length() - 1 && (asBytes[i+1] & 0b11000000) == 0b10000000) {
i += 1;
score *= penalties[2]*penalties[2];
}
else if ((asBytes[i] & 0b11110000) == 0b11100000 && i < s.length() - 2 && (asBytes[i+1] & 0b11000000) == 0b10000000 && (asBytes[i+2] & 0b11000000) == 0b10000000) {
i += 2;
score *= penalties[2]*penalties[2]*penalties[2];
}
else if ((asBytes[i] & 0b11111000) == 0b11110000 && i < s.length() - 3 && (asBytes[i+1] & 0b11000000) == 0b10000000 && (asBytes[i+2] & 0b11000000) == 0b10000000 && (asBytes[i+3] & 0b11000000) == 0b10000000) {
i += 3;
score *= penalties[2]*penalties[2]*penalties[2]*penalties[2];
}
else {
score *= penalties[isCommon( score *= penalties[isCommon(
asChars[i], asChars[i],
asBytes[i] asBytes[i]
@ -73,6 +88,7 @@ public class StringChecker extends TypeChecker<String> {
return 0; return 0;
} }
} }
}
return score; return score;
} }