GetPhonebookIndex: Fix handling for minor Japanese kana characters
This adds handling for minor Japanese kana characters to conform to
the Unicode Collation Algorithm.
- Normalize digraphs (yori, koto) into the first letter of their reading.
- Treat dakuten, dot, onbiki, and iteration marks as symbols.
- Add handling for minor small katakana letters.
Signed-off-by: Yutaro Ogasawara <yutaro.ogasawara@gmail.com>
diff --git a/android/PhonebookIndex.cpp b/android/PhonebookIndex.cpp
index 8f5e4be..5cc26e5 100644
--- a/android/PhonebookIndex.cpp
+++ b/android/PhonebookIndex.cpp
@@ -169,19 +169,23 @@
c = android::GetNormalizedCodePoint(c, next, NULL);
// Traditional grouping of Hiragana characters
- if (0x3042 <= c && c <= 0x309F) {
+ if (0x3041 <= c && c <= 0x309F) {
if (c < 0x304B) c = 0x3042; // a
else if (c < 0x3055) c = 0x304B; // ka
else if (c < 0x305F) c = 0x3055; // sa
else if (c < 0x306A) c = 0x305F; // ta
else if (c < 0x306F) c = 0x306A; // na
else if (c < 0x307E) c = 0x306F; // ha
- else if (c < 0x3084) c = 0x307E; // ma
+ else if (c < 0x3083) c = 0x307E; // ma
else if (c < 0x3089) c = 0x3084; // ya
- else if (c < 0x308F) c = 0x3089; // ra
- else c = 0x308F; // wa
+ else if (c < 0x308E) c = 0x3089; // ra
+ else if (c < 0x3094) c = 0x308F; // wa
+ else return 0; // Others are not readable
out[0] = c;
return 1;
+ } else if (0x30A0 <= c && c <= 0x30FF) {
+ // Dot, onbiki, iteration marks are not readable
+ return 0;
}
if (is_CJK(c)) {
diff --git a/android/PhoneticStringUtils.cpp b/android/PhoneticStringUtils.cpp
index 0b971d2..796eaa2 100644
--- a/android/PhoneticStringUtils.cpp
+++ b/android/PhoneticStringUtils.cpp
@@ -153,12 +153,45 @@
case 0x3045:
case 0x3047:
case 0x3049:
+ case 0x3063:
+ case 0x3083:
+ case 0x3085:
+ case 0x3087:
case 0x308E: // xwa
return codepoint + 1;
case 0x3095: // xka
return 0x304B;
- case 0x3096: // xku
+ case 0x3096: // xke
+ return 0x3051;
+ case 0x31F0: // xku
return 0x304F;
+ case 0x31F1: // xsi
+ return 0x3057;
+ case 0x31F2: // xsu
+ return 0x3059;
+ case 0x31F3: // xto
+ return 0x3068;
+ case 0x31F4: // xnu
+ return 0x306C;
+ case 0x31F5: // xha
+ return 0x306F;
+ case 0x31F6: // xhi
+ return 0x3072;
+ case 0x31F7: // xhu
+ return 0x3075;
+ case 0x31F8: // xhe
+ return 0x3078;
+ case 0x31F9: // xho
+ return 0x307B;
+ case 0x31FA: // xmu
+ return 0x3080;
+ case 0x31FB: // xra
+ case 0x31FC: // xri
+ case 0x31FD: // xru
+ case 0x31FE: // xre
+ case 0x31FF: // xro
+ // ra: 0x3089
+ return 0x3089 + (codepoint - 0x31FB);
default:
return codepoint;
}
@@ -172,6 +205,12 @@
// Make fullwidth katakana same as hiragana.
// 96 == 0x30A1 - 0x3041c
codepoint = codepoint - 96;
+ } else if (codepoint == 0x309F) {
+ // Digraph YORI; Yo
+ codepoint = 0x3088;
+ } else if (codepoint == 0x30FF) {
+ // Digraph KOTO; Ko
+ codepoint = 0x3053;
} else {
codepoint = GetHiraganaFromHalfwidthKatakana(
codepoint, next_codepoint, next_is_consumed);