Fix contacts index labels for i18n

Switch ContactsProvider to using ICU for generation of index labels,
and remove custom KO and JA code. Add i18n test cases.

Bug:7351596
Change-Id: I7ac25add8b29ff2c6c395f04a83b279b541e4125
diff --git a/android/Android.mk b/android/Android.mk
index 151a5cb..0bb78d3 100644
--- a/android/Android.mk
+++ b/android/Android.mk
@@ -2,7 +2,6 @@
 
 libsqlite3_android_local_src_files := \
 	PhoneNumberUtils.cpp \
-	PhoneticStringUtils.cpp \
 	OldPhoneNumberUtils.cpp \
 	PhonebookIndex.cpp \
 	sqlite3_android.cpp
@@ -10,7 +9,8 @@
 libsqlite3_android_c_includes := \
         external/sqlite/dist \
         external/icu4c/i18n \
-        external/icu4c/common
+        external/icu4c/common \
+        frameworks/native/include
 
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES:= $(libsqlite3_android_local_src_files)
@@ -26,24 +26,6 @@
     include $(BUILD_HOST_STATIC_LIBRARY)
 endif
 
-# Test for PhoneticStringUtils
-include $(CLEAR_VARS)
-
-LOCAL_MODULE:= libsqlite3_phonetic_string_utils_test
-
-LOCAL_CFLAGS += -Wall -Werror
-
-LOCAL_SRC_FILES := \
-	PhoneticStringUtils.cpp \
-	PhoneticStringUtilsTest.cpp
-
-LOCAL_MODULE_TAGS := optional
-
-LOCAL_SHARED_LIBRARIES := \
-	libutils
-
-include $(BUILD_EXECUTABLE)
-
 # Test for PhoneNumberUtils
 #
 # You can also test this in Unix, like this:
@@ -71,3 +53,28 @@
 LOCAL_MODULE_TAGS := optional
 
 include $(BUILD_EXECUTABLE)
+
+ifeq ($(WITH_HOST_DALVIK),true)
+  include $(CLEAR_VARS)
+
+  LOCAL_MODULE:= libsqlite3_phone_book_index_test
+
+  LOCAL_SRC_FILES := \
+	PhonebookIndex.cpp \
+	PhonebookIndexTest.cpp
+
+  LOCAL_C_INCLUDES := \
+        external/icu4c/i18n \
+        external/icu4c/common \
+        frameworks/native/include
+
+  LOCAL_MODULE_TAGS := optional
+
+  LOCAL_SHARED_LIBRARIES := \
+	libicui18n libicuuc
+
+  LOCAL_STATIC_LIBRARIES := \
+	libutils libcutils
+
+  include $(BUILD_HOST_EXECUTABLE)
+endif
diff --git a/android/PhonebookIndex.cpp b/android/PhonebookIndex.cpp
index 5cc26e5..68674f4 100644
--- a/android/PhonebookIndex.cpp
+++ b/android/PhonebookIndex.cpp
@@ -14,192 +14,193 @@
  * limitations under the License.
  */
 
+#include <stdlib.h>
 #include <ctype.h>
 #include <string.h>
+#include <stdio.h>
 
+#include <unicode/alphaindex.h>
 #include <unicode/ucol.h>
 #include <unicode/uiter.h>
 #include <unicode/ustring.h>
 #include <unicode/utypes.h>
+#include <unicode/uloc.h>
+#include <utils/Mutex.h>
+#include <utils/RefBase.h>
 
 #include "PhonebookIndex.h"
-#include "PhoneticStringUtils.h"
 
 #define MIN_OUTPUT_SIZE 6       // Minimum required size for the output buffer (in bytes)
 
 namespace android {
 
-// IMPORTANT!  Keep the codes below SORTED. We are doing a binary search on the array
-static UChar DEFAULT_CHAR_MAP[] = {
-    0x00C6,    'A',       // AE
-    0x00DF,    'S',       // Etzett
-    0x1100, 0x3131,       // HANGUL LETTER KIYEOK
-    0x1101, 0x3132,       // HANGUL LETTER SSANGKIYEOK
-    0x1102, 0x3134,       // HANGUL LETTER NIEUN
-    0x1103, 0x3137,       // HANGUL LETTER TIKEUT
-    0x1104, 0x3138,       // HANGUL LETTER SSANGTIKEUT
-    0x1105, 0x3139,       // HANGUL LETTER RIEUL
-    0x1106, 0x3141,       // HANGUL LETTER MIEUM
-    0x1107, 0x3142,       // HANGUL LETTER PIEUP
-    0x1108, 0x3143,       // HANGUL LETTER SSANGPIEUP
-    0x1109, 0x3145,       // HANGUL LETTER SIOS
-    0x110A, 0x3146,       // HANGUL LETTER SSANGSIOS
-    0x110B, 0x3147,       // HANGUL LETTER IEUNG
-    0x110C, 0x3148,       // HANGUL LETTER CIEUC
-    0x110D, 0x3149,       // HANGUL LETTER SSANGCIEUC
-    0x110E, 0x314A,       // HANGUL LETTER CHIEUCH
-    0x110F, 0x314B,       // HANGUL LETTER KHIEUKH
-    0x1110, 0x314C,       // HANGUL LETTER THIEUTH
-    0x1111, 0x314D,       // HANGUL LETTER PHIEUPH
-    0x1112, 0x314E,       // HANGUL LETTER HIEUH
-    0x111A, 0x3140,       // HANGUL LETTER RIEUL-HIEUH
-    0x1121, 0x3144,       // HANGUL LETTER PIEUP-SIOS
-    0x1161, 0x314F,       // HANGUL LETTER A
-    0x1162, 0x3150,       // HANGUL LETTER AE
-    0x1163, 0x3151,       // HANGUL LETTER YA
-    0x1164, 0x3152,       // HANGUL LETTER YAE
-    0x1165, 0x3153,       // HANGUL LETTER EO
-    0x1166, 0x3154,       // HANGUL LETTER E
-    0x1167, 0x3155,       // HANGUL LETTER YEO
-    0x1168, 0x3156,       // HANGUL LETTER YE
-    0x1169, 0x3157,       // HANGUL LETTER O
-    0x116A, 0x3158,       // HANGUL LETTER WA
-    0x116B, 0x3159,       // HANGUL LETTER WAE
-    0x116C, 0x315A,       // HANGUL LETTER OE
-    0x116D, 0x315B,       // HANGUL LETTER YO
-    0x116E, 0x315C,       // HANGUL LETTER U
-    0x116F, 0x315D,       // HANGUL LETTER WEO
-    0x1170, 0x315E,       // HANGUL LETTER WE
-    0x1171, 0x315F,       // HANGUL LETTER WI
-    0x1172, 0x3160,       // HANGUL LETTER YU
-    0x1173, 0x3161,       // HANGUL LETTER EU
-    0x1174, 0x3162,       // HANGUL LETTER YI
-    0x1175, 0x3163,       // HANGUL LETTER I
-    0x11AA, 0x3133,       // HANGUL LETTER KIYEOK-SIOS
-    0x11AC, 0x3135,       // HANGUL LETTER NIEUN-CIEUC
-    0x11AD, 0x3136,       // HANGUL LETTER NIEUN-HIEUH
-    0x11B0, 0x313A,       // HANGUL LETTER RIEUL-KIYEOK
-    0x11B1, 0x313B,       // HANGUL LETTER RIEUL-MIEUM
-    0x11B3, 0x313D,       // HANGUL LETTER RIEUL-SIOS
-    0x11B4, 0x313E,       // HANGUL LETTER RIEUL-THIEUTH
-    0x11B5, 0x313F,       // HANGUL LETTER RIEUL-PHIEUPH
+// Wrapper class to enable using libutils smart pointers (sp<>) with AlphabeticIndex.
+class AlphabeticIndexRef : public RefBase {
+public:
+    AlphabeticIndexRef(const char *locale, UErrorCode &status) :  // errors come back via |status|
+        m_index(locale, status), m_locale(NULL), m_isJapanese(false) {
+        if (U_FAILURE(status)) {
+            return;
+        }
+        m_locale = strdup(locale);
+        if (m_locale == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+        char language[ULOC_LANG_CAPACITY];
+        uloc_getLanguage(locale, language, sizeof(language), &status);
+        if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
+            return;  // never strcmp() a language buffer that lacks a terminating NUL
+        }
+        m_isJapanese = (strcmp(language, ULOC_JAPANESE) == 0);
+    }
+    virtual ~AlphabeticIndexRef() { free(m_locale); }
+
+    AlphabeticIndex& operator*() { return m_index; }    // direct access to the wrapped index
+    AlphabeticIndex* operator->() { return &m_index; }  // so (*ref)->foo() calls AlphabeticIndex::foo
+
+    bool isLocale(const char *locale) const {  // exact string match against the stored locale name
+        return (locale != NULL && m_locale != NULL &&
+                strcmp(m_locale, locale) == 0);
+    }
+    bool isJapanese() const { return m_isJapanese; }
+    int32_t getLabel(int32_t bucketIndex, UChar *labelBuf, int32_t labelBufSize);  // -1 on error
+
+private:
+    AlphabeticIndex m_index;  // ICU index constructed for the locale passed to the constructor
+    char *m_locale;           // strdup()'d locale name; released with free() in the destructor
+    bool m_isJapanese;        // true when uloc_getLanguage() reported ULOC_JAPANESE
 };
 
-/**
- * Binary search to map an individual character to the corresponding phone book index.
- */
-static UChar map_character(UChar c, UChar * char_map, int32_t length) {
-  int from = 0, to = length;
-  while (from < to) {
-    int m = ((to + from) >> 1) & ~0x1;    // Only consider even positions
-    UChar cm = char_map[m];
-    if (cm == c) {
-      return char_map[m + 1];
-    } else if (cm < c) {
-      from = m + 2;
-    } else {
-      to = m;
+int32_t AlphabeticIndexRef::getLabel(int32_t bucketIndex, UChar *labelBuf,
+                                     int32_t labelBufSize) {  // returns label length, -1 on error
+    UErrorCode status = U_ZERO_ERROR;
+    m_index.resetBucketIterator(status);  // restart iteration before stepping to the bucket
+    if (U_FAILURE(status)) {
+        return -1;
     }
-  }
-  return 0;
+    for(int i = 0; i <= bucketIndex; ++i) {  // bucketIndex + 1 steps land on the requested bucket
+        if (!m_index.nextBucket(status) || U_FAILURE(status)) {
+            return -1;  // ran out of buckets or ICU reported a failure
+        }
+    }
+
+    int32_t len;
+    if (m_index.getBucketLabelType() == U_ALPHAINDEX_NORMAL) {
+        len = m_index.getBucketLabel().extract(labelBuf, labelBufSize, status);  // size is in UChars
+        if (U_FAILURE(status)) {
+            return -1;
+        }
+    } else {
+        // Use no label for underflow/inflow/overflow buckets
+        labelBuf[0] = '\0';
+        len = 0;
+    }
+    return len;
 }
 
+static Mutex gIndexMutex;
+static sp<AlphabeticIndexRef> gIndex;
+
 /**
  * Returns TRUE if the character belongs to a Hanzi unicode block
  */
-static bool is_CJK(UChar c) {
-  return
-       (0x4e00 <= c && c <= 0x9fff)     // CJK_UNIFIED_IDEOGRAPHS
-    || (0x3400 <= c && c <= 0x4dbf)     // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
-    || (0x3000 <= c && c <= 0x303f)     // CJK_SYMBOLS_AND_PUNCTUATION
-    || (0x2e80 <= c && c <= 0x2eff)     // CJK_RADICALS_SUPPLEMENT
-    || (0x3300 <= c && c <= 0x33ff)     // CJK_COMPATIBILITY
-    || (0xfe30 <= c && c <= 0xfe4f)     // CJK_COMPATIBILITY_FORMS
-    || (0xf900 <= c && c <= 0xfaff);    // CJK_COMPATIBILITY_IDEOGRAPHS
+static bool is_CJ(UChar32 c) {  // true if |c| has the Han, Hiragana, or Katakana script
+    return (uscript_hasScript(c, USCRIPT_HAN) ||
+            uscript_hasScript(c, USCRIPT_HIRAGANA) ||
+            uscript_hasScript(c, USCRIPT_KATAKANA));
 }
 
-int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size,
-        UBool * isError)
+static bool initIndexForLocale(const char *locale) {  // GetPhonebookIndex calls this under gIndexMutex
+    if (locale == NULL) {
+        return false;
+    }
+
+    if (gIndex != NULL && gIndex->isLocale(locale)) {
+        return true;  // the cached index was already built for this locale
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    sp<AlphabeticIndexRef> newIndex(new AlphabeticIndexRef(locale, status));
+    if (newIndex == NULL || U_FAILURE(status)) {
+        return false;
+    }
+    // Always create labels for Latin characters if not present in native set
+    (*newIndex)->addLabels("en", status);
+    if (U_FAILURE(status)) {
+        return false;
+    }
+    if ((*newIndex)->getBucketCount(status) <= 0 || U_FAILURE(status)) {
+        return false;  // an index without buckets cannot label anything
+    }
+
+    gIndex = newIndex;  // replace the cached index only after every step above succeeded
+    return true;
+}
+
+int32_t GetPhonebookIndex(UCharIterator *iter, const char *locale,
+                          UChar *out, int32_t size, UBool *isError)
 {
-  if (size < MIN_OUTPUT_SIZE) {
-    *isError = TRUE;
-    return 0;
-  }
-
-  *isError = FALSE;
-
-  // Normalize the first character to remove accents using the NFD normalization
-  UErrorCode errorCode = U_ZERO_ERROR;
-  int32_t len = unorm_next(iter, out, size, UNORM_NFD,
-          0 /* options */, TRUE /* normalize */, NULL, &errorCode);
-  if (U_FAILURE(errorCode)) {
-    *isError = TRUE;
-    return 0;
-  }
-
-  if (len == 0) {   // Empty input string
-    return 0;
-  }
-
-  UChar c = out[0];
-
-  if (!u_isalpha(c)) {
-    // Digits go into a # section. Everything else goes into the empty section
-    // The unicode function u_isdigit would also identify other characters as digits (arabic),
-    // but if we caught them here we'd risk having the same section before and after alpha-letters
-    // which might break the assumption that each section exists only once
-    if (c >= '0' && c <= '9') {
-      out[0] = '#';
-      return 1;
+    if (size < MIN_OUTPUT_SIZE) {
+        *isError = TRUE;
+        return 0;
     }
-    return 0;
-  }
 
-  c = u_toupper(c);
-
-  // Check for explicitly mapped characters
-  UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar));
-  if (c_mapped != 0) {
-    out[0] = c_mapped;
-    return 1;
-  }
-
-  // Convert Kanas to Hiragana
-  UChar next = len > 2 ? out[1] : 0;
-  c = android::GetNormalizedCodePoint(c, next, NULL);
-
-  // Traditional grouping of Hiragana characters
-  if (0x3041 <= c && c <= 0x309F) {
-    if (c < 0x304B) c = 0x3042;         // a
-    else if (c < 0x3055) c = 0x304B;    // ka
-    else if (c < 0x305F) c = 0x3055;    // sa
-    else if (c < 0x306A) c = 0x305F;    // ta
-    else if (c < 0x306F) c = 0x306A;    // na
-    else if (c < 0x307E) c = 0x306F;    // ha
-    else if (c < 0x3083) c = 0x307E;    // ma
-    else if (c < 0x3089) c = 0x3084;    // ya
-    else if (c < 0x308E) c = 0x3089;    // ra
-    else if (c < 0x3094) c = 0x308F;    // wa
-    else return 0;                      // Others are not readable
-    out[0] = c;
-    return 1;
-  } else if (0x30A0 <= c && c <= 0x30FF) {
-    // Dot, onbiki, iteration marks are not readable
-    return 0;
-  }
-
-  if (is_CJK(c)) {
-    if (strncmp(locale, "ja", 2) == 0) {
-      // Japanese word meaning "misc" or "other"
-      out[0] = 0x4ED6;
-      return 1;
-    } else {
-      return 0;
+    *isError = FALSE;
+    out[0] = '\0';
+    iter->move(iter, 0, UITER_ZERO);
+    if (!iter->hasNext(iter)) {   // Empty input string
+        return 0;
     }
-  }
+    UnicodeString ustr;
+    bool prefixIsNonNumeric = false;
+    bool prefixIsNumeric = false;
+    while (iter->hasNext(iter)) {
+        UChar32 ch = uiter_next32(iter);
+        // Ignore standard phone number separators and identify any string
+        // that otherwise starts with a number.
+        if (!prefixIsNumeric && !prefixIsNonNumeric) {
+            if (u_isdigit(ch)) {
+                prefixIsNumeric = true;
+            } else if (!u_isspace(ch) && ch != '+' && ch != '(' &&
+                       ch != ')' && ch != '.' && ch != '-' && ch != '#') {
+                prefixIsNonNumeric = true;
+            }
+        }
+        ustr.append(ch);
+    }
+    if (prefixIsNumeric) {
+        out[0] = '#';
+        return 1;
+    }
 
-  out[0] = c;
-  return 1;
+    Mutex::Autolock autolock(gIndexMutex);
+    if (!initIndexForLocale(locale)) {
+        *isError = TRUE;
+        return 0;
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    int32_t bucketIndex = (*gIndex)->getBucketIndex(ustr, status);
+    if (U_FAILURE(status)) {
+        *isError = TRUE;
+        return 0;
+    }
+
+    int32_t len = gIndex->getLabel(bucketIndex, out, size);
+    if (len < 0) {
+        *isError = TRUE;
+        return 0;
+    }
+
+    // For Japanese, label unclassified CJK ideographs with
+    // Japanese word meaning "misc" or "other"
+    if (gIndex->isJapanese() && len == 0 && is_CJ(ustr.char32At(0))) {
+        out[0] = 0x4ED6;
+        len = 1;
+    }
+
+    return len;
 }
 
 }  // namespace android
diff --git a/android/PhonebookIndexTest.cpp b/android/PhonebookIndexTest.cpp
new file mode 100644
index 0000000..2f11dbe
--- /dev/null
+++ b/android/PhonebookIndexTest.cpp
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PhonebookIndex.h"
+
+#include <unicode/unistr.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+using namespace android;
+
+class TestExecutor {  // minimal test runner for GetPhonebookIndex()
+public:
+    TestExecutor() : m_total_count(0), m_success_count(0), m_success(true) {}
+    bool DoAllTests();  // runs every test and returns true if all succeeded
+private:
+    void DoOneTest(void (TestExecutor::*test)());  // runs one test method and updates the counters
+
+    void testGetIndex(const char *src, const char *locale,
+                      int32_t expected_len, UChar *expected_value);
+    void testEnglish();
+
+    // Note: When adding a test, do not forget to add it to DoAllTests().
+
+    int m_total_count;    // number of test methods run so far
+    int m_success_count;  // number of those that finished with m_success still true
+
+    bool m_success;       // reset to true before each test; a test sets it to false on failure
+};
+
+
+// Runs every test, prints the tally, and returns true only if all of them passed.
+bool TestExecutor::DoAllTests() {
+    DoOneTest(&TestExecutor::testEnglish);
+
+    const int failure_count = m_total_count - m_success_count;
+    printf("Test total: %d\nSuccess: %d\nFailure: %d\n",
+           m_total_count, m_success_count, failure_count);
+    const bool all_passed = (m_total_count == m_success_count);
+    printf("\n%s\n", all_passed ? "Success" : "Failure");
+    return all_passed;
+}
+
+void TestExecutor::DoOneTest(void (TestExecutor::*test)()) {  // run one test and tally it
+    m_success = true;  // the test body clears this flag when a check fails
+    (this->*test)();
+    m_total_count += 1;
+    if (m_success) {
+        m_success_count += 1;
+    }
+}
+
+#define BUFFER_SIZE 10
+
+// Prints |utf8_str| followed by each of its bytes as a \xNN escape in parentheses.
+static void printUTF8Str(const char *utf8_str) {
+    printf("%s (", utf8_str);
+    const char *cursor = utf8_str;
+    while (*cursor != '\0') printf("\\x%02hhX", *cursor++);
+    printf(")");
+}
+
+// Prints |len| UChars as UTF-8 text, then each code unit in hex inside parentheses.
+static void printUChars(const UChar *uc_str, int32_t len) {
+    std::string as_utf8;
+    printf("%s (", UnicodeString(uc_str, len).toUTF8String(as_utf8).c_str());
+    for (int32_t pos = 0; pos < len; ++pos) {
+        printf("0x%02hx%s", uc_str[pos], (pos + 1 < len) ? " " : "");
+    }
+    printf(")");
+}
+
+// Runs GetPhonebookIndex() on the UTF-8 string |src| for |locale| and fails the
+// current test unless the label is |expected_len| UChars equal to |expected_value|.
+void TestExecutor::testGetIndex(const char *src, const char *locale,
+                                int32_t expected_len, UChar *expected_value) {
+    UBool isError;
+    UCharIterator iter;
+    uiter_setUTF8(&iter, src, -1);
+
+    UChar outBuf[BUFFER_SIZE];
+    // GetPhonebookIndex() forwards this size to UnicodeString::extract() as a
+    // capacity in UChars, so pass the element count, not sizeof(outBuf) bytes.
+    int32_t len = GetPhonebookIndex(&iter, locale, outBuf, BUFFER_SIZE, &isError);
+    if (isError) {
+        printf("GetPhonebookIndex returned error (%s:%s)\n", locale, src);
+        m_success = false;
+    } else if (len != expected_len) {
+        printf("len is unexpected value (src: [%s] %s, ", locale, src);
+        printf("actual: %d (", len);  // int32_t is signed, so %d rather than %u
+        printUChars(outBuf, len);
+        printf("), expected: %d (", expected_len);
+        printUChars(expected_value, expected_len);
+        printf("))\n");
+        m_success = false;
+    } else {
+        printf("[%s] %s: ", locale, src);
+        printUChars(outBuf, len);
+        if (memcmp(outBuf, expected_value, sizeof(UChar)*expected_len) != 0) {
+            printf(", expected ");
+            printUChars(expected_value, expected_len);
+            m_success = false;
+        }
+        printf("\n");
+    }
+}
+
+#define TEST_GET_UTF8STR_INDEX(src, locale, ...)                \
+    ({  /* variadic args: the expected label's UChars */        \
+        UChar uc_expected[] = {__VA_ARGS__};                    \
+        int32_t len = sizeof(uc_expected)/sizeof(UChar);        \
+        testGetIndex((src), (locale), len, uc_expected);        \
+    })
+
+#define TEST_GET_UCHAR_INDEX(src, locale, ...)                           \
+    ({  /* |src| is one UChar; it is converted to UTF-8 first */         \
+        std::string utf8_str;                                            \
+        UnicodeString((UChar) (src)).toUTF8String(utf8_str);             \
+        TEST_GET_UTF8STR_INDEX(utf8_str.c_str(), (locale), __VA_ARGS__); \
+    })
+
+void TestExecutor::testEnglish() {
+    printf("testEnglish()\n");
+
+    // English [A-Z]
+    TEST_GET_UTF8STR_INDEX("Allen", "en", 'A');
+    TEST_GET_UTF8STR_INDEX("allen", "en", 'A');
+    TEST_GET_UTF8STR_INDEX("123456", "en", '#');
+    TEST_GET_UTF8STR_INDEX("+1 (123) 456-7890", "en", '#');
+    TEST_GET_UTF8STR_INDEX("(33) 44.55.66.08", "en", '#');
+    TEST_GET_UTF8STR_INDEX("123 Jump", "en", '#');
+    // Arabic-Indic digits (U+0662..U+0666)
+    TEST_GET_UTF8STR_INDEX("\u0662\u0663\u0664\u0665\u0666", "en", '#');
+
+    // Japanese
+    //   sorts hiragana/katakana, Kanji/Chinese, English, other
+    // …, あ, か, さ, た, な, は, ま, や, ら, わ, …
+    // hiragana "a"
+    TEST_GET_UCHAR_INDEX(0x3041, "ja", 0x3042);
+    // katakana "a"
+    TEST_GET_UCHAR_INDEX(0x30A1, "ja", 0x3042);
+
+    // Kanji (sorts to inflow section)
+    TEST_GET_UCHAR_INDEX(0x65E5, "ja", 0x4ed6);
+    // English
+    TEST_GET_UTF8STR_INDEX("Smith", "ja", 'S');
+    TEST_GET_UTF8STR_INDEX("234567", "ja", '#');
+    // Chinese (sorts to inflow section)
+    TEST_GET_UCHAR_INDEX(0x6c88 /* Shen/Chen */, "ja", 0x4ed6);
+    // Korean Hangul (sorts to overflow section)
+    TEST_GET_UCHAR_INDEX(0x1100, "ja", /* null */ );
+
+    // Korean (sorts Korean, then English)
+    // …, ᄀ, ᄂ, ᄃ, ᄅ, ᄆ, ᄇ, ᄉ, ᄋ, ᄌ, ᄎ, ᄏ, ᄐ, ᄑ, ᄒ, …
+    TEST_GET_UCHAR_INDEX(0x1100, "ko", 0x1100);
+    TEST_GET_UCHAR_INDEX(0x3131, "ko", 0x1100);
+    TEST_GET_UCHAR_INDEX(0x1101, "ko", 0x1100);
+    TEST_GET_UCHAR_INDEX(0x1161, "ko", 0x1112);
+
+    // Czech
+    // …, [A-C], Č,[D-H], CH, [I-R], Ř, S, Š, [T-Z], Ž, …
+    TEST_GET_UTF8STR_INDEX("Cena", "cs", 'C');
+    TEST_GET_UTF8STR_INDEX("Čáp", "cs", 0x010c);
+    TEST_GET_UTF8STR_INDEX("Ruda", "cs", 'R');
+    TEST_GET_UTF8STR_INDEX("Řada", "cs", 0x0158);
+    TEST_GET_UTF8STR_INDEX("Selka", "cs", 'S');
+    TEST_GET_UTF8STR_INDEX("Šála", "cs", 0x0160);
+    TEST_GET_UTF8STR_INDEX("Zebra", "cs", 'Z');
+    TEST_GET_UTF8STR_INDEX("Žába", "cs", 0x017d);
+    TEST_GET_UTF8STR_INDEX("Chata", "cs", 'C', 'H');
+
+    // French: [A-Z] (no accented chars)
+    TEST_GET_UTF8STR_INDEX("Øfer", "fr", 'O');
+    TEST_GET_UTF8STR_INDEX("Œster", "fr", 'O');
+
+    // Danish: [A-Z], Æ, Ø, Å
+    TEST_GET_UTF8STR_INDEX("Ænes", "da", 0xc6);
+    TEST_GET_UTF8STR_INDEX("Øfer", "da", 0xd8);
+    TEST_GET_UTF8STR_INDEX("Œster", "da", 0xd8);
+    TEST_GET_UTF8STR_INDEX("Ågård", "da", 0xc5);
+
+    // German: [A-Z] (no ß or umlauted characters in standard alphabet)
+    TEST_GET_UTF8STR_INDEX("ßind", "de", 'S');
+
+    // Simplified Chinese (default collator Pinyin): [A-Z]
+    // Shen/Chen (simplified): should be, usually, 'S' for name collator and 'C' for apps/other
+    TEST_GET_UCHAR_INDEX(0x6c88 /* Shen/Chen */, "zh_CN", 'C');
+    // Shen/Chen (traditional)
+    TEST_GET_UCHAR_INDEX(0x700b, "zh_CN", 'S');
+    // Jia/Gu: should be, usually, 'J' for name collator and 'G' for apps/other
+    TEST_GET_UCHAR_INDEX(0x8d3e /* Jia/Gu */, "zh_CN", 'J');
+
+    // Traditional Chinese
+    // …, 一, 丁, 丈, 不, 且, 丞, 串, 並, 亭, 乘, 乾, 傀, 亂, 僎, 僵, 儐, 償, 叢, 儳, 嚴, 儷, 儻, 囌, 囑, 廳, …
+    TEST_GET_UCHAR_INDEX(0x6c88 /* Shen/Chen */, "zh_TW", 0x5080);
+    TEST_GET_UCHAR_INDEX(0x700b /* Shen/Chen */, "zh_TW", 0x53e2);
+    TEST_GET_UCHAR_INDEX(0x8d3e /* Jia/Gu */, "zh_TW", 0x5080);
+
+    // Thai (sorts English then Thai)
+    // …, ก, ข, ฃ, ค, ฅ, ฆ, ง, จ, ฉ, ช, ซ, ฌ, ญ, ฎ, ฏ, ฐ, ฑ, ฒ, ณ, ด, ต, ถ, ท, ธ, น, บ, ป, ผ, ฝ, พ, ฟ, ภ, ม, ย, ร, ฤ, ล, ฦ, ว, ศ, ษ, ส, ห, ฬ, อ, ฮ, …,
+
+    TEST_GET_UTF8STR_INDEX("\u0e2d\u0e07\u0e04\u0e4c\u0e40\u0e25\u0e47\u0e01",
+                           "th", 0xe2d);
+    TEST_GET_UTF8STR_INDEX("\u0e2a\u0e34\u0e07\u0e2b\u0e40\u0e2a\u0e19\u0e35",
+                           "th", 0xe2a);
+    // Thai numbers ((02) 432-0281)
+    TEST_GET_UTF8STR_INDEX("(\u0e50\u0e52) \u0e54\u0e53\u0e52-"
+                           "\u0e50\u0e52\u0e58\u0e51", "th", '#');
+
+    // Arabic (sorts English then Arabic)
+    // …, ا, ب, ت, ث, ج, ح, خ, د, ذ, ر, ز, س, ش, ص, ض, ط, ظ, ع, غ, ف, ق, ك, ل, م, ن, ه, و, ي, …
+    TEST_GET_UTF8STR_INDEX("\u0646\u0648\u0631" /* Noor */, "ar", 0x646);
+    // Arabic-Indic digits (23456)
+    TEST_GET_UTF8STR_INDEX("\u0662\u0663\u0664\u0665\u0666", "ar", '#');
+
+    // Hebrew (sorts English then Hebrew)
+    // …, א, ב, ג, ד, ה, ו, ז, ח, ט, י, כ, ל, מ, נ, ס, ע, פ, צ, ק, ר, ש, ת, …
+    TEST_GET_UTF8STR_INDEX("\u05e4\u05e8\u05d9\u05d3\u05de\u05df",  "he", 0x5e4);
+}
+
+// Test entry point: the process exit status is 0 only when every test passes.
+int main() {
+    TestExecutor executor;
+    const bool all_passed = executor.DoAllTests();
+
+    const int exit_status = all_passed ? 0 : 1;
+    return exit_status;
+}
diff --git a/android/PhoneticStringUtils.cpp b/android/PhoneticStringUtils.cpp
deleted file mode 100644
index 796eaa2..0000000
--- a/android/PhoneticStringUtils.cpp
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "PhoneticStringUtils.h"
-#include <utils/Unicode.h>
-
-// We'd like 0 length string last of sorted list. So when input string is NULL
-// or 0 length string, we use these instead.
-#define CODEPOINT_FOR_NULL_STR 0xFFFD
-#define STR_FOR_NULL_STR "\xEF\xBF\xBD"
-
-// We assume that users will not notice strings not sorted properly when the
-// first 128 characters are the same.
-#define MAX_CODEPOINTS 128
-
-namespace android {
-
-// Get hiragana from halfwidth katakana.
-static int GetHiraganaFromHalfwidthKatakana(char32_t codepoint,
-                                            char32_t next_codepoint,
-                                            bool *next_is_consumed) {
-    if (codepoint < 0xFF66 || 0xFF9F < codepoint) {
-        return codepoint;
-    }
-
-    switch (codepoint) {
-        case 0xFF66: // wo
-            return 0x3092;
-        case 0xFF67: // xa
-            return 0x3041;
-        case 0xFF68: // xi
-            return 0x3043;
-        case 0xFF69: // xu
-            return 0x3045;
-        case 0xFF6A: // xe
-            return 0x3047;
-        case 0xFF6B: // xo
-            return 0x3049;
-        case 0xFF6C: // xya
-            return 0x3083;
-        case 0xFF6D: // xyu
-            return 0x3085;
-        case 0xFF6E: // xyo
-            return 0x3087;
-        case 0xFF6F: // xtsu
-            return 0x3063;
-        case 0xFF70: // -
-            return 0x30FC;
-        case 0xFF9C: // wa
-            return 0x308F;
-        case 0xFF9D: // n
-            return 0x3093;
-            break;
-        default:   {
-            if (0xFF71 <= codepoint && codepoint <= 0xFF75) {
-                // a, i, u, e, o
-                if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) {
-                    if (next_is_consumed != NULL) {
-                        *next_is_consumed = true;
-                    }
-                    return 0x3094; // vu
-                } else {
-                    return 0x3042 + (codepoint - 0xFF71) * 2;
-                }
-            } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) {
-                // ka - chi
-                if (next_codepoint == 0xFF9E) {
-                    // "dakuten" (voiced mark)
-                    if (next_is_consumed != NULL) {
-                        *next_is_consumed = true;
-                    }
-                    return 0x304B + (codepoint - 0xFF76) * 2 + 1;
-                } else {
-                    return 0x304B + (codepoint - 0xFF76) * 2;
-                }
-            } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) {
-                // tsu, te, to (skip xtsu)
-                if (next_codepoint == 0xFF9E) {
-                    // "dakuten" (voiced mark)
-                    if (next_is_consumed != NULL) {
-                        *next_is_consumed = true;
-                    }
-                    return 0x3064 + (codepoint - 0xFF82) * 2 + 1;
-                } else {
-                    return 0x3064 + (codepoint - 0xFF82) * 2;
-                }
-            } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) {
-                // na, ni, nu, ne, no
-                return 0x306A + (codepoint - 0xFF85);
-            } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) {
-                // ha, hi, hu, he, ho
-                if (next_codepoint == 0xFF9E) {
-                    // "dakuten" (voiced mark)
-                    if (next_is_consumed != NULL) {
-                        *next_is_consumed = true;
-                    }
-                    return 0x306F + (codepoint - 0xFF8A) * 3 + 1;
-                } else if (next_codepoint == 0xFF9F) {
-                    // "han-dakuten" (half voiced mark)
-                    if (next_is_consumed != NULL) {
-                        *next_is_consumed = true;
-                    }
-                    return 0x306F + (codepoint - 0xFF8A) * 3 + 2;
-                } else {
-                    return 0x306F + (codepoint - 0xFF8A) * 3;
-                }
-            } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) {
-                // ma, mi, mu, me, mo
-                return 0x307E + (codepoint - 0xFF8F);
-            } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) {
-                // ya, yu, yo
-                return 0x3084 + (codepoint - 0xFF94) * 2;
-            } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) {
-                // ra, ri, ru, re, ro
-                return 0x3089 + (codepoint - 0xFF97);
-            }
-            // Note: 0xFF9C, 0xFF9D are handled above
-        } // end of default
-    }
-
-    return codepoint;
-}
-
-// Assuming input is hiragana, convert the hiragana to "normalized" hiragana.
-static int GetNormalizedHiragana(int codepoint) {
-    if (codepoint < 0x3040 || 0x309F < codepoint) {
-        return codepoint;
-    }
-
-    // TODO: should care (semi-)voiced mark (0x3099, 0x309A).
-
-    // Trivial kana conversions.
-    // e.g. xa => a
-    switch (codepoint) {
-        case 0x3041:
-        case 0x3043:
-        case 0x3045:
-        case 0x3047:
-        case 0x3049:
-        case 0x3063:
-        case 0x3083:
-        case 0x3085:
-        case 0x3087:
-        case 0x308E: // xwa
-            return codepoint + 1;
-        case 0x3095: // xka
-            return 0x304B;
-        case 0x3096: // xke
-            return 0x3051;
-        case 0x31F0: // xku
-            return 0x304F;
-        case 0x31F1: // xsi
-            return 0x3057;
-        case 0x31F2: // xsu
-            return 0x3059;
-        case 0x31F3: // xto
-            return 0x3068;
-        case 0x31F4: // xnu
-            return 0x306C;
-        case 0x31F5: // xha
-            return 0x306F;
-        case 0x31F6: // xhi
-            return 0x3072;
-        case 0x31F7: // xhu
-            return 0x3075;
-        case 0x31F8: // xhe
-            return 0x3078;
-        case 0x31F9: // xho
-            return 0x307B;
-        case 0x31FA: // xmu
-            return 0x3080;
-        case 0x31FB: // xra
-        case 0x31FC: // xri
-        case 0x31FD: // xru
-        case 0x31FE: // xre
-        case 0x31FF: // xro
-            // ra: 0x3089
-            return 0x3089 + (codepoint - 0x31FB);
-        default:
-            return codepoint;
-    }
-}
-
-static int GetNormalizedKana(char32_t codepoint,
-                             char32_t next_codepoint,
-                             bool *next_is_consumed) {
-    // First, convert fullwidth katakana and halfwidth katakana to hiragana.
-    if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
-        // Make fullwidth katakana same as hiragana.
-        // 96 == 0x30A1 - 0x3041c
-        codepoint = codepoint - 96;
-    } else if (codepoint == 0x309F) {
-        // Digraph YORI; Yo
-        codepoint = 0x3088;
-    } else if (codepoint == 0x30FF) {
-        // Digraph KOTO; Ko
-        codepoint = 0x3053;
-    } else {
-        codepoint = GetHiraganaFromHalfwidthKatakana(
-                codepoint, next_codepoint, next_is_consumed);
-    }
-
-    // Normalize Hiragana.
-    return GetNormalizedHiragana(codepoint);
-}
-
-int GetNormalizedCodePoint(char32_t codepoint,
-                           char32_t next_codepoint,
-                           bool *next_is_consumed) {
-    if (next_is_consumed != NULL) {
-        *next_is_consumed = false;
-    }
-
-    if (codepoint <= 0x0020 || codepoint == 0x3000) {
-        // Whitespaces. Keep it as is.
-        return codepoint;
-    } else if ((0x0021 <= codepoint && codepoint <= 0x007E) ||
-               (0xFF01 <= codepoint && codepoint <= 0xFF5E)) {
-        // Ascii and fullwidth ascii. Keep it as is
-        return codepoint;
-    } else if (codepoint == 0x02DC || codepoint == 0x223C) {
-        // tilde
-        return 0xFF5E;
-    } else if (codepoint <= 0x3040 ||
-               (0x3100 <= codepoint && codepoint < 0xFF00) ||
-               codepoint == CODEPOINT_FOR_NULL_STR) {
-        // Keep it as is.
-        return codepoint;
-    }
-
-    // Below is Kana-related handling.
-
-    return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
-}
-
-static bool GetExpectedString(
-    const char *src, char **dst, size_t *dst_len,
-    int (*get_codepoint_function)(char32_t, char32_t, bool*)) {
-    if (dst == NULL || dst_len == NULL) {
-        return false;
-    }
-
-    if (src == NULL || *src == '\0') {
-        src = STR_FOR_NULL_STR;
-    }
-
-    char32_t codepoints[MAX_CODEPOINTS]; // if array size is changed the for loop needs to be changed
-
-    ssize_t src_len = utf8_length(src);
-    if (src_len <= 0) {
-        return false;
-    }
-
-    bool next_is_consumed;
-    size_t j = 0;
-    for (size_t i = 0; i < (size_t)src_len && j < MAX_CODEPOINTS;) {
-        int32_t ret = utf32_from_utf8_at(src, src_len, i, &i);
-        if (ret < 0) {
-            // failed to parse UTF-8
-            return false;
-        }
-        ret = get_codepoint_function(
-                static_cast<char32_t>(ret),
-                i + 1 < (size_t)src_len ? src[i + 1] : 0,
-                &next_is_consumed);
-        if (ret > 0) {
-            codepoints[j] = static_cast<char32_t>(ret);
-            j++;
-        }
-        if (next_is_consumed) {
-            i++;
-        }
-    }
-    size_t length = j;
-
-    if (length == 0) {
-        // If all of codepoints are invalid, we place the string at the end of
-        // the list.
-        codepoints[0] = 0x10000 + CODEPOINT_FOR_NULL_STR;
-        length = 1;
-    }
-
-    ssize_t new_len = utf32_to_utf8_length(codepoints, length);
-    if (new_len < 0) {
-        return false;
-    }
-
-    *dst = static_cast<char *>(malloc(new_len + 1));
-    if (*dst == NULL) {
-        return false;
-    }
-
-    utf32_to_utf8(codepoints, length, *dst);
-
-    *dst_len = new_len;
-    return true;
-}
-
-bool GetNormalizedString(const char *src, char **dst, size_t *len) {
-    return GetExpectedString(src, dst, len, GetNormalizedCodePoint);
-}
-
-}  // namespace android
diff --git a/android/PhoneticStringUtils.h b/android/PhoneticStringUtils.h
deleted file mode 100644
index a567a27..0000000
--- a/android/PhoneticStringUtils.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef _ANDROID_PHONETIC_STRING_UTILS_H
-#define _ANDROID_PHONETIC_STRING_UTILS_H
-
-#include <string.h>  // For size_t.
-#include <utils/String8.h>
-
-namespace android {
-
-// Returns codepoint which is "normalized", whose definition depends on each
-// Locale. Note that currently this function normalizes only Japanese; the
-// other characters are remained as is.
-// The variable "next_is_consumed" is set to true if "next_codepoint"
-// is "consumed" (e.g. Japanese halfwidth katakana's voiced mark is consumed
-// when previous "codepoint" is appropriate, like half-width "ka").
-//
-// In Japanese, "normalized" means that half-width and full-width katakana is
-// appropriately converted to hiragana.
-int GetNormalizedCodePoint(char32_t codepoint,
-                           char32_t next_codepoint,
-                           bool *next_is_consumed);
-
-// Pushes Utf8 expression of "codepoint" to "dst". Returns true when successful.
-// If input is invalid or the length of the destination is not enough,
-// returns false.
-bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index);
-
-// Creates a "phonetically sortable" Utf8 string and push it into "dst".
-// *dst must be freed after being used outside.
-// If "src" is NULL or its length is 0, "dst" is set to \uFFFF.
-//
-// Note that currently this function considers only Japanese.
-bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len);
-
-// Creates a "normalized" Utf8 string and push it into "dst". *dst must be
-// freed after being used outside.
-// If "src" is NULL or its length is 0, "dst" is set to \uFFFF.
-//
-// Note that currently this function considers only Japanese.
-bool GetNormalizedString(const char *src, char **dst, size_t *len);
-
-}  // namespace android
-
-#endif
diff --git a/android/PhoneticStringUtilsTest.cpp b/android/PhoneticStringUtilsTest.cpp
deleted file mode 100644
index 9885823..0000000
--- a/android/PhoneticStringUtilsTest.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "PhoneticStringUtils.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <utils/String8.h>
-
-using namespace android;
-
-class TestExecutor {
- public:
-  TestExecutor() : m_total_count(0), m_success_count(0), m_success(true) {}
-  bool DoAllTests();
- private:
-  void DoOneTest(void (TestExecutor::*test)());
-
-  void testUtf32At();
-  void testGetUtf8FromUtf32();
-  void testGetNormalizedString();
-  void testLongString();
-
-  // Note: When adding a test, do not forget to add it to DoOneTest().
-
-  int m_total_count;
-  int m_success_count;
-
-  bool m_success;
-};
-
-#define ASSERT_EQ_VALUE(input, expected)                                \
-  ({                                                                    \
-    if ((expected) != (input)) {                                        \
-      printf("0x%X(result) != 0x%X(expected)\n", input, expected);      \
-      m_success = false;                                                \
-      return;                                                           \
-    }                                                                   \
-  })
-
-#define EXPECT_EQ_VALUE(input, expected)                                \
-  ({                                                                    \
-    if ((expected) != (input)) {                                        \
-      printf("0x%X(result) != 0x%X(expected)\n", input, expected);      \
-      m_success = false;                                                \
-    }                                                                   \
-  })
-
-
-bool TestExecutor::DoAllTests() {
-  DoOneTest(&TestExecutor::testUtf32At);
-  DoOneTest(&TestExecutor::testGetUtf8FromUtf32);
-  DoOneTest(&TestExecutor::testGetNormalizedString);
-  DoOneTest(&TestExecutor::testLongString);
-
-  printf("Test total: %d\nSuccess: %d\nFailure: %d\n",
-         m_total_count, m_success_count, m_total_count - m_success_count);
-
-  bool success = m_total_count == m_success_count;
-  printf("\n%s\n", success ? "Success" : "Failure");
-
-  return success;
-}
-
-void TestExecutor::DoOneTest(void (TestExecutor::*test)()) {
-  m_success = true;
-
-  (this->*test)();
-
-  ++m_total_count;
-  m_success_count += m_success ? 1 : 0;
-}
-
-#define TEST_GET_UTF32AT(src, index, expected_next, expected_value)     \
-  ({                                                                    \
-    size_t next;                                                        \
-    int32_t ret = utf32_from_utf8_at(src, strlen(src), index, &next);   \
-    if (ret < 0) {                                                      \
-      printf("getUtf32At() returned negative value (src: %s, index: %d)\n", \
-             (src), (index));                                           \
-      m_success = false;                                                \
-    } else if (next != (expected_next)) {                               \
-      printf("next is unexpected value (src: %s, actual: %u, expected: %u)\n", \
-             (src), next, (expected_next));                             \
-    } else {                                                            \
-      EXPECT_EQ_VALUE(ret, (expected_value));                           \
-    }                                                                   \
-   })
-
-void TestExecutor::testUtf32At() {
-  printf("testUtf32At()\n");
-
-  TEST_GET_UTF32AT("a", 0, 1, 97);
-  // Japanese hiragana "a"
-  TEST_GET_UTF32AT("\xE3\x81\x82", 0, 3, 0x3042);
-  // Japanese fullwidth katakana "a" with ascii a
-  TEST_GET_UTF32AT("a\xE3\x82\xA2", 1, 4, 0x30A2);
-
-  // 2 PUA
-  TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 0, 4, 0xFE000);
-  TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 4, 8, 0xFE008);
-}
-
-
-#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected)                   \
-  ({                                                                    \
-    char32_t codepoints[1] = {codepoint};                                \
-    status_t ret = string8.setTo(codepoints, 1);                        \
-    if (ret != NO_ERROR) {                                              \
-      printf("GetUtf8FromCodePoint() returned false at 0x%04X\n", codepoint); \
-      m_success = false;                                                \
-    } else {                                                            \
-      const char* string = string8.string();                            \
-      if (strcmp(string, expected) != 0) {                              \
-        printf("Failed at codepoint 0x%04X\n", codepoint);              \
-        for (const char *ch = string; *ch != '\0'; ++ch) {              \
-          printf("0x%X ", *ch);                                         \
-        }                                                               \
-        printf("!= ");                                                  \
-        for (const char *ch = expected; *ch != '\0'; ++ch) {            \
-          printf("0x%X ", *ch);                                         \
-        }                                                               \
-        printf("\n");                                                   \
-        m_success = false;                                              \
-      }                                                                 \
-    }                                                                   \
-  })
-
-void TestExecutor::testGetUtf8FromUtf32() {
-  printf("testGetUtf8FromUtf32()\n");
-  String8 string8;
-
-  EXPECT_EQ_CODEPOINT_UTF8('a', "\x61");
-  // Armenian capital letter AYB (2 bytes in UTF8)
-  EXPECT_EQ_CODEPOINT_UTF8(0x0530, "\xD4\xB0");
-  // Japanese 'a' (3 bytes in UTF8)
-  EXPECT_EQ_CODEPOINT_UTF8(0x3042, "\xE3\x81\x82");
-  // Kanji
-  EXPECT_EQ_CODEPOINT_UTF8(0x65E5, "\xE6\x97\xA5");
-  // PUA (4 byets in UTF8)
-  EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96");
-  EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2");
-}
-
-#define EXPECT_EQ_UTF8_UTF8(src, expected)                              \
-  ({                                                                    \
-    if (!GetNormalizedString(src, &dst, &len)) {                        \
-      printf("GetNormalizedSortableString() returned false.\n");      \
-      m_success = false;                                                \
-    } else {                                                            \
-      if (strcmp(dst, expected) != 0) {                                 \
-        for (const char *ch = dst; *ch != '\0'; ++ch) {                 \
-          printf("0x%X ", *ch);                                         \
-        }                                                               \
-        printf("!= ");                                                  \
-        for (const char *ch = expected; *ch != '\0'; ++ch) {            \
-          printf("0x%X ", *ch);                                         \
-        }                                                               \
-        printf("\n");                                                   \
-        m_success = false;                                              \
-      }                                                                 \
-      free(dst);                                                        \
-    }                                                                   \
-   })
-
-void TestExecutor::testGetNormalizedString() {
-  printf("testGetNormalizedString()\n");
-  char *dst;
-  size_t len;
-
-  // halfwidth alphabets/symbols -> keep it as is.
-  EXPECT_EQ_UTF8_UTF8("ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()",
-                      "ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()");
-  EXPECT_EQ_UTF8_UTF8("abcdefghijklmnopqrstuvwxyz[]{}\\@/",
-                      "abcdefghijklmnopqrstuvwxyz[]{}\\@/");
-
-  // halfwidth/fullwidth-katakana -> hiragana
-  EXPECT_EQ_UTF8_UTF8(
-      "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA",
-      "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A");
-
-  // whitespace -> keep it as is.
-  EXPECT_EQ_UTF8_UTF8("    \t", "    \t");
-}
-
-void TestExecutor::testLongString() {
-  printf("testLongString()\n");
-  char * dst;
-  size_t len;
-  EXPECT_EQ_UTF8_UTF8("Qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqtttttttttttttttttttttttttttttttttttttttttttttttttgggggggggggggggggggggggggggggggggggggggbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
-      "Qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqtttttttttttttttttttttttttttttttttttttttttttttttttggggggggggggggggggggggggggggggggggg");
-}
-
-
-int main() {
-  TestExecutor executor;
-  if(executor.DoAllTests()) {
-    return 0;
-  } else {
-    return 1;
-  }
-}
diff --git a/android/sqlite3_android.cpp b/android/sqlite3_android.cpp
index 5daf15e..fe826fd 100644
--- a/android/sqlite3_android.cpp
+++ b/android/sqlite3_android.cpp
@@ -509,14 +509,8 @@
 
 
     //// PHONEBOOK_COLLATOR
-    // The collator may be removed in the near future. Do not depend on it.
-    // TODO: it might be better to have another function for registering phonebook collator.
     status = U_ZERO_ERROR;
-    if (strcmp(systemLocale, "ja") == 0 || strcmp(systemLocale, "ja_JP") == 0) {
-        collator = ucol_open("ja@collation=phonebook", &status);
-    } else {
-        collator = ucol_open(systemLocale, &status);
-    }
+    collator = ucol_open(systemLocale, &status);
     if (U_FAILURE(status)) {
         return -1;
     }