Merge commit 'goog/readonly-korg-master' into merge_korg_master
diff --git a/android/Android.mk b/android/Android.mk
index 9a6efc7..44d77b6 100644
--- a/android/Android.mk
+++ b/android/Android.mk
@@ -11,7 +11,6 @@
         external/icu4c/i18n \
         external/icu4c/common
 
-
 LOCAL_MODULE:= libsqlite3_android
 
 include $(BUILD_STATIC_LIBRARY)
@@ -27,6 +26,33 @@
 	PhoneticStringUtils.cpp \
 	PhoneticStringUtilsTest.cpp
 
-LOCAL_MODULE_TAGS := tests optional
+LOCAL_MODULE_TAGS := optional
+
+LOCAL_SHARED_LIBRARIES := \
+	libutils
+
+include $(BUILD_EXECUTABLE)
+
+# Test for PhoneNumberUtils
+#
+# You can also test this in Unix, like this:
+# > g++ -Wall external/sqlite/android/PhoneNumberUtils.cpp \
+#   external/sqlite/android/PhoneNumberUtilsTest.cpp
+# > ./a.out
+#
+# Note: tests related to PHONE_NUMBERS_EQUAL also exist in AndroidTests in
+# java space. Add tests if you modify this.
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE:= libsqlite3_phone_number_utils_test
+
+LOCAL_CFLAGS += -Wall -Werror
+
+LOCAL_SRC_FILES := \
+	PhoneNumberUtils.cpp \
+	PhoneNumberUtilsTest.cpp
+
+LOCAL_MODULE_TAGS := optional
 
 include $(BUILD_EXECUTABLE)
diff --git a/android/PhoneNumberUtils.cpp b/android/PhoneNumberUtils.cpp
index 9e5e470..cb8552e 100644
--- a/android/PhoneNumberUtils.cpp
+++ b/android/PhoneNumberUtils.cpp
@@ -1,293 +1,383 @@
-/* //device/vmlibs-android/com.android.internal.telephony/PhoneNumberUtils.java
-**
-** Copyright 2006, The Android Open Source Project
-**
-** Licensed under the Apache License, Version 2.0 (the "License"); 
-** you may not use this file except in compliance with the License. 
-** You may obtain a copy of the License at 
-**
-**     http://www.apache.org/licenses/LICENSE-2.0 
-**
-** Unless required by applicable law or agreed to in writing, software 
-** distributed under the License is distributed on an "AS IS" BASIS, 
-** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-** See the License for the specific language governing permissions and 
-** limitations under the License.
-*/
+/*
+ * Copyright 2009, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 #include <string.h>
 
 namespace android {
 
-static int MIN_MATCH = 5;
+/* Generated by the following Python script. Values of country calling codes
+   are from http://en.wikipedia.org/wiki/List_of_country_calling_codes
 
-/** True if c is ISO-LATIN characters 0-9 */
-static bool isISODigit (char c)
-{
-    return c >= '0' && c <= '9';
-}
+#!/usr/bin/python
+import sys
+ccc_set_2digits = set([0, 1, 7,
+                       20, 27, 28, 30, 31, 32, 33, 34, 36, 39, 40, 43, 44, 45,
+                       46, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61,
+                       62, 63, 64, 65, 66, 81, 82, 83, 84, 86, 89, 90, 91, 92,
+                       93, 94, 95, 98])
 
-/** True if c is ISO-LATIN characters 0-9, *, # , +  */
-static bool isNonSeparator(char c)
-{
-    return (c >= '0' && c <= '9') || c == '*' || c == '#' || c == '+';
+ONE_LINE_NUM = 10
+
+for i in xrange(100):
+  if i % ONE_LINE_NUM == 0:
+    sys.stdout.write('    ')
+  if i in ccc_set_2digits:
+    included = 'true'
+  else:
+    included = 'false'
+  sys.stdout.write(included + ',')
+  if ((i + 1) % ONE_LINE_NUM) == 0:
+    sys.stdout.write('\n')
+  else:
+    sys.stdout.write(' ')
+*/
+static bool two_length_country_code_map[100] = {
+    true, true, false, false, false, false, false, true, false, false,
+    false, false, false, false, false, false, false, false, false, false,
+    true, false, false, false, false, false, false, true, true, false,
+    true, true, true, true, true, false, true, false, false, true,
+    true, false, false, true, true, true, true, true, true, true,
+    false, true, true, true, true, true, true, true, true, false,
+    true, true, true, true, true, true, true, false, false, false,
+    false, false, false, false, false, false, false, false, false, false,
+    false, true, true, true, true, false, true, false, false, true,
+    true, true, true, true, true, true, false, false, true, false,
+};
+
+#define ARRAY_SIZE(a) (sizeof(a)/sizeof((a)[0]))
+
+/**
+ * Returns true if "ccc_candidate" expresses (part of) some country calling
+ * code.
+ * Returns false otherwise.
+ */
+static bool isCountryCallingCode(int ccc_candidate) {
+    return ccc_candidate > 0 &&
+            ccc_candidate < (int)ARRAY_SIZE(two_length_country_code_map) &&
+            two_length_country_code_map[ccc_candidate];
 }
 
 /**
- * Phone numbers are stored in "lookup" form in the database
- * as reversed strings to allow for caller ID lookup
- *
- * This method takes a phone number and makes a valid SQL "LIKE"
- * string that will match the lookup form 
- *
+ * Returns the integer corresponding to the input if input "ch" is
+ * ISO-LATIN characters 0-9.
+ * Returns -1 otherwise
  */
-/** all of a up to len must be an international prefix or 
- *  separators/non-dialing digits
- */
-static bool matchIntlPrefix(const char* a, int len) 
+static int tryGetISODigit (char ch)
 {
-    /* '([^0-9*#+]\+[^0-9*#+] | [^0-9*#+]0(0|11)[^0-9*#+] )$' */
-    /*        0    1                     2 3 45               */ 
-
-    int state = 0;
-    for (int i = 0 ; i < len ; i++) {
-        char c = a[i];
-
-        switch (state) {
-            case 0: 
-                if      (c == '+') state = 1;
-                else if (c == '0') state = 2;
-                else if (isNonSeparator(c)) return false;
-            break;
-
-            case 2:
-                if      (c == '0') state = 3;
-                else if (c == '1') state = 4;
-                else if (isNonSeparator(c)) return false;
-            break;
-
-            case 4:
-                if      (c == '1') state = 5;
-                else if (isNonSeparator(c)) return false;
-            break;
-
-            default:
-                if (isNonSeparator(c)) return false;
-            break;
-
-        }
+    if ('0' <= ch && ch <= '9') {
+        return ch - '0';
+    } else {
+        return -1;
     }
-
-    return state == 1 || state == 3 || state == 5;
 }
 
-/** all of 'a' up to len must match non-US trunk prefix ('0') */
-static bool matchTrunkPrefix(const char* a, int len)
+/** True if c is ISO-LATIN characters 0-9, *, # , +  */
+static bool isNonSeparator(char ch)
 {
-    bool found;
+    return ('0' <= ch && ch <= '9') || ch == '*' || ch == '#' || ch == '+';
+}
 
-    found = false;
-
-    for (int i = 0 ; i < len ; i++) {
-        char c = a[i];
-
-        if (c == '0' && !found) {
-            found = true;
-        } else if (isNonSeparator(c)) {
+/**
+ * Try to store the pointer to "new_ptr" which does not have trunk prefix.
+ *
+ * Currently this function simply ignores the first digit, assuming it is
+ * the trunk prefix. Actually, the trunk prefix differs in each country.
+ *
+ * e.g.
+ * "+79161234567" equals "89161234567" (Russian trunk digit is 8)
+ * "+33123456789" equals "0123456789" (French trunk digit is 0)
+ *
+ */
+static bool tryGetTrunkPrefixOmittedStr(const char *str, size_t len,
+                                        const char **new_ptr, size_t *new_len)
+{
+    for (size_t i = 0 ; i < len ; i++) {
+        char ch = str[i];
+        if (tryGetISODigit(ch) >= 0) {
+            if (new_ptr != NULL) {
+                *new_ptr = str + i + 1;
+            }
+            if (new_len != NULL) {
+                *new_len = len - (i + 1);
+            }
+            return true;
+        } else if (isNonSeparator(ch)) {
             return false;
         }
     }
-    
-    return found;
+
+    return false;
 }
 
-/** all of 'a' up to len must be a (+|00|011)country code) 
- *  We're fast and loose with the country code. Any \d{1,3} matches */
-static bool matchIntlPrefixAndCC(const char* a, int len)
+/*
+ * Note that this function does not strictly handle country calling codes of
+ * length 3 (like Morocco: +212), assuming it is enough to use the first two
+ * digits to compare two phone numbers.
+ */
+static int tryGetCountryCallingCode(const char *str, size_t len,
+                                    const char **new_ptr, size_t *new_len)
 {
-    /*  [^0-9*#+]*(\+|0(0|11)\d\d?\d? [^0-9*#+] $ */
-    /*      0       1 2 3 45  6 7  8              */
+    // Rough regexp:
+    //  ^[^0-9*#+]*((\+|0(0|11)\d\d?|166) [^0-9*#+] $
+    //         0        1 2 3 45  6 7  89
+    //
+    // In all the states, this function ignores separator characters.
+    // "166" is the special case for the call from Thailand to the US. Ugu!
 
     int state = 0;
-    for (int i = 0 ; i < len ; i++ ) {
-        char c = a[i];
-
+    int ccc = 0;
+    for (size_t i = 0 ; i < len ; i++ ) {
+        char ch = str[i];
         switch (state) {
             case 0:
-                if      (c == '+') state = 1;
-                else if (c == '0') state = 2;
-                else if (isNonSeparator(c)) return false;
+                if      (ch == '+') state = 1;
+                else if (ch == '0') state = 2;
+                else if (ch == '1') state = 8;
+                else if (isNonSeparator(ch)) return -1;
             break;
 
             case 2:
-                if      (c == '0') state = 3;
-                else if (c == '1') state = 4;
-                else if (isNonSeparator(c)) return false;
+                if      (ch == '0') state = 3;
+                else if (ch == '1') state = 4;
+                else if (isNonSeparator(ch)) return -1;
             break;
 
             case 4:
-                if      (c == '1') state = 5;
-                else if (isNonSeparator(c)) return false;
+                if      (ch == '1') state = 5;
+                else if (isNonSeparator(ch)) return -1;
             break;
 
             case 1:
             case 3:
             case 5:
-                if      (isISODigit(c)) state = 6;
-                else if (isNonSeparator(c)) return false;
-            break;
-
             case 6:
             case 7:
-                if      (isISODigit(c)) state++;
-                else if (isNonSeparator(c)) return false;
-            break;
-
+                {
+                    int ret = tryGetISODigit(ch);
+                    if (ret > 0) {
+                        ccc = ccc * 10 + ret;
+                        if (ccc >= 100 || isCountryCallingCode(ccc)) {
+                            if (new_ptr != NULL) {
+                                *new_ptr = str + i + 1;
+                            }
+                            if (new_len != NULL) {
+                                *new_len = len - (i + 1);
+                            }
+                            return ccc;
+                        }
+                        if (state == 1 || state == 3 || state == 5) {
+                            state = 6;
+                        } else {
+                            state++;
+                        }
+                    } else if (isNonSeparator(ch)) {
+                        return -1;
+                    }
+                }
+                break;
+            case 8:
+                if (ch == '6') state = 9;
+                else if (isNonSeparator(ch)) return -1;
+                break;
+            case 9:
+                if (ch == '6') {
+                    if (new_ptr != NULL) {
+                        *new_ptr = str + i + 1;
+                    }
+                    if (new_len != NULL) {
+                        *new_len = len - (i + 1);
+                    }
+                    return 66;
+                }
+                break;
             default:
-                if (isNonSeparator(c)) return false;
+                return -1;
         }
     }
 
-    return state == 6 || state == 7 || state == 8;
-}
-
-/** or -1 if both are negative */
-static int minPositive(int a, int b)
-{
-    if (a >= 0 && b >= 0) {
-        return (a < b) ? a : b; 
-    } else if (a >= 0) { /* && b < 0 */
-        return a;
-    } else if (b >= 0) { /* && a < 0 */
-        return b;
-    } else { /* a < 0 && b < 0 */ 
-        return -1;
-    }
+    return -1;
 }
 
 /**
- * Return the offset into a of the first appearance of b, or -1 if there
- * is no such character in a.
+ * Return true if the prefix of "ch" is "ignorable". Here, "ignorable" means
+ * that "ch" has at most one digit plus separator characters. That one digit
+ * is assumed to be the trunk prefix.
  */
-static int indexOf(const char *a, char b) {
-    char *ix = strchr(a, b);
+static bool checkPrefixIsIgnorable(const char* ch, int i) {
+    bool trunk_prefix_was_read = false;
+    while (i >= 0) {
+        if (tryGetISODigit(ch[i]) >= 0) {
+            if (trunk_prefix_was_read) {
+                // More than one digit appeared, meaning that "a" and "b"
+                // are different.
+                return false;
+            } else {
+                // Ignore just one digit, assuming it is trunk prefix.
+                trunk_prefix_was_read = true;
+            }
+        } else if (isNonSeparator(ch[i])) {
+            // Trunk prefix is a digit, not "*", "#"...
+            return false;
+        }
+        i--;
+    }
 
-    if (ix == NULL)
-        return -1;
-    else
-        return ix - a;
+    return true;
 }
 
 /**
  * Compare phone numbers a and b, return true if they're identical
  * enough for caller ID purposes.
  *
- * - Compares from right to left
- * - requires MIN_MATCH (5) characters to match
- * - handles common trunk prefixes and international prefixes 
- *   (basically, everything except the Russian trunk prefix)
+ * Assume NULL as 0-length string.
  *
- * Tolerates nulls
+ * Detailed information:
+ * Currently (as of 2009-06-12), we cannot depend on the locale given from the
+ * OS. For example, current Android does not accept "en_JP", meaning
+ * "the display language is English but the phone should be in Japan", but
+ * en_US, es_US, etc. So we cannot identify which digit is valid trunk prefix
+ * in the country where the phone is used. More specifically, "880-1234-1234"
+ * is not valid phone number in Japan since the trunk prefix in Japan is not 8
+ * but 0 (correct number should be "080-1234-1234"), while Russian trunk prefix
+ * is 8. Also, we cannot know whether the country where users live has trunk
+ * prefix itself. So, we cannot determine whether "+81-80-1234-1234" is NOT
+ * same as "880-1234-1234" (while "+81-80-1234-1234" is same as "080-1234-1234"
+ * and we can determine "880-1234-1234" is different from "080-1234-1234").
+ *
+ * In the future, we should handle trunk prefix more correctly, but as of now,
+ * we just ignore it...
  */
 bool phone_number_compare(const char* a, const char* b)
 {
-    int ia, ib;
-    int matched;
-
-    if (a == NULL || b == NULL) {
-        return false; 
+    size_t len_a = 0;
+    size_t len_b = 0;
+    if (a == NULL) {
+        a = "";
+    } else {
+        len_a = strlen(a);
+    }
+    if (b == NULL) {
+        b = "";
+    } else {
+        len_b = strlen(b);
     }
 
-    ia = strlen(a);
-    ib = strlen(b);
-    if (ia == 0 || ib == 0) {
-        return false;
+    const char* tmp_a = NULL;
+    const char* tmp_b = NULL;
+    size_t tmp_len_a = len_a;
+    size_t tmp_len_b = len_b;
+
+    int ccc_a = tryGetCountryCallingCode(a, len_a, &tmp_a, &tmp_len_a);
+    int ccc_b = tryGetCountryCallingCode(b, len_b, &tmp_b, &tmp_len_b);
+    bool ok_to_ignore_prefix = true;
+    if (ccc_a >= 0 && ccc_b >= 0) {
+        if (ccc_a != ccc_b) {
+            // Different Country Calling Code. Must be different phone number.
+            return false;
+        }
+        // When both have ccc, do not ignore trunk prefix. Without this,
+        // "+81123123" becomes same as "+810123123" (+81 == Japan)
+        ok_to_ignore_prefix = false;
+    } else if (ccc_a < 0 && ccc_b < 0) {
+        // When both do not have ccc, do not ignore trunk prefix. Without this,
+        // "123123" becomes same as "0123123"
+        ok_to_ignore_prefix = false;
+    } else {
+        if (ccc_a < 0) {
+            tryGetTrunkPrefixOmittedStr(a, len_a, &tmp_a, &tmp_len_a);
+        }
+        if (ccc_b < 0) {
+            tryGetTrunkPrefixOmittedStr(b, len_b, &tmp_b, &tmp_len_b);
+        }
     }
 
-    // Compare from right to left
-    ia--;
-    ib--;
+    if (tmp_a != NULL) {
+        a = tmp_a;
+        len_a = tmp_len_a;
+    }
+    if (tmp_b != NULL) {
+        b = tmp_b;
+        len_b = tmp_len_b;
+    }
 
-    matched = 0;
-
-    while (ia >= 0 && ib >=0) {
-        char ca, cb;
-        bool skipCmp = false;
-
-        ca = a[ia];
-
-        if (!isNonSeparator(ca)) {
-            ia--;
-            skipCmp = true;
+    int i_a = len_a - 1;
+    int i_b = len_b - 1;
+    while (i_a >= 0 && i_b >= 0) {
+        bool skip_compare = false;
+        char ch_a = a[i_a];
+        char ch_b = b[i_b];
+        if (!isNonSeparator(ch_a)) {
+            i_a--;
+            skip_compare = true;
+        }
+        if (!isNonSeparator(ch_b)) {
+            i_b--;
+            skip_compare = true;
         }
 
-        cb = b[ib];
-
-        if (!isNonSeparator(cb)) {
-            ib--;
-            skipCmp = true;
-        }
-
-        if (!skipCmp) {
-            if (cb != ca) {
-                break;
+        if (!skip_compare) {
+            if (ch_a != ch_b) {
+                return false;
             }
-            ia--; ib--; matched++;
+            i_a--;
+            i_b--;
         }
     }
 
-    if (matched < MIN_MATCH) {
-        int aLen = strlen(a);
-        
-        // if the input strings match, but their lengths < MIN_MATCH, 
-        // treat them as equal.
-        if (aLen == (int)strlen(b) && aLen == matched) {
-            return true;
+    if (ok_to_ignore_prefix) {
+        if (!checkPrefixIsIgnorable(a, i_a)) {
+            return false;
         }
-        return false;
+        if (!checkPrefixIsIgnorable(b, i_b)) {
+            return false;
+        }
+    } else {
+        // In the US, 1-650-555-1234 must be equal to 650-555-1234,
+        // while 090-1234-1234 must not be equal to 90-1234-1234 in Japan.
+        // This request exists just in US (with 1 trunk (NDD) prefix).
+        //
+        // At least, in this "rough" comparison, we should ignore the prefix
+        // '1', so if the remaining non-separator character is the digit 1,
+        // we ignore it just once.
+        bool may_be_namp = true;
+        while (i_a >= 0) {
+            const char ch_a = a[i_a];
+            if (isNonSeparator(ch_a)) {
+                if (may_be_namp && tryGetISODigit(ch_a) == 1) {
+                    may_be_namp = false;
+                } else {
+                    return false;
+                }
+            }
+            i_a--;
+        }
+        while (i_b >= 0) {
+            const char ch_b = b[i_b];
+            if (isNonSeparator(ch_b)) {
+                if (may_be_namp && tryGetISODigit(ch_b) == 1) {
+                    may_be_namp = false;
+                } else {
+                    return false;
+                }
+            }
+            i_b--;
+        }
     }
 
-    // At least one string has matched completely;
-    if (matched >= MIN_MATCH && (ia < 0 || ib < 0)) {
-        return true;
-    }
-
-    /*
-     * Now, what remains must be one of the following for a 
-     * match:
-     *
-     *  - a '+' on one and a '00' or a '011' on the other
-     *  - a '0' on one and a (+,00)<country code> on the other
-     *     (for this, a '0' and a '00' prefix would have succeeded above)
-     */
-
-    if (matchIntlPrefix(a, ia + 1) && matchIntlPrefix(b, ib +1)) {
-        return true;
-    }
-
-    if (matchTrunkPrefix(a, ia + 1) && matchIntlPrefixAndCC(b, ib +1)) {
-        return true;
-    }
-
-    if (matchTrunkPrefix(b, ib + 1) && matchIntlPrefixAndCC(a, ia +1)) {
-        return true;
-    }
-
-    /*
-     * Last resort: if the number of unmatched characters on both sides is less than or equal
-     * to the length of the longest country code and only one number starts with a + accept
-     * the match. This is because some countries like France and Russia have an extra prefix
-     * digit that is used when dialing locally in country that does not show up when you dial
-     * the number using the country code. In France this prefix digit is used to determine
-     * which land line carrier to route the call over.
-     */
-    bool aPlusFirst = (*a == '+');
-    bool bPlusFirst = (*b == '+');
-    if (ia < 4 && ib < 4 && (aPlusFirst || bPlusFirst) && !(aPlusFirst && bPlusFirst)) {
-        return true;
-    }
-
-    return false;
+    return true;
 }
 
 } // namespace android
diff --git a/android/PhoneNumberUtils.h b/android/PhoneNumberUtils.h
index 1a5720f..8f350a7 100644
--- a/android/PhoneNumberUtils.h
+++ b/android/PhoneNumberUtils.h
@@ -19,7 +19,7 @@
 #define _ANDROID_PHONE_NUMBER_UTILS_H
 
 namespace android {
-    
+
 bool phone_number_compare(const char* a, const char* b);
 
 }
diff --git a/android/PhoneNumberUtilsTest.cpp b/android/PhoneNumberUtilsTest.cpp
new file mode 100644
index 0000000..6772fe5
--- /dev/null
+++ b/android/PhoneNumberUtilsTest.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Note that similar (or almost same) tests exist in Java side (See
+ * DatabaseGeneralTest.java in AndroidTests). The differences are:
+ * - this test is quite easy to do (You can do it in your Unix PC)
+ * - this test is not automatically executed by build servers
+ *
+ * You should also execute the test before submitting this.
+ */
+
+#include "PhoneNumberUtils.h"
+
+#include <stdio.h>
+#include <string.h>
+
+using namespace android;
+
+#define EXPECT(function, input1, input2, expected, total, error)        \
+    ({                                                                  \
+        const char *i1_cache = input1;                                  \
+        const char *i2_cache = input2;                                  \
+        (total)++;                                                      \
+        if ((expected) != (function)((i1_cache), (i2_cache))) {         \
+            if (expected) {                                             \
+                printf("%s != %s while we expect %s == %s\n",           \
+                       (i1_cache), (i2_cache), (i1_cache), (i2_cache)); \
+            } else {                                                    \
+                printf("%s == %s while we expect %s != %s\n",           \
+                       (i1_cache), (i2_cache), (i1_cache), (i2_cache)); \
+            }                                                           \
+            (error)++;                                                  \
+        }                                                               \
+    })
+
+#define EXPECT_EQ(input1, input2)                                       \
+    EXPECT(phone_number_compare, (input1), (input2), true,              \
+           (total), (error))
+
+
+#define EXPECT_NE(input1, input2)                                       \
+    EXPECT(phone_number_compare, (input1), (input2), false,             \
+           (total), (error))
+
+int main() {
+    int total = 0;
+    int error = 0;
+
+    EXPECT_EQ(NULL, NULL);
+    EXPECT_EQ("", NULL);
+    EXPECT_EQ(NULL, "");
+    EXPECT_EQ("", "");
+
+    EXPECT_EQ("999", "999");
+    EXPECT_EQ("119", "119");
+
+    EXPECT_NE("123456789", "923456789");
+    EXPECT_NE("123456789", "123456781");
+    EXPECT_NE("123456789", "1234567890");
+    EXPECT_NE("123456789", "0123456789");
+
+    // Google, Inc.
+    EXPECT_EQ("650-253-0000", "6502530000");
+    EXPECT_EQ("650-253-0000", "650 253 0000");
+    EXPECT_EQ("650 253 0000", "6502530000");
+
+    // trunk (NDD) prefix must be properly handled in US
+    EXPECT_EQ("650-253-0000", "1-650-253-0000");
+    EXPECT_EQ("650-253-0000", "   1-650-253-0000");
+    EXPECT_NE("650-253-0000", "11-650-253-0000");
+    EXPECT_NE("650-253-0000", "0-650-253-0000");
+
+    EXPECT_EQ("+1 650-253-0000", "6502530000");
+    EXPECT_EQ("001 650-253-0000", "6502530000");
+    EXPECT_EQ("0111 650-253-0000", "6502530000");
+
+    // Country code is different.
+    EXPECT_NE("+19012345678", "+819012345678");
+
+    // Russian trunk digit
+    EXPECT_EQ("+79161234567", "89161234567");
+
+    // French trunk digit
+    EXPECT_EQ("+33123456789", "0123456789");
+
+    // Trunk digit for city codes in the Netherlands
+    EXPECT_EQ("+31771234567", "0771234567");
+
+    // Japanese dial
+    EXPECT_EQ("090-1234-5678", "+819012345678");
+    EXPECT_EQ("090(1234)5678", "+819012345678");
+    EXPECT_EQ("090-1234-5678", "+81-90-1234-5678");
+
+    // Trunk prefix must not be ignored in Japan
+    EXPECT_NE("090-1234-5678", "90-1234-5678");
+
+    EXPECT_NE("090-1234-5678", "080-1234-5678");
+    EXPECT_NE("090-1234-5678", "190-1234-5678");
+    EXPECT_NE("090-1234-5678", "890-1234-5678");
+    EXPECT_NE("+81-90-1234-5678", "+81-090-1234-5678");
+
+    EXPECT_EQ("+593(800)123-1234", "8001231234");
+
+    // Two consecutive 0s at the beginning of the phone string should not be
+    // treated as a trunk prefix.
+    EXPECT_NE("008001231234", "8001231234");
+
+    // Test broken caller ID seen on call from Thailand to the US
+    EXPECT_EQ("+66811234567", "166811234567");
+
+    // Confirm that the bug found before does not re-appear.
+    EXPECT_NE("080-1234-5678", "+819012345678");
+
+    // Currently we cannot get this test to pass (the Japanese trunk prefix is
+    // 0, but there is no sensible way to know it now (as of 2009-6-12))...
+    // EXPECT_NE("290-1234-5678", "+819012345678");
+
+    printf("total: %d, error: %d\n\n", total, error);
+    if (error == 0) {
+        printf("Success!\n");
+    } else {
+        printf("Failure... :(\n");
+    }
+}
diff --git a/android/PhoneticStringUtils.cpp b/android/PhoneticStringUtils.cpp
index 5f8781c..da5767f 100644
--- a/android/PhoneticStringUtils.cpp
+++ b/android/PhoneticStringUtils.cpp
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 
 #include "PhoneticStringUtils.h"
+#include <utils/String8.h>
 
 // We'd like 0 length string last of sorted list. So when input string is NULL
 // or 0 length string, we use these instead.
@@ -30,59 +31,158 @@
 
 namespace android {
 
-int GetCodePointFromUtf8(const char *src, size_t len, size_t index, int *next) {
-    if (src == NULL || len <= index) {
-        return -1;
+// Get hiragana from halfwidth katakana.
+static int GetHiraganaFromHalfwidthKatakana(char32_t codepoint,
+                                            char32_t next_codepoint,
+                                            bool *next_is_consumed) {
+    if (codepoint < 0xFF66 || 0xFF9F < codepoint) {
+        return codepoint;
     }
 
-    if ((src[index] >> 7) == 0) {
-        if (next != NULL) {
-            *next = index + 1;
-        }
-        return src[index];
-    }
-    if ((src[index] & 64) == 0) {
-        return -1;
-    }
-    int mask;
-    size_t num_to_read;
-    for (num_to_read = 1, mask = 64;  // 01000000
-         num_to_read < 7 && (src[index] & mask) == mask;
-         num_to_read++, mask >>= 1) {
-    }
-    if (num_to_read == 7) {
-        return -1;
-    }
-
-    if (num_to_read + index > len) {
-        return -1;
-    }
-
-    {
-        size_t i;
-        for (i = 0, mask = 0; i < (7 - num_to_read); i++) {
-            mask = (mask << 1) + 1;
-        }
-    }
-
-    int codepoint = mask & src[index];
-
-    for (size_t i = 1; i < num_to_read; i++) {
-        if ((src[i + index] & 192) != 128) {  // must be 10xxxxxx
-            return -1;
-        }
-        codepoint = (codepoint << 6) + (src[i + index] & 63);
-    }
-
-    if (next != NULL) {
-        *next = index + num_to_read;
+    switch (codepoint) {
+        case 0xFF66: // wo
+            return 0x3092;
+        case 0xFF67: // xa
+            return 0x3041;
+        case 0xFF68: // xi
+            return 0x3043;
+        case 0xFF69: // xu
+            return 0x3045;
+        case 0xFF6A: // xe
+            return 0x3047;
+        case 0xFF6B: // xo
+            return 0x3049;
+        case 0xFF6C: // xya
+            return 0x3083;
+        case 0xFF6D: // xyu
+            return 0x3085;
+        case 0xFF6E: // xyo
+            return 0x3087;
+        case 0xFF6F: // xtsu
+            return 0x3063;
+        case 0xFF70: // -
+            return 0x30FC;
+        case 0xFF9C: // wa
+            return 0x308F;
+        case 0xFF9D: // n
+            return 0x3093;
+            break;
+        default:   {
+            if (0xFF71 <= codepoint && codepoint <= 0xFF75) {
+                // a, i, u, e, o
+                if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) {
+                    if (next_is_consumed != NULL) {
+                        *next_is_consumed = true;
+                    }
+                    return 0x3094; // vu
+                } else {
+                    return 0x3042 + (codepoint - 0xFF71) * 2;
+                }
+            } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) {
+                // ka - chi
+                if (next_codepoint == 0xFF9E) {
+                    // "dakuten" (voiced mark)
+                    if (next_is_consumed != NULL) {
+                        *next_is_consumed = true;
+                    }
+                    return 0x304B + (codepoint - 0xFF76) * 2 + 1;
+                } else {
+                    return 0x304B + (codepoint - 0xFF76) * 2;
+                }
+            } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) {
+                // tsu, te, to (skip xtsu)
+                if (next_codepoint == 0xFF9E) {
+                    // "dakuten" (voiced mark)
+                    if (next_is_consumed != NULL) {
+                        *next_is_consumed = true;
+                    }
+                    return 0x3064 + (codepoint - 0xFF82) * 2 + 1;
+                } else {
+                    return 0x3064 + (codepoint - 0xFF82) * 2;
+                }
+            } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) {
+                // na, ni, nu, ne, no
+                return 0x306A + (codepoint - 0xFF85);
+            } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) {
+                // ha, hi, hu, he, ho
+                if (next_codepoint == 0xFF9E) {
+                    // "dakuten" (voiced mark)
+                    if (next_is_consumed != NULL) {
+                        *next_is_consumed = true;
+                    }
+                    return 0x306F + (codepoint - 0xFF8A) * 3 + 1;
+                } else if (next_codepoint == 0xFF9F) {
+                    // "han-dakuten" (half voiced mark)
+                    if (next_is_consumed != NULL) {
+                        *next_is_consumed = true;
+                    }
+                    return 0x306F + (codepoint - 0xFF8A) * 3 + 2;
+                } else {
+                    return 0x306F + (codepoint - 0xFF8A) * 3;
+                }
+            } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) {
+                // ma, mi, mu, me, mo
+                return 0x307E + (codepoint - 0xFF8F);
+            } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) {
+                // ya, yu, yo
+                return 0x3084 + (codepoint - 0xFF94) * 2;
+            } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) {
+                // ra, ri, ru, re, ro
+                return 0x3089 + (codepoint - 0xFF97);
+            }
+            // Note: 0xFF9C, 0xFF9D are handled above
+        } // end of default
     }
 
     return codepoint;
 }
 
-int GetPhoneticallySortableCodePoint(int codepoint,
-                                     int next_codepoint,
+// Assuming input is hiragana, convert the hiragana to "normalized" hiragana.
+static int GetNormalizedHiragana(int codepoint) {
+    if (codepoint < 0x3040 || 0x309F < codepoint) {
+        return codepoint;
+    }
+
+    // TODO: should handle (semi-)voiced marks (0x3099, 0x309A).
+
+    // Trivial kana conversions.
+    // e.g. xa => a
+    switch (codepoint) {
+        case 0x3041:
+        case 0x3043:
+        case 0x3045:
+        case 0x3047:
+        case 0x3049:
+        case 0x308E: // xwa
+            return codepoint + 1;
+        case 0x3095: // xka
+            return 0x304B;
+        case 0x3096: // xku
+            return 0x304F;
+        default:
+            return codepoint;
+    }
+}
+
+static int GetNormalizedKana(char32_t codepoint,
+                             char32_t next_codepoint,
+                             bool *next_is_consumed) {
+    // First, convert fullwidth katakana and halfwidth katakana to hiragana.
+    if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
+        // Make fullwidth katakana same as hiragana.
+        // 96 == 0x30A1 - 0x3041
+        codepoint = codepoint - 96;
+    } else {
+        codepoint = GetHiraganaFromHalfwidthKatakana(
+                codepoint, next_codepoint, next_is_consumed);
+    }
+
+    // Normalize Hiragana.
+    return GetNormalizedHiragana(codepoint);
+}
+
+int GetPhoneticallySortableCodePoint(char32_t codepoint,
+                                     char32_t next_codepoint,
                                      bool *next_is_consumed) {
     if (next_is_consumed != NULL) {
         *next_is_consumed = false;
@@ -149,208 +249,42 @@
 
     // Below is Kana-related handling.
 
-    // First, convert fullwidth katakana and halfwidth katakana to hiragana
-    if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
-        // Make fullwidth katakana same as hiragana.
-        // 96 == 0x30A1 - 0x3041c
-        codepoint = codepoint - 96;
-    } else if (0xFF66 <= codepoint && codepoint <= 0xFF9F) {
-        // Make halfwidth katakana same as hiragana
-        switch (codepoint) {
-            case 0xFF66: // wo
-                codepoint = 0x3092;
-                break;
-            case 0xFF67: // xa
-                codepoint = 0x3041;
-                break;
-            case 0xFF68: // xi
-                codepoint = 0x3043;
-                break;
-            case 0xFF69: // xu
-                codepoint = 0x3045;
-                break;
-            case 0xFF6A: // xe
-                codepoint = 0x3047;
-                break;
-            case 0xFF6B: // xo
-                codepoint = 0x3049;
-                break;
-            case 0xFF6C: // xya
-                codepoint = 0x3083;
-                break;
-            case 0xFF6D: // xyu
-                codepoint = 0x3085;
-                break;
-            case 0xFF6E: // xyo
-                codepoint = 0x3087;
-                break;
-            case 0xFF6F: // xtsu
-                codepoint = 0x3063;
-                break;
-            case 0xFF70: // -
-                codepoint = 0x30FC;
-                break;
-            case 0xFF9C: // wa
-                codepoint = 0x308F;
-                break;
-            case 0xFF9D: // n
-                codepoint = 0x3093;
-                break;
-            default:
-                {
-                    if (0xFF71 <= codepoint && codepoint <= 0xFF75) {
-                        // a, i, u, e, o
-                        if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) {
-                            if (next_is_consumed != NULL) {
-                                *next_is_consumed = true;
-                            }
-                            codepoint = 0x3094; // vu
-                        } else {
-                            codepoint = 0x3042 + (codepoint - 0xFF71) * 2;
-                        }
-                    } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) {
-                        // ka - chi
-                        if (next_codepoint == 0xFF9E) {
-                            // "dakuten" (voiced mark)
-                            if (next_is_consumed != NULL) {
-                                *next_is_consumed = true;
-                            }
-                            codepoint = 0x304B + (codepoint - 0xFF76) * 2 + 1;
-                        } else {
-                            codepoint = 0x304B + (codepoint - 0xFF76) * 2;
-                        }
-                    } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) {
-                        // tsu, te, to (skip xtsu)
-                        if (next_codepoint == 0xFF9E) {
-                            // "dakuten" (voiced mark)
-                            if (next_is_consumed != NULL) {
-                                *next_is_consumed = true;
-                            }
-                            codepoint = 0x3064 + (codepoint - 0xFF82) * 2 + 1;
-                        } else {
-                            codepoint = 0x3064 + (codepoint - 0xFF82) * 2;
-                        }
-                    } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) {
-                        // na, ni, nu, ne, no
-                        codepoint = 0x306A + (codepoint - 0xFF85);
-                    } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) {
-                        // ha, hi, hu, he, ho
-                        if (next_codepoint == 0xFF9E) {
-                            // "dakuten" (voiced mark)
-                            if (next_is_consumed != NULL) {
-                                *next_is_consumed = true;
-                            }
-                            codepoint = 0x306F + (codepoint - 0xFF8A) * 3 + 1;
-                        } else if (next_codepoint == 0xFF9F) {
-                            // "han-dakuten" (half voiced mark)
-                            if (next_is_consumed != NULL) {
-                                *next_is_consumed = true;
-                            }
-                            codepoint = 0x306F + (codepoint - 0xFF8A) * 3 + 2;
-                        } else {
-                            codepoint = 0x306F + (codepoint - 0xFF8A) * 3;
-                        }
-                    } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) {
-                        // ma, mi, mu, me, mo
-                        codepoint = 0x307E + (codepoint - 0xFF8F);
-                    } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) {
-                        // ya, yu, yo
-                        codepoint = 0x3084 + (codepoint - 0xFF94) * 2;
-                    } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) {
-                        // ra, ri, ru, re, ro
-                        codepoint = 0x3089 + (codepoint - 0xFF97);
-                    }
-                    // Note: 0xFF9C, 0xFF9D are handled above
-                } // end of default
-        } // end of case
-    }
-
-    // Trivial kana conversions.
-    // e.g. xa => a
-    switch (codepoint) {
-        case 0x3041:
-        case 0x3043:
-        case 0x3045:
-        case 0x3047:
-        case 0x3049:
-        case 0x308E: // xwa
-            codepoint++;
-            break;
-        case 0x3095: // xka
-            codepoint = 0x304B;
-            break;
-        case 0x3096: // xku
-            codepoint = 0x304F;
-            break;
-    }
-
-    return codepoint;
+    return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
 }
 
-bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index) {
-    if (codepoint < 128) {  // 1 << 7
-        if (*index >= len) {
-            return false;
-        }
-        // 0xxxxxxx
-        dst[*index] = static_cast<char>(codepoint);
-        (*index)++;
-    } else if (codepoint < 2048) {  // 1 << (6 + 5)
-        if (*index + 1 >= len) {
-            return false;
-        }
-        // 110xxxxx
-        dst[(*index)++] = static_cast<char>(192 | (codepoint >> 6));
-        // 10xxxxxx
-        dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
-    } else if (codepoint < 65536) {  // 1 << (6 * 2 + 4)
-        if (*index + 2 >= len) {
-            return false;
-        }
-        // 1110xxxx
-        dst[(*index)++] = static_cast<char>(224 | (codepoint >> 12));
-        // 10xxxxxx
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
-        dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
-    } else if (codepoint < 2097152) {  // 1 << (6 * 3 + 3)
-        if (*index + 3 >= len) {
-            return false;
-        }
-        // 11110xxx
-        dst[(*index)++] = static_cast<char>(240 | (codepoint >> 18));
-        // 10xxxxxx
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
-        dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
-    } else if (codepoint < 67108864) {  // 1 << (6 * 2 + 2)
-        if (*index + 4 >= len) {
-            return false;
-        }
-        // 111110xx
-        dst[(*index)++] = static_cast<char>(248 | (codepoint >> 24));
-        // 10xxxxxx
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
-        dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
-    } else {
-        if (*index + 5 >= len) {
-            return false;
-        }
-        // 1111110x
-        dst[(*index)++] = static_cast<char>(252 | (codepoint >> 30));
-        // 10xxxxxx
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 24) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
-        dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
+int GetNormalizedCodePoint(char32_t codepoint,
+                           char32_t next_codepoint,
+                           bool *next_is_consumed) {
+    if (next_is_consumed != NULL) {
+        *next_is_consumed = false;
     }
-    return true;
+
+    if (codepoint <= 0x0020 || codepoint == 0x3000) {
+        // Whitespaces. Keep it as is.
+        return codepoint;
+    } else if ((0x0021 <= codepoint && codepoint <= 0x007E) ||
+               (0xFF01 <= codepoint && codepoint <= 0xFF5E)) {
+        // Ascii and fullwidth ascii. Keep it as is
+        return codepoint;
+    } else if (codepoint == 0x02DC || codepoint == 0x223C) {
+        // tilde
+        return 0xFF5E;
+    } else if (codepoint <= 0x3040 ||
+               (0x3100 <= codepoint && codepoint < 0xFF00) ||
+               codepoint == CODEPOINT_FOR_NULL_STR) {
+        // Keep it as is.
+        return codepoint;
+    }
+
+    // Below is Kana-related handling.
+
+    return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
 }
 
-bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len){
-    if (dst == NULL || len == NULL) {
+static bool GetExpectedString(
+    const char *src, char **dst, size_t *dst_len,
+    int (*get_codepoint_function)(char32_t, char32_t, bool*)) {
+    if (dst == NULL || dst_len == NULL) {
         return false;
     }
 
@@ -358,100 +292,63 @@
         src = STR_FOR_NULL_STR;
     }
 
-    size_t src_len = strlen(src);
-    int codepoints[MAX_CODEPOINTS];
-    size_t new_len = 0;
+    char32_t codepoints[MAX_CODEPOINTS];
 
-    size_t codepoint_index;
-    {
-        int i, next;
-        for (codepoint_index = 0, i = 0, next = 0;
-             static_cast<size_t>(i) < src_len &&
-                     codepoint_index < MAX_CODEPOINTS;
-             i = next) {
-            int codepoint = GetCodePointFromUtf8(src, src_len, i, &next);
-            if (codepoint <= 0) {
-                return false;
-            }
-            int tmp_next;
-            int next_codepoint = GetCodePointFromUtf8(src, src_len,
-                                                      next, &tmp_next);
-            bool next_is_consumed = false;
-
-            // It is ok even if next_codepoint is negative.
-            codepoints[codepoint_index] =
-                    GetPhoneticallySortableCodePoint(codepoint,
-                                                     next_codepoint,
-                                                     &next_is_consumed);
-            // dakuten (voiced mark) or han-dakuten (half-voiced mark) existed.
-            if (next_is_consumed) {
-                next = tmp_next;
-            }
-
-            if (codepoints[codepoint_index] < 0) {
-              // Do not increment codepoint_index.
-              continue;
-            }
-
-            if (codepoints[codepoint_index] < 128) {  // 1 << 7
-                new_len++;
-            } else if (codepoints[codepoint_index] < 2048) {
-                // 1 << (6 + 5)
-                new_len += 2;
-            } else if (codepoints[codepoint_index] < 65536) {
-                // 1 << (6 * 2 + 4)
-                new_len += 3;
-            } else if (codepoints[codepoint_index] < 2097152) {
-                // 1 << (6 * 3 + 3)
-                new_len += 4;
-            } else if (codepoints[codepoint_index] < 67108864) {
-                // 1 << (6 * 2 + 2)
-                new_len += 5;
-            } else {
-                new_len += 6;
-            }
-
-            codepoint_index++;
+    size_t src_len = utf8_length(src);
+    if (src_len == 0) {
+        return false;
+    }
+    bool next_is_consumed;
+    size_t j = 0;
+    for (size_t i = 0; i < src_len && j < MAX_CODEPOINTS;) {  // stop before codepoints[] overflows
+        int32_t ret = utf32_at(src, src_len, i, &i);
+        if (ret < 0) {
+            // failed to parse UTF-8
+            return false;
+        }
+        ret = get_codepoint_function(
+                static_cast<char32_t>(ret),
+                i + 1 < src_len ? codepoints[i + 1] : 0,  // NOTE(review): reads the output buffer, not src - confirm
+                &next_is_consumed);
+        if (ret > 0) {
+            codepoints[j] = static_cast<char32_t>(ret);
+            j++;
+        }
+        if (next_is_consumed) {
+            i++;
         }
     }
+    size_t length = j;
 
-    if (codepoint_index == 0) {
+    if (length == 0) {
         // If all of codepoints are invalid, we place the string at the end of
         // the list.
         codepoints[0] = 0x10000 + CODEPOINT_FOR_NULL_STR;
-        codepoint_index = 1;
-        new_len = 4;
+        length = 1;
     }
 
-    new_len += 1;  // For '\0'.
-
-    *dst = static_cast<char *>(malloc(sizeof(char) * new_len));
+    size_t new_len = utf8_length_from_utf32(codepoints, length);
+    *dst = static_cast<char *>(malloc(new_len + 1));
     if (*dst == NULL) {
         return false;
     }
 
-    size_t ch_index;
-    {
-        size_t i;
-        for (i = 0, ch_index = 0; i < codepoint_index; i++) {
-            if (!GetUtf8FromCodePoint(codepoints[i], *dst,
-                                      new_len, &ch_index)) {
-                free(*dst);
-                *dst = NULL;
-                return false;
-            }
-        }
-    }
-
-    if (ch_index != new_len - 1) {
+    if (utf32_to_utf8(codepoints, length, *dst, new_len + 1) != new_len) {
         free(*dst);
         *dst = NULL;
         return false;
     }
 
-    (*dst)[new_len - 1] = '\0';
-    *len = new_len;
+    *dst_len = new_len;
     return true;
 }
 
+bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len) {
+    return GetExpectedString(src, dst, len, GetPhoneticallySortableCodePoint);
+}
+
+bool GetNormalizedString(const char *src, char **dst, size_t *len) {
+    return GetExpectedString(src, dst, len, GetNormalizedCodePoint);
+}
+
 }  // namespace android
diff --git a/android/PhoneticStringUtils.h b/android/PhoneticStringUtils.h
index 7ebf9e0..9da7d29 100644
--- a/android/PhoneticStringUtils.h
+++ b/android/PhoneticStringUtils.h
@@ -18,6 +18,7 @@
 #define _ANDROID_PHONETIC_STRING_UTILS_H
 
 #include <string.h>  // For size_t.
+#include <utils/String8.h>
 
 namespace android {
 
@@ -31,10 +32,23 @@
 // is "consumed" (e.g. Japanese halfwidth katakana's voiced mark is consumed
 // when previous "codepoint" is appropriate). If the codepoint should not be
 // considered when sorting (e.g. whitespaces), -1 is returned.
-int GetPhoneticallySortableCodePoint(int codepoint,
-                                     int next_codepoint,
+int GetPhoneticallySortableCodePoint(char32_t codepoint,
+                                     char32_t next_codepoint,
                                      bool *next_is_consumed);
 
+// Returns the "normalized" form of a codepoint; what "normalized" means
+// depends on the locale. Note that currently this function normalizes only
+// Japanese; all other characters are left as is.
+// The variable "next_is_consumed" is set to true if "next_codepoint"
+// is "consumed" (e.g. Japanese halfwidth katakana's voiced mark is consumed
+// when previous "codepoint" is appropriate, like half-width "ka").
+//
+// In Japanese, "normalized" means that half-width and full-width katakana are
+// appropriately converted to hiragana.
+int GetNormalizedCodePoint(char32_t codepoint,
+                           char32_t next_codepoint,
+                           bool *next_is_consumed);
+
 // Pushes Utf8 expression of "codepoint" to "dst". Returns true when successful.
 // If input is invalid or the length of the destination is not enough,
 // returns false.
@@ -47,6 +61,13 @@
 // Note that currently this function considers only Japanese.
 bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len);
 
+// Creates a "normalized" UTF-8 string and stores it in "dst". The caller must
+// free *dst once it is no longer needed.
+// If "src" is NULL or its length is 0, "dst" is set to \uFFFF.
+//
+// Note that currently this function considers only Japanese.
+bool GetNormalizedString(const char *src, char **dst, size_t *len);
+
 }  // namespace android
 
 #endif
diff --git a/android/PhoneticStringUtilsTest.cpp b/android/PhoneticStringUtilsTest.cpp
index 0541007..356342e 100644
--- a/android/PhoneticStringUtilsTest.cpp
+++ b/android/PhoneticStringUtilsTest.cpp
@@ -20,6 +20,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include <utils/String8.h>
+
 using namespace android;
 
 class TestExecutor {
@@ -29,13 +31,14 @@
  private:
   void DoOneTest(void (TestExecutor::*test)());
 
-  void testGetCodePointFromUtf8();
+  void testUtf32At();
   void testGetPhoneticallySortableCodePointAscii();
   void testGetPhoneticallySortableCodePointKana();
   void testGetPhoneticallySortableCodePointWhitespaceOnly();
   void testGetPhoneticallySortableCodePointSimpleCompare();
-  void testGetUtf8FromCodePoint();
+  void testGetUtf8FromUtf32();
   void testGetPhoneticallySortableString();
+  void testGetNormalizedString();
 
   // Note: When adding a test, do not forget to add it to DoOneTest().
 
@@ -64,13 +67,14 @@
 
 
 bool TestExecutor::DoAllTests() {
-  DoOneTest(&TestExecutor::testGetCodePointFromUtf8);
+  DoOneTest(&TestExecutor::testUtf32At);
   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointAscii);
   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointKana);
   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly);
   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare);
-  DoOneTest(&TestExecutor::testGetUtf8FromCodePoint);
+  DoOneTest(&TestExecutor::testGetUtf8FromUtf32);
   DoOneTest(&TestExecutor::testGetPhoneticallySortableString);
+  DoOneTest(&TestExecutor::testGetNormalizedString);
 
   printf("Test total: %d\nSuccess: %d\nFailure: %d\n",
          m_total_count, m_success_count, m_total_count - m_success_count);
@@ -90,36 +94,45 @@
   m_success_count += m_success ? 1 : 0;
 }
 
-void TestExecutor::testGetCodePointFromUtf8() {
-  printf("testGetCodePointFromUtf8()\n");
-  int next;
+#define TEST_GET_UTF32AT(src, index, expected_next, expected_value)     \
+  ({ /* decode one codepoint at byte `index`; check its value and the next index */ \
+    size_t next; /* receives the index just past the decoded codepoint */ \
+    int32_t ret = utf32_at((src), strlen(src), (index), &next);         \
+    if (ret < 0) {                                                      \
+      printf("getUtf32At() returned negative value (src: %s, index: %d)\n", \
+             (src), (index));                                           \
+      m_success = false;                                                \
+    } else if (next != static_cast<size_t>(expected_next)) {            \
+      printf("next is unexpected value (src: %s, actual: %u, expected: %u)\n", \
+             (src), static_cast<unsigned>(next), static_cast<unsigned>(expected_next)); \
+      m_success = false; /* a wrong next index must fail the test, not just log */ \
+    } else {                                                            \
+      EXPECT_EQ_VALUE(ret, (expected_value));                           \
+    }                                                                   \
+   })
 
-  EXPECT_EQ_VALUE(GetCodePointFromUtf8("a", 1, 0, &next), 97);
-  EXPECT_EQ_VALUE(next, 1);
+void TestExecutor::testUtf32At() {
+  printf("testUtf32At()\n");
+
+  TEST_GET_UTF32AT("a", 0, 1, 97);
   // Japanese hiragana "a"
-  EXPECT_EQ_VALUE(GetCodePointFromUtf8("\xE3\x81\x82", 3, 0, &next), 0x3042);
-  EXPECT_EQ_VALUE(next, 3);
+  TEST_GET_UTF32AT("\xE3\x81\x82", 0, 3, 0x3042);
   // Japanese fullwidth katakana "a" with ascii a
-  EXPECT_EQ_VALUE(GetCodePointFromUtf8("a\xE3\x82\xA2", 4, 1, &next), 0x30A2);
-  EXPECT_EQ_VALUE(next, 4);
+  TEST_GET_UTF32AT("a\xE3\x82\xA2", 1, 4, 0x30A2);
 
   // 2 PUA
-  ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88",
-                                       8, 0, &next), 0xFE000);
-  ASSERT_EQ_VALUE(next, 4);
-  ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88",
-                                       8, next, &next), 0xFE008);
-  ASSERT_EQ_VALUE(next, 8);
+  TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 0, 4, 0xFE000);
+  TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 4, 8, 0xFE008);
 }
 
 void TestExecutor::testGetPhoneticallySortableCodePointAscii() {
   printf("testGetPhoneticallySortableCodePoint()\n");
   int halfwidth[94];
   int fullwidth[94];
-  int i, codepoint;
+  int i;
+  char32_t codepoint;
   bool next_is_consumed;
   for (i = 0, codepoint = 0x0021; codepoint <= 0x007E; ++i, ++codepoint) {
-    halfwidth[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
+    halfwidth[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
                                                     &next_is_consumed);
     if (halfwidth[i] < 0) {
       printf("returned value become negative at 0x%04X", codepoint);
@@ -133,7 +146,7 @@
     }
   }
   for (i = 0, codepoint = 0xFF01; codepoint <= 0xFF5E; ++i, ++codepoint) {
-    fullwidth[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
+    fullwidth[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
                                                     &next_is_consumed);
     if (fullwidth[i] < 0) {
       printf("returned value become negative at 0x%04X", codepoint);
@@ -156,11 +169,12 @@
   printf("testGetPhoneticallySortableCodePointKana()\n");
   int hiragana[86];
   int fullwidth_katakana[86];
-  int i, codepoint;
+  int i;
+  char32_t codepoint;
   bool next_is_consumed;
 
   for (i = 0, codepoint = 0x3041; codepoint <= 0x3096; ++i, ++codepoint) {
-    hiragana[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
+    hiragana[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
                                                    &next_is_consumed);
     if (hiragana[i] < 0) {
       printf("returned value become negative at 0x%04X", codepoint);
@@ -175,7 +189,7 @@
   }
 
   for (i = 0, codepoint = 0x30A1; codepoint <= 0x30F6; ++i, ++codepoint) {
-    fullwidth_katakana[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
+    fullwidth_katakana[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
                                                    &next_is_consumed);
     if (fullwidth_katakana[i] < 0) {
       printf("returned value become negative at 0x%04X", codepoint);
@@ -192,7 +206,7 @@
   // hankaku-katakana space do not have some characters corresponding to
   // zenkaku-hiragana (e.g. xwa, xka, xku). To make test easier, insert
   // zenkaku-katakana version of them into this array (See the value 0x30??).
-  int halfwidth_katakana[] = {
+  char32_t halfwidth_katakana[] = {
     0xFF67, 0xFF71, 0xFF68, 0xFF72, 0xFF69, 0xFF73, 0xFF6A, 0xFF74, 0xFF6B,
     0xFF75, 0xFF76, 0xFF76, 0xFF9E, 0xFF77, 0xFF77, 0xFF9E, 0xFF78, 0xFF78,
     0xFF9E, 0xFF79, 0xFF79, 0xFF9E, 0xFF7A, 0xFF7A, 0xFF9E, 0xFF7B, 0xFF7B,
@@ -212,8 +226,8 @@
 
   int j;
   for (i = 0, j = 0; i < len && j < 86; ++i, ++j) {
-    int codepoint = halfwidth_katakana[i];
-    int next_codepoint = i + 1 < len ? halfwidth_katakana[i + 1] : -1;
+    char32_t codepoint = halfwidth_katakana[i];
+    char32_t next_codepoint = i + 1 < len ? halfwidth_katakana[i + 1] : 0;
     halfwidth_katakana_result[j] =
         GetPhoneticallySortableCodePoint(codepoint, next_codepoint,
                                          &next_is_consumed);
@@ -232,7 +246,7 @@
 }
 
 void TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly() {
-  printf("testGetPhoneticallySortableCodePointWhitespaceOnly");
+  printf("testGetPhoneticallySortableCodePointWhitespaceOnly()\n");
   // Halfwidth space
   int result = GetPhoneticallySortableCodePoint(0x0020, 0x0061, NULL);
   ASSERT_EQ_VALUE(result, -1);
@@ -247,7 +261,7 @@
 void TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare() {
   printf("testGetPhoneticallySortableCodePointSimpleCompare()\n");
 
-  int codepoints[] = {
+  char32_t codepoints[] = {
     0x3042, 0x30AB, 0xFF7B, 0x305F, 0x30CA, 0xFF8A, 0x30D0, 0x3071,
     0x307E, 0x30E4, 0xFF97, 0x308F, 0x3093, 0x3094, 'A', 'Z',
     '0', '9', '!', '/', ':', '?', '[', '`', '{', '~'};
@@ -255,7 +269,7 @@
   bool next_is_consumed;
   for (size_t i = 0; i < len - 1; ++i) {
     int codepoint_a =
-        GetPhoneticallySortableCodePoint(codepoints[i], -1,
+        GetPhoneticallySortableCodePoint(codepoints[i], 0,
                                          &next_is_consumed);
     if (next_is_consumed) {
       printf("next_is_consumed become true at 0x%04X", codepoint_a);
@@ -263,7 +277,7 @@
       return;
     }
     int codepoint_b =
-        GetPhoneticallySortableCodePoint(codepoints[i + 1], -1,
+        GetPhoneticallySortableCodePoint(codepoints[i + 1], 0,
                                          &next_is_consumed);
     if (next_is_consumed) {
       printf("next_is_consumed become true at 0x%04X", codepoint_b);
@@ -280,20 +294,18 @@
   }
 }
 
-#define EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, i)     \
+#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected)                   \
   ({                                                                    \
-    index = i;                                                          \
-    if (!GetUtf8FromCodePoint(codepoint, dst, 10, &index)) {            \
+    char32_t codepoints[1] = {codepoint};                                \
+    status_t ret = string8.setTo(codepoints, 1);                        \
+    if (ret != NO_ERROR) {                                              \
       printf("GetUtf8FromCodePoint() returned false at 0x%04X\n", codepoint); \
       m_success = false;                                                \
-    } else if (index >= 10) {                                           \
-      printf("index (%d) >= 10\n", index);                              \
-      m_success = false;                                                \
     } else {                                                            \
-      dst[index] = '\0';                                                \
-      if (strcmp(dst + i, expected) != 0) {                             \
+      const char* string = string8.string();                            \
+      if (strcmp(string, expected) != 0) {                              \
         printf("Failed at codepoint 0x%04X\n", codepoint);              \
-        for (const char *ch = dst; *ch != '\0'; ++ch) {                 \
+        for (const char *ch = string; *ch != '\0'; ++ch) {              \
           printf("0x%X ", *ch);                                         \
         }                                                               \
         printf("!= ");                                                  \
@@ -306,14 +318,9 @@
     }                                                                   \
   })
 
-#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected)          \
-  EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, 0)
-
-
-void TestExecutor::testGetUtf8FromCodePoint() {
-  printf("testGetUtf8FromCodePoint()\n");
-  size_t index = 0;
-  char dst[10];
+void TestExecutor::testGetUtf8FromUtf32() {
+  printf("testGetUtf8FromUtf32()\n");
+  String8 string8;
 
   EXPECT_EQ_CODEPOINT_UTF8('a', "\x61");
   // Armenian capital letter AYB (2 bytes in UTF8)
@@ -325,15 +332,6 @@
   // PUA (4 byets in UTF8)
   EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96");
   EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2");
-
-  EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(0x058F, "\xD6\x8F", 3);
-
-  index = 0;
-  if (GetUtf8FromCodePoint(0x3043, dst, 2, &index)) {
-    printf("GetUtf8FromCodePont() returned true even when destination length"
-           "is not enough\n");
-    m_success = false;
-  }
 }
 
 #define EXPECT_EQ_UTF8_UTF8(src, expected)                              \
@@ -358,6 +356,7 @@
    })
 
 void TestExecutor::testGetPhoneticallySortableString() {
+  printf("testGetPhoneticallySortableString()\n");
   char *dst;
   size_t len;
 
@@ -373,6 +372,49 @@
   EXPECT_EQ_UTF8_UTF8("    \t", "\xF0\x9F\xBF\xBD");
 }
 
+#undef EXPECT_EQ_UTF8_UTF8
+
+#define EXPECT_EQ_UTF8_UTF8(src, expected)                              \
+  ({ /* normalize src, compare with expected, dump both as hex on mismatch */ \
+    if (!GetNormalizedString(src, &dst, &len)) {                        \
+      printf("GetNormalizedString() returned false.\n");                \
+      m_success = false;                                                \
+    } else {                                                            \
+      if (strcmp(dst, expected) != 0) {                                 \
+        for (const char *ch = dst; *ch != '\0'; ++ch) {                 \
+          printf("0x%X ", static_cast<unsigned char>(*ch));             \
+        }                                                               \
+        printf("!= ");                                                  \
+        for (const char *ch = expected; *ch != '\0'; ++ch) {            \
+          printf("0x%X ", static_cast<unsigned char>(*ch));             \
+        }                                                               \
+        printf("\n");                                                   \
+        m_success = false;                                              \
+      }                                                                 \
+      free(dst); /* dst was allocated by GetNormalizedString() */       \
+    }                                                                   \
+   })
+
+void TestExecutor::testGetNormalizedString() {  // Exercises GetNormalizedString() through EXPECT_EQ_UTF8_UTF8.
+  printf("testGetNormalizedString()\n");
+  char *dst;   // filled in (and freed) by EXPECT_EQ_UTF8_UTF8
+  size_t len;  // also written by the macro; not inspected here
+
+  // Halfwidth alphabets/symbols are expected to come back unchanged.
+  EXPECT_EQ_UTF8_UTF8("ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()",
+                      "ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()");
+  EXPECT_EQ_UTF8_UTF8("abcdefghijklmnopqrstuvwxyz[]{}\\@/",
+                      "abcdefghijklmnopqrstuvwxyz[]{}\\@/");
+
+  // Mixed hiragana a, fullwidth katakana i, hiragana u, halfwidth katakana e, fullwidth katakana o -> hiragana a, i, u, e, o.
+  EXPECT_EQ_UTF8_UTF8(
+      "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA",
+      "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A");
+
+  // Whitespace (spaces and a tab) is expected to be kept as is.
+  EXPECT_EQ_UTF8_UTF8("    \t", "    \t");
+}
+
 int main() {
   TestExecutor executor;
   if(executor.DoAllTests()) {
diff --git a/android/sqlite3_android.cpp b/android/sqlite3_android.cpp
index 27334ef..252a0c5 100644
--- a/android/sqlite3_android.cpp
+++ b/android/sqlite3_android.cpp
@@ -90,6 +90,25 @@
     }
 }
 
+static void get_normalized_string(
+    sqlite3_context * context, int argc, sqlite3_value ** argv)
+{   // Sets the sqlite3 result to android::GetNormalizedString() of the single text argument.
+    if (argc != 1) {  // wrong number of arguments: the result is NULL
+      sqlite3_result_null(context);
+      return;
+    }
+    char const * src = (char const *)sqlite3_value_text(argv[0]);
+    char * ret;   // receives the buffer produced by GetNormalizedString()
+    size_t len;   // receives the length reported for ret
+
+    if (!android::GetNormalizedString(src, &ret, &len)) {
+        // Normalization failed (probably a broken string). Return a 0-length string.
+        sqlite3_result_text(context, "", -1, SQLITE_STATIC);
+    } else {
+        sqlite3_result_text(context, ret, len, free);  // free is passed as the destructor argument for ret
+    }
+}
+
 static void phone_numbers_equal(sqlite3_context * context, int argc, sqlite3_value ** argv)
 {
     if (argc != 2) {
@@ -161,7 +180,11 @@
         sqlite3_result_null(context);
         return;        
     }
-    
+    if (strstr(path, "/../") != NULL) {
+        sqlite3_result_null(context);
+        return;
+    }
+
     int err = unlink(path);
     if (err != -1) {
         // No error occured, return true
@@ -196,23 +219,53 @@
 /**
  * This function is invoked as:
  *
- *  _TOKENIZE('<token_table>', <data_row_id>, <data>, <delimiter>)
+ *  _TOKENIZE('<token_table>', <data_row_id>, <data>, <delimiter>,
+ *             <use_token_index>, <data_tag>)
  *
- * It will then split data on each instance of delimiter and insert each token
- * into token_table's 'token' column with data_row_id in the 'source' column.
+ * If <use_token_index> is omitted, it is treated as 0.
+ * If <data_tag> is omitted, it is treated as NULL.
+ *
+ * It will split <data> on each instance of <delimiter> and insert each token
+ * into <token_table>. The following columns in <token_table> are used:
+ * token TEXT, source INTEGER, token_index INTEGER, tag (any type)
+ * The token_index column is not required if <use_token_index> is 0.
+ * The tag column is not required if <data_tag> is NULL.
+ *
+ * One row is inserted for each token in <data>.
+ * In each inserted row, 'source' is <data_row_id>.
+ * In the first inserted row, 'token' is the hex collation key of the entire
+ * <data> string and, if <use_token_index> != 0, 'token_index' is 0.
+ * In each row I (where 1 <= I < N, and N is the number of tokens in <data>)
+ * 'token' will be set to the hex collation key of the I:th token (0-based).
+ * If <use_token_index> != 0, 'token_index' is set to I.
+ * If <data_tag> is not NULL, 'tag' is set to <data_tag>.
+ *
+ * In other words, there will be one row for the entire string,
+ * and one row for each token except the first one.
+ *
  * The function returns the number of tokens generated.
  */
 static void tokenize(sqlite3_context * context, int argc, sqlite3_value ** argv)
 {
     //LOGD("enter tokenize");
     int err;
+    int useTokenIndex = 0;  // optional argv[4]: non-zero => also bind token_index
+    int useDataTag = 0;     // optional argv[5]: set when <data_tag> is not NULL
 
-    if (argc != 4) {
-        LOGE("Tokenize requires 4 arguments");
+    if (argc < 4 || argc > 6) {  // reject any count outside the 4..6 range
+        LOGE("Tokenize requires 4 to 6 arguments");
         sqlite3_result_null(context);
         return;
     }
 
+    if (argc > 4) {
+        useTokenIndex = sqlite3_value_int(argv[4]);
+    }
+    // A SQL NULL <data_tag> behaves exactly like an omitted one.
+    if (argc > 5) {
+        useDataTag = (sqlite3_value_type(argv[5]) != SQLITE_NULL);
+    }
+
     sqlite3 * handle = sqlite3_context_db_handle(context);
     UCollator* collator = (UCollator*)sqlite3_user_data(context);
     char const * tokenTable = (char const *)sqlite3_value_text(argv[0]);
@@ -225,7 +278,12 @@
     // Get or create the prepared statement for the insertions
     sqlite3_stmt * statement = (sqlite3_stmt *)sqlite3_get_auxdata(context, 0);
     if (!statement) {
-        char * sql = sqlite3_mprintf("INSERT INTO %s (token, source) VALUES (?, ?);", tokenTable);
+        char const * tokenIndexCol = useTokenIndex ? ", token_index" : "";
+        char const * tokenIndexParam = useTokenIndex ? ", ?" : "";
+        char const * dataTagCol = useDataTag ? ", tag" : "";
+        char const * dataTagParam = useDataTag ? ", ?" : "";
+        char * sql = sqlite3_mprintf("INSERT INTO %s (token, source%s%s) VALUES (?, ?%s%s);",
+                tokenTable, tokenIndexCol, dataTagCol, tokenIndexParam, dataTagParam);
         err = sqlite3_prepare_v2(handle, sql, -1, &statement, NULL);
         sqlite3_free(sql);
         if (err) {
@@ -251,6 +309,17 @@
         return;
     }
 
+    // Bind <data_tag> to the tag column
+    if (useDataTag) {
+        int dataTagParamIndex = useTokenIndex ? 4 : 3;
+        err = sqlite3_bind_value(statement, dataTagParamIndex, argv[5]);
+        if (err != SQLITE_OK) {
+            LOGE("bind failed");
+            sqlite3_result_null(context);
+            return;
+        }
+    }
+
     // Get the raw bytes for the string to tokenize
     // the string will be modified by following code
     // however, sqlite did not reuse the string, so it is safe to not dup it
@@ -299,6 +368,15 @@
             break;
         }
 
+        if (useTokenIndex) {
+            err = sqlite3_bind_int(statement, 3, numTokens);
+            if (err != SQLITE_OK) {
+                LOGE(" sqlite3_bind_int error %d", err);
+                free(base16buf);
+                break;
+            }
+        }
+
         err = sqlite3_step(statement);
         free(base16buf);
 
@@ -357,7 +435,15 @@
     err = sqlite3_create_function(handle, "_TOKENIZE", 4, SQLITE_UTF16, collator, tokenize, NULL, NULL);
     if (err != SQLITE_OK) {
         return err;
-    }    
+    }
+    err = sqlite3_create_function(handle, "_TOKENIZE", 5, SQLITE_UTF16, collator, tokenize, NULL, NULL);
+    if (err != SQLITE_OK) {
+        return err;
+    }
+    err = sqlite3_create_function(handle, "_TOKENIZE", 6, SQLITE_UTF16, collator, tokenize, NULL, NULL);
+    if (err != SQLITE_OK) {
+        return err;
+    }
 
     return SQLITE_OK;
 }
@@ -422,5 +508,15 @@
         return err;
     }
 
+    // Register the GET_NORMALIZED_STRING function
+    err = sqlite3_create_function(handle,
+                                  "GET_NORMALIZED_STRING",
+                                  1, SQLITE_UTF8, NULL,
+                                  get_normalized_string,
+                                  NULL, NULL);
+    if (err != SQLITE_OK) {
+        return err;
+    }
+
     return SQLITE_OK;
 }
diff --git a/dist/Android.mk b/dist/Android.mk
index 431533f..0200276 100644
--- a/dist/Android.mk
+++ b/dist/Android.mk
@@ -29,7 +29,8 @@
 LOCAL_C_INCLUDES += $(call include-path-for, system-core)/cutils
 LOCAL_SHARED_LIBRARIES += liblog \
             libicuuc \
-            libicui18n
+            libicui18n \
+            libutils
 
 # include android specific methods
 LOCAL_WHOLE_STATIC_LIBRARIES := libsqlite3_android