Make PhoneticStringUtils use functions in String8 instead of using locally developed functions.

This change depends on https://android-git.corp.google.com/g/Gerrit#change,5510

Internal bug id: 1707173
diff --git a/android/Android.mk b/android/Android.mk
index a9f68da..44d77b6 100644
--- a/android/Android.mk
+++ b/android/Android.mk
@@ -11,7 +11,6 @@
         external/icu4c/i18n \
         external/icu4c/common
 
-
 LOCAL_MODULE:= libsqlite3_android
 
 include $(BUILD_STATIC_LIBRARY)
@@ -29,6 +28,9 @@
 
 LOCAL_MODULE_TAGS := optional
 
+LOCAL_SHARED_LIBRARIES := \
+	libutils
+
 include $(BUILD_EXECUTABLE)
 
 # Test for PhoneNumberUtils
diff --git a/android/PhoneticStringUtils.cpp b/android/PhoneticStringUtils.cpp
index 24b1647..fa32d38 100644
--- a/android/PhoneticStringUtils.cpp
+++ b/android/PhoneticStringUtils.cpp
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 
 #include "PhoneticStringUtils.h"
+#include <utils/String8.h>
 
 // We'd like 0 length string last of sorted list. So when input string is NULL
 // or 0 length string, we use these instead.
@@ -30,60 +31,9 @@
 
 namespace android {
 
-int GetCodePointFromUtf8(const char *src, size_t len, size_t index, int *next) {
-    if (src == NULL || len <= index) {
-        return -1;
-    }
-
-    if ((src[index] >> 7) == 0) {
-        if (next != NULL) {
-            *next = index + 1;
-        }
-        return src[index];
-    }
-    if ((src[index] & 64) == 0) {
-        return -1;
-    }
-    int mask;
-    size_t num_to_read;
-    for (num_to_read = 1, mask = 64;  // 01000000
-         num_to_read < 7 && (src[index] & mask) == mask;
-         num_to_read++, mask >>= 1) {
-    }
-    if (num_to_read == 7) {
-        return -1;
-    }
-
-    if (num_to_read + index > len) {
-        return -1;
-    }
-
-    {
-        size_t i;
-        for (i = 0, mask = 0; i < (7 - num_to_read); i++) {
-            mask = (mask << 1) + 1;
-        }
-    }
-
-    int codepoint = mask & src[index];
-
-    for (size_t i = 1; i < num_to_read; i++) {
-        if ((src[i + index] & 192) != 128) {  // must be 10xxxxxx
-            return -1;
-        }
-        codepoint = (codepoint << 6) + (src[i + index] & 63);
-    }
-
-    if (next != NULL) {
-        *next = index + num_to_read;
-    }
-
-    return codepoint;
-}
-
 // Get hiragana from halfwidth katakana.
-static int GetHiraganaFromHalfwidthKatakana(int codepoint,
-                                            int next_codepoint,
+static int GetHiraganaFromHalfwidthKatakana(char32_t codepoint,
+                                            char32_t next_codepoint,
                                             bool *next_is_consumed) {
     if (codepoint < 0xFF66 || 0xFF9F < codepoint) {
         return codepoint;
@@ -214,8 +164,8 @@
     }
 }
 
-static int GetNormalizedKana(int codepoint,
-                             int next_codepoint,
+static int GetNormalizedKana(char32_t codepoint,
+                             char32_t next_codepoint,
                              bool *next_is_consumed) {
     // First, convert fullwidth katakana and halfwidth katakana to hiragana.
     if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
@@ -231,8 +181,8 @@
     return GetNormalizedHiragana(codepoint);
 }
 
-int GetPhoneticallySortableCodePoint(int codepoint,
-                                     int next_codepoint,
+int GetPhoneticallySortableCodePoint(char32_t codepoint,
+                                     char32_t next_codepoint,
                                      bool *next_is_consumed) {
     if (next_is_consumed != NULL) {
         *next_is_consumed = false;
@@ -302,8 +252,8 @@
     return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
 }
 
-int GetNormalizedCodePoint(int codepoint,
-                           int next_codepoint,
+int GetNormalizedCodePoint(char32_t codepoint,
+                           char32_t next_codepoint,
                            bool *next_is_consumed) {
     if (next_is_consumed != NULL) {
         *next_is_consumed = false;
@@ -331,73 +281,10 @@
     return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
 }
 
-
-bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index) {
-    if (codepoint < 128) {  // 1 << 7
-        if (*index >= len) {
-            return false;
-        }
-        // 0xxxxxxx
-        dst[*index] = static_cast<char>(codepoint);
-        (*index)++;
-    } else if (codepoint < 2048) {  // 1 << (6 + 5)
-        if (*index + 1 >= len) {
-            return false;
-        }
-        // 110xxxxx
-        dst[(*index)++] = static_cast<char>(192 | (codepoint >> 6));
-        // 10xxxxxx
-        dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
-    } else if (codepoint < 65536) {  // 1 << (6 * 2 + 4)
-        if (*index + 2 >= len) {
-            return false;
-        }
-        // 1110xxxx
-        dst[(*index)++] = static_cast<char>(224 | (codepoint >> 12));
-        // 10xxxxxx
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
-        dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
-    } else if (codepoint < 2097152) {  // 1 << (6 * 3 + 3)
-        if (*index + 3 >= len) {
-            return false;
-        }
-        // 11110xxx
-        dst[(*index)++] = static_cast<char>(240 | (codepoint >> 18));
-        // 10xxxxxx
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
-        dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
-    } else if (codepoint < 67108864) {  // 1 << (6 * 2 + 2)
-        if (*index + 4 >= len) {
-            return false;
-        }
-        // 111110xx
-        dst[(*index)++] = static_cast<char>(248 | (codepoint >> 24));
-        // 10xxxxxx
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
-        dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
-    } else {
-        if (*index + 5 >= len) {
-            return false;
-        }
-        // 1111110x
-        dst[(*index)++] = static_cast<char>(252 | (codepoint >> 30));
-        // 10xxxxxx
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 24) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
-        dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
-        dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
-    }
-    return true;
-}
-
 static bool GetExpectedString(
-    const char *src, char **dst, size_t *len,
-    int (*get_codepoint_function)(int, int, bool*)) {
-    if (dst == NULL || len == NULL) {
+    const char *src, char **dst, size_t *dst_len,
+    int (*get_codepoint_function)(char32_t, char32_t, bool*)) {
+    if (dst == NULL || dst_len == NULL) {
         return false;
     }
 
@@ -405,99 +292,55 @@
         src = STR_FOR_NULL_STR;
     }
 
-    size_t src_len = strlen(src);
-    int codepoints[MAX_CODEPOINTS];
-    size_t new_len = 0;
+    char32_t codepoints[MAX_CODEPOINTS];
 
-    size_t codepoint_index;
-    {
-        int i, next;
-        for (codepoint_index = 0, i = 0, next = 0;
-             static_cast<size_t>(i) < src_len &&
-                     codepoint_index < MAX_CODEPOINTS;
-             i = next) {
-            int codepoint = GetCodePointFromUtf8(src, src_len, i, &next);
-            if (codepoint <= 0) {
-                return false;
-            }
-            int tmp_next;
-            int next_codepoint = GetCodePointFromUtf8(src, src_len,
-                                                      next, &tmp_next);
-            bool next_is_consumed = false;
-
-            // It is ok even if next_codepoint is negative.
-            codepoints[codepoint_index] =
-                    get_codepoint_function(codepoint,
-                                           next_codepoint,
-                                           &next_is_consumed);
-            // dakuten (voiced mark) or han-dakuten (half-voiced mark) existed.
-            if (next_is_consumed) {
-                next = tmp_next;
-            }
-
-            if (codepoints[codepoint_index] < 0) {
-              // Do not increment codepoint_index.
-              continue;
-            }
-
-            if (codepoints[codepoint_index] < 128) {  // 1 << 7
-                new_len++;
-            } else if (codepoints[codepoint_index] < 2048) {
-                // 1 << (6 + 5)
-                new_len += 2;
-            } else if (codepoints[codepoint_index] < 65536) {
-                // 1 << (6 * 2 + 4)
-                new_len += 3;
-            } else if (codepoints[codepoint_index] < 2097152) {
-                // 1 << (6 * 3 + 3)
-                new_len += 4;
-            } else if (codepoints[codepoint_index] < 67108864) {
-                // 1 << (6 * 2 + 2)
-                new_len += 5;
-            } else {
-                new_len += 6;
-            }
-
-            codepoint_index++;
+    size_t src_len = GetUtf8LengthOrZero(src);
+    if (src_len == 0) {
+        return false;
+    }
+    bool next_is_consumed;
+    size_t j = 0;
+    for (size_t i = 0; i < src_len;) {
+        int32_t ret = GetUtf32AtFromUtf8(src, src_len, i, &i);
+        if (ret < 0) {
+            // failed to parse UTF-8
+            return false;
+        }
+        ret = get_codepoint_function(
+                static_cast<char32_t>(ret),
+                i + 1 < src_len ? codepoints[i + 1] : 0,
+                &next_is_consumed);
+        if (ret > 0) {
+            codepoints[j] = static_cast<char32_t>(ret);
+            j++;
+        }
+        if (next_is_consumed) {
+            i++;
         }
     }
+    size_t length = j;
 
-    if (codepoint_index == 0) {
+    if (length == 0) {
         // If all of codepoints are invalid, we place the string at the end of
         // the list.
         codepoints[0] = 0x10000 + CODEPOINT_FOR_NULL_STR;
-        codepoint_index = 1;
-        new_len = 4;
+        length = 1;
     }
 
-    new_len += 1;  // For '\0'.
-
-    *dst = static_cast<char *>(malloc(sizeof(char) * new_len));
+    size_t new_len = GetUtf8LengthFromUtf32(codepoints, length);
+    *dst = static_cast<char *>(malloc(new_len + 1));
     if (*dst == NULL) {
         return false;
     }
 
-    size_t ch_index;
-    {
-        size_t i;
-        for (i = 0, ch_index = 0; i < codepoint_index; i++) {
-            if (!GetUtf8FromCodePoint(codepoints[i], *dst,
-                                      new_len, &ch_index)) {
-                free(*dst);
-                *dst = NULL;
-                return false;
-            }
-        }
-    }
-
-    if (ch_index != new_len - 1) {
+    printf("new_len: %u\n", new_len);
+    if (GetUtf8FromUtf32(codepoints, length, *dst, new_len + 1) != new_len) {
         free(*dst);
         *dst = NULL;
         return false;
     }
 
-    (*dst)[new_len - 1] = '\0';
-    *len = new_len;
+    *dst_len = new_len;
     return true;
 }
 
diff --git a/android/PhoneticStringUtils.h b/android/PhoneticStringUtils.h
index 68a4928..9da7d29 100644
--- a/android/PhoneticStringUtils.h
+++ b/android/PhoneticStringUtils.h
@@ -18,6 +18,7 @@
 #define _ANDROID_PHONETIC_STRING_UTILS_H
 
 #include <string.h>  // For size_t.
+#include <utils/String8.h>
 
 namespace android {
 
@@ -31,8 +32,8 @@
 // is "consumed" (e.g. Japanese halfwidth katakana's voiced mark is consumed
 // when previous "codepoint" is appropriate). If the codepoint should not be
 // considered when sorting (e.g. whitespaces), -1 is returned.
-int GetPhoneticallySortableCodePoint(int codepoint,
-                                     int next_codepoint,
+int GetPhoneticallySortableCodePoint(char32_t codepoint,
+                                     char32_t next_codepoint,
                                      bool *next_is_consumed);
 
 // Returns codepoint which is "normalized", whose definition depends on each
@@ -44,8 +45,8 @@
 //
 // In Japanese, "normalized" means that half-width and full-width katakana is
 // appropriately converted to hiragana.
-int GetNormalizedCodePoint(int codepoint,
-                           int next_codepoint,
+int GetNormalizedCodePoint(char32_t codepoint,
+                           char32_t next_codepoint,
                            bool *next_is_consumed);
 
 // Pushes Utf8 expression of "codepoint" to "dst". Returns true when successful.
diff --git a/android/PhoneticStringUtilsTest.cpp b/android/PhoneticStringUtilsTest.cpp
index e74f67f..06a7ba8 100644
--- a/android/PhoneticStringUtilsTest.cpp
+++ b/android/PhoneticStringUtilsTest.cpp
@@ -20,6 +20,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include <utils/String8.h>
+
 using namespace android;
 
 class TestExecutor {
@@ -29,12 +31,12 @@
  private:
   void DoOneTest(void (TestExecutor::*test)());
 
-  void testGetCodePointFromUtf8();
+  void testGetUtf32At();
   void testGetPhoneticallySortableCodePointAscii();
   void testGetPhoneticallySortableCodePointKana();
   void testGetPhoneticallySortableCodePointWhitespaceOnly();
   void testGetPhoneticallySortableCodePointSimpleCompare();
-  void testGetUtf8FromCodePoint();
+  void testGetUtf8FromUtf32();
   void testGetPhoneticallySortableString();
   void testGetNormalizedString();
 
@@ -65,12 +67,12 @@
 
 
 bool TestExecutor::DoAllTests() {
-  DoOneTest(&TestExecutor::testGetCodePointFromUtf8);
+  DoOneTest(&TestExecutor::testGetUtf32At);
   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointAscii);
   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointKana);
   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly);
   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare);
-  DoOneTest(&TestExecutor::testGetUtf8FromCodePoint);
+  DoOneTest(&TestExecutor::testGetUtf8FromUtf32);
   DoOneTest(&TestExecutor::testGetPhoneticallySortableString);
   DoOneTest(&TestExecutor::testGetNormalizedString);
 
@@ -92,26 +94,35 @@
   m_success_count += m_success ? 1 : 0;
 }
 
-void TestExecutor::testGetCodePointFromUtf8() {
-  printf("testGetCodePointFromUtf8()\n");
-  int next;
+#define TEST_GET_UTF32AT(src, index, expected_next, expected_value)     \
+  ({                                                                    \
+    size_t next;                                                        \
+    String8 string8(src);                                               \
+    int32_t ret = string8.getUtf32At((index), &next);                   \
+    if (ret < 0) {                                                      \
+      printf("getUtf32At() returned negative value (src: %s, index: %d)\n", \
+             (src), (index));                                           \
+      m_success = false;                                                \
+    } else if (next != (expected_next)) {                               \
+      printf("next is unexpected value (src: %s, actual: %u, expected: %u)\n", \
+             (src), next, (expected_next));                             \
+    } else {                                                            \
+      EXPECT_EQ_VALUE(ret, (expected_value));                           \
+    }                                                                   \
+   })
 
-  EXPECT_EQ_VALUE(GetCodePointFromUtf8("a", 1, 0, &next), 97);
-  EXPECT_EQ_VALUE(next, 1);
+void TestExecutor::testGetUtf32At() {
+  printf("testGetUtf32At()\n");
+
+  TEST_GET_UTF32AT("a", 0, 1, 97);
   // Japanese hiragana "a"
-  EXPECT_EQ_VALUE(GetCodePointFromUtf8("\xE3\x81\x82", 3, 0, &next), 0x3042);
-  EXPECT_EQ_VALUE(next, 3);
+  TEST_GET_UTF32AT("\xE3\x81\x82", 0, 3, 0x3042);
   // Japanese fullwidth katakana "a" with ascii a
-  EXPECT_EQ_VALUE(GetCodePointFromUtf8("a\xE3\x82\xA2", 4, 1, &next), 0x30A2);
-  EXPECT_EQ_VALUE(next, 4);
+  TEST_GET_UTF32AT("a\xE3\x82\xA2", 1, 4, 0x30A2);
 
   // 2 PUA
-  ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88",
-                                       8, 0, &next), 0xFE000);
-  ASSERT_EQ_VALUE(next, 4);
-  ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88",
-                                       8, next, &next), 0xFE008);
-  ASSERT_EQ_VALUE(next, 8);
+  TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 0, 4, 0xFE000);
+  TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 4, 8, 0xFE008);
 }
 
 void TestExecutor::testGetPhoneticallySortableCodePointAscii() {
@@ -282,20 +293,18 @@
   }
 }
 
-#define EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, i)     \
+#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected)                   \
   ({                                                                    \
-    index = i;                                                          \
-    if (!GetUtf8FromCodePoint(codepoint, dst, 10, &index)) {            \
+    char32_t codepoints[1] = {codepoint};                                \
+    status_t ret = string8.setTo(codepoints, 1);                        \
+    if (ret != NO_ERROR) {                                              \
       printf("GetUtf8FromCodePoint() returned false at 0x%04X\n", codepoint); \
       m_success = false;                                                \
-    } else if (index >= 10) {                                           \
-      printf("index (%d) >= 10\n", index);                              \
-      m_success = false;                                                \
     } else {                                                            \
-      dst[index] = '\0';                                                \
-      if (strcmp(dst + i, expected) != 0) {                             \
+      const char* string = string8.string();                            \
+      if (strcmp(string, expected) != 0) {                              \
         printf("Failed at codepoint 0x%04X\n", codepoint);              \
-        for (const char *ch = dst; *ch != '\0'; ++ch) {                 \
+        for (const char *ch = string; *ch != '\0'; ++ch) {              \
           printf("0x%X ", *ch);                                         \
         }                                                               \
         printf("!= ");                                                  \
@@ -308,14 +317,9 @@
     }                                                                   \
   })
 
-#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected)          \
-  EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, 0)
-
-
-void TestExecutor::testGetUtf8FromCodePoint() {
-  printf("testGetUtf8FromCodePoint()\n");
-  size_t index = 0;
-  char dst[10];
+void TestExecutor::testGetUtf8FromUtf32() {
+  printf("testGetUtf8FromUtf32()\n");
+  String8 string8;
 
   EXPECT_EQ_CODEPOINT_UTF8('a', "\x61");
   // Armenian capital letter AYB (2 bytes in UTF8)
@@ -327,15 +331,6 @@
   // PUA (4 byets in UTF8)
   EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96");
   EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2");
-
-  EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(0x058F, "\xD6\x8F", 3);
-
-  index = 0;
-  if (GetUtf8FromCodePoint(0x3043, dst, 2, &index)) {
-    printf("GetUtf8FromCodePont() returned true even when destination length"
-           "is not enough\n");
-    m_success = false;
-  }
 }
 
 #define EXPECT_EQ_UTF8_UTF8(src, expected)                              \
diff --git a/dist/Android.mk b/dist/Android.mk
index 1d8e1eb..b7aad30 100644
--- a/dist/Android.mk
+++ b/dist/Android.mk
@@ -29,7 +29,8 @@
 LOCAL_C_INCLUDES += $(call include-path-for, system-core)/cutils
 LOCAL_SHARED_LIBRARIES += liblog \
             libicuuc \
-            libicui18n
+            libicui18n \
+            libutils
 
 # include android specific methods
 LOCAL_WHOLE_STATIC_LIBRARIES := libsqlite3_android