Make PhoneticStringUtils use the functions in String8 instead of locally developed functions.
This change depends on https://android-git.corp.google.com/g/Gerrit#change,5510
Internal bug id: 1707173
diff --git a/android/Android.mk b/android/Android.mk
index a9f68da..44d77b6 100644
--- a/android/Android.mk
+++ b/android/Android.mk
@@ -11,7 +11,6 @@
external/icu4c/i18n \
external/icu4c/common
-
LOCAL_MODULE:= libsqlite3_android
include $(BUILD_STATIC_LIBRARY)
@@ -29,6 +28,9 @@
LOCAL_MODULE_TAGS := optional
+LOCAL_SHARED_LIBRARIES := \
+ libutils
+
include $(BUILD_EXECUTABLE)
# Test for PhoneNumberUtils
diff --git a/android/PhoneticStringUtils.cpp b/android/PhoneticStringUtils.cpp
index 24b1647..fa32d38 100644
--- a/android/PhoneticStringUtils.cpp
+++ b/android/PhoneticStringUtils.cpp
@@ -18,6 +18,7 @@
#include <stdlib.h>
#include "PhoneticStringUtils.h"
+#include <utils/String8.h>
// We'd like 0 length string last of sorted list. So when input string is NULL
// or 0 length string, we use these instead.
@@ -30,60 +31,9 @@
namespace android {
-int GetCodePointFromUtf8(const char *src, size_t len, size_t index, int *next) {
- if (src == NULL || len <= index) {
- return -1;
- }
-
- if ((src[index] >> 7) == 0) {
- if (next != NULL) {
- *next = index + 1;
- }
- return src[index];
- }
- if ((src[index] & 64) == 0) {
- return -1;
- }
- int mask;
- size_t num_to_read;
- for (num_to_read = 1, mask = 64; // 01000000
- num_to_read < 7 && (src[index] & mask) == mask;
- num_to_read++, mask >>= 1) {
- }
- if (num_to_read == 7) {
- return -1;
- }
-
- if (num_to_read + index > len) {
- return -1;
- }
-
- {
- size_t i;
- for (i = 0, mask = 0; i < (7 - num_to_read); i++) {
- mask = (mask << 1) + 1;
- }
- }
-
- int codepoint = mask & src[index];
-
- for (size_t i = 1; i < num_to_read; i++) {
- if ((src[i + index] & 192) != 128) { // must be 10xxxxxx
- return -1;
- }
- codepoint = (codepoint << 6) + (src[i + index] & 63);
- }
-
- if (next != NULL) {
- *next = index + num_to_read;
- }
-
- return codepoint;
-}
-
// Get hiragana from halfwidth katakana.
-static int GetHiraganaFromHalfwidthKatakana(int codepoint,
- int next_codepoint,
+static int GetHiraganaFromHalfwidthKatakana(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed) {
if (codepoint < 0xFF66 || 0xFF9F < codepoint) {
return codepoint;
@@ -214,8 +164,8 @@
}
}
-static int GetNormalizedKana(int codepoint,
- int next_codepoint,
+static int GetNormalizedKana(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed) {
// First, convert fullwidth katakana and halfwidth katakana to hiragana.
if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
@@ -231,8 +181,8 @@
return GetNormalizedHiragana(codepoint);
}
-int GetPhoneticallySortableCodePoint(int codepoint,
- int next_codepoint,
+int GetPhoneticallySortableCodePoint(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed) {
if (next_is_consumed != NULL) {
*next_is_consumed = false;
@@ -302,8 +252,8 @@
return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
}
-int GetNormalizedCodePoint(int codepoint,
- int next_codepoint,
+int GetNormalizedCodePoint(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed) {
if (next_is_consumed != NULL) {
*next_is_consumed = false;
@@ -331,73 +281,10 @@
return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
}
-
-bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index) {
- if (codepoint < 128) { // 1 << 7
- if (*index >= len) {
- return false;
- }
- // 0xxxxxxx
- dst[*index] = static_cast<char>(codepoint);
- (*index)++;
- } else if (codepoint < 2048) { // 1 << (6 + 5)
- if (*index + 1 >= len) {
- return false;
- }
- // 110xxxxx
- dst[(*index)++] = static_cast<char>(192 | (codepoint >> 6));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else if (codepoint < 65536) { // 1 << (6 * 2 + 4)
- if (*index + 2 >= len) {
- return false;
- }
- // 1110xxxx
- dst[(*index)++] = static_cast<char>(224 | (codepoint >> 12));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else if (codepoint < 2097152) { // 1 << (6 * 3 + 3)
- if (*index + 3 >= len) {
- return false;
- }
- // 11110xxx
- dst[(*index)++] = static_cast<char>(240 | (codepoint >> 18));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else if (codepoint < 67108864) { // 1 << (6 * 2 + 2)
- if (*index + 4 >= len) {
- return false;
- }
- // 111110xx
- dst[(*index)++] = static_cast<char>(248 | (codepoint >> 24));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else {
- if (*index + 5 >= len) {
- return false;
- }
- // 1111110x
- dst[(*index)++] = static_cast<char>(252 | (codepoint >> 30));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 24) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- }
- return true;
-}
-
static bool GetExpectedString(
- const char *src, char **dst, size_t *len,
- int (*get_codepoint_function)(int, int, bool*)) {
- if (dst == NULL || len == NULL) {
+ const char *src, char **dst, size_t *dst_len,
+ int (*get_codepoint_function)(char32_t, char32_t, bool*)) {
+ if (dst == NULL || dst_len == NULL) {
return false;
}
@@ -405,99 +292,55 @@
src = STR_FOR_NULL_STR;
}
- size_t src_len = strlen(src);
- int codepoints[MAX_CODEPOINTS];
- size_t new_len = 0;
+ char32_t codepoints[MAX_CODEPOINTS];
- size_t codepoint_index;
- {
- int i, next;
- for (codepoint_index = 0, i = 0, next = 0;
- static_cast<size_t>(i) < src_len &&
- codepoint_index < MAX_CODEPOINTS;
- i = next) {
- int codepoint = GetCodePointFromUtf8(src, src_len, i, &next);
- if (codepoint <= 0) {
- return false;
- }
- int tmp_next;
- int next_codepoint = GetCodePointFromUtf8(src, src_len,
- next, &tmp_next);
- bool next_is_consumed = false;
-
- // It is ok even if next_codepoint is negative.
- codepoints[codepoint_index] =
- get_codepoint_function(codepoint,
- next_codepoint,
- &next_is_consumed);
- // dakuten (voiced mark) or han-dakuten (half-voiced mark) existed.
- if (next_is_consumed) {
- next = tmp_next;
- }
-
- if (codepoints[codepoint_index] < 0) {
- // Do not increment codepoint_index.
- continue;
- }
-
- if (codepoints[codepoint_index] < 128) { // 1 << 7
- new_len++;
- } else if (codepoints[codepoint_index] < 2048) {
- // 1 << (6 + 5)
- new_len += 2;
- } else if (codepoints[codepoint_index] < 65536) {
- // 1 << (6 * 2 + 4)
- new_len += 3;
- } else if (codepoints[codepoint_index] < 2097152) {
- // 1 << (6 * 3 + 3)
- new_len += 4;
- } else if (codepoints[codepoint_index] < 67108864) {
- // 1 << (6 * 2 + 2)
- new_len += 5;
- } else {
- new_len += 6;
- }
-
- codepoint_index++;
+ size_t src_len = GetUtf8LengthOrZero(src);
+ if (src_len == 0) {
+ return false;
+ }
+ bool next_is_consumed;
+ size_t j = 0;
+ for (size_t i = 0; i < src_len;) {
+ int32_t ret = GetUtf32AtFromUtf8(src, src_len, i, &i);
+ if (ret < 0) {
+ // failed to parse UTF-8
+ return false;
+ }
+ ret = get_codepoint_function(
+ static_cast<char32_t>(ret),
+ i + 1 < src_len ? codepoints[i + 1] : 0,
+ &next_is_consumed);
+ if (ret > 0) {
+ codepoints[j] = static_cast<char32_t>(ret);
+ j++;
+ }
+ if (next_is_consumed) {
+ i++;
}
}
+ size_t length = j;
- if (codepoint_index == 0) {
+ if (length == 0) {
// If all of codepoints are invalid, we place the string at the end of
// the list.
codepoints[0] = 0x10000 + CODEPOINT_FOR_NULL_STR;
- codepoint_index = 1;
- new_len = 4;
+ length = 1;
}
- new_len += 1; // For '\0'.
-
- *dst = static_cast<char *>(malloc(sizeof(char) * new_len));
+ size_t new_len = GetUtf8LengthFromUtf32(codepoints, length);
+ *dst = static_cast<char *>(malloc(new_len + 1));
if (*dst == NULL) {
return false;
}
- size_t ch_index;
- {
- size_t i;
- for (i = 0, ch_index = 0; i < codepoint_index; i++) {
- if (!GetUtf8FromCodePoint(codepoints[i], *dst,
- new_len, &ch_index)) {
- free(*dst);
- *dst = NULL;
- return false;
- }
- }
- }
-
- if (ch_index != new_len - 1) {
+ printf("new_len: %u\n", new_len);
+ if (GetUtf8FromUtf32(codepoints, length, *dst, new_len + 1) != new_len) {
free(*dst);
*dst = NULL;
return false;
}
- (*dst)[new_len - 1] = '\0';
- *len = new_len;
+ *dst_len = new_len;
return true;
}
diff --git a/android/PhoneticStringUtils.h b/android/PhoneticStringUtils.h
index 68a4928..9da7d29 100644
--- a/android/PhoneticStringUtils.h
+++ b/android/PhoneticStringUtils.h
@@ -18,6 +18,7 @@
#define _ANDROID_PHONETIC_STRING_UTILS_H
#include <string.h> // For size_t.
+#include <utils/String8.h>
namespace android {
@@ -31,8 +32,8 @@
// is "consumed" (e.g. Japanese halfwidth katakana's voiced mark is consumed
// when previous "codepoint" is appropriate). If the codepoint should not be
// considered when sorting (e.g. whitespaces), -1 is returned.
-int GetPhoneticallySortableCodePoint(int codepoint,
- int next_codepoint,
+int GetPhoneticallySortableCodePoint(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed);
// Returns codepoint which is "normalized", whose definition depends on each
@@ -44,8 +45,8 @@
//
// In Japanese, "normalized" means that half-width and full-width katakana is
// appropriately converted to hiragana.
-int GetNormalizedCodePoint(int codepoint,
- int next_codepoint,
+int GetNormalizedCodePoint(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed);
// Pushes Utf8 expression of "codepoint" to "dst". Returns true when successful.
diff --git a/android/PhoneticStringUtilsTest.cpp b/android/PhoneticStringUtilsTest.cpp
index e74f67f..06a7ba8 100644
--- a/android/PhoneticStringUtilsTest.cpp
+++ b/android/PhoneticStringUtilsTest.cpp
@@ -20,6 +20,8 @@
#include <stdlib.h>
#include <string.h>
+#include <utils/String8.h>
+
using namespace android;
class TestExecutor {
@@ -29,12 +31,12 @@
private:
void DoOneTest(void (TestExecutor::*test)());
- void testGetCodePointFromUtf8();
+ void testGetUtf32At();
void testGetPhoneticallySortableCodePointAscii();
void testGetPhoneticallySortableCodePointKana();
void testGetPhoneticallySortableCodePointWhitespaceOnly();
void testGetPhoneticallySortableCodePointSimpleCompare();
- void testGetUtf8FromCodePoint();
+ void testGetUtf8FromUtf32();
void testGetPhoneticallySortableString();
void testGetNormalizedString();
@@ -65,12 +67,12 @@
bool TestExecutor::DoAllTests() {
- DoOneTest(&TestExecutor::testGetCodePointFromUtf8);
+ DoOneTest(&TestExecutor::testGetUtf32At);
DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointAscii);
DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointKana);
DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly);
DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare);
- DoOneTest(&TestExecutor::testGetUtf8FromCodePoint);
+ DoOneTest(&TestExecutor::testGetUtf8FromUtf32);
DoOneTest(&TestExecutor::testGetPhoneticallySortableString);
DoOneTest(&TestExecutor::testGetNormalizedString);
@@ -92,26 +94,35 @@
m_success_count += m_success ? 1 : 0;
}
-void TestExecutor::testGetCodePointFromUtf8() {
- printf("testGetCodePointFromUtf8()\n");
- int next;
+#define TEST_GET_UTF32AT(src, index, expected_next, expected_value) \
+ ({ \
+ size_t next; \
+ String8 string8(src); \
+ int32_t ret = string8.getUtf32At((index), &next); \
+ if (ret < 0) { \
+ printf("getUtf32At() returned negative value (src: %s, index: %d)\n", \
+ (src), (index)); \
+ m_success = false; \
+ } else if (next != (expected_next)) { \
+ printf("next is unexpected value (src: %s, actual: %u, expected: %u)\n", \
+ (src), next, (expected_next)); \
+ } else { \
+ EXPECT_EQ_VALUE(ret, (expected_value)); \
+ } \
+ })
- EXPECT_EQ_VALUE(GetCodePointFromUtf8("a", 1, 0, &next), 97);
- EXPECT_EQ_VALUE(next, 1);
+void TestExecutor::testGetUtf32At() {
+ printf("testGetUtf32At()\n");
+
+ TEST_GET_UTF32AT("a", 0, 1, 97);
// Japanese hiragana "a"
- EXPECT_EQ_VALUE(GetCodePointFromUtf8("\xE3\x81\x82", 3, 0, &next), 0x3042);
- EXPECT_EQ_VALUE(next, 3);
+ TEST_GET_UTF32AT("\xE3\x81\x82", 0, 3, 0x3042);
// Japanese fullwidth katakana "a" with ascii a
- EXPECT_EQ_VALUE(GetCodePointFromUtf8("a\xE3\x82\xA2", 4, 1, &next), 0x30A2);
- EXPECT_EQ_VALUE(next, 4);
+ TEST_GET_UTF32AT("a\xE3\x82\xA2", 1, 4, 0x30A2);
// 2 PUA
- ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88",
- 8, 0, &next), 0xFE000);
- ASSERT_EQ_VALUE(next, 4);
- ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88",
- 8, next, &next), 0xFE008);
- ASSERT_EQ_VALUE(next, 8);
+ TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 0, 4, 0xFE000);
+ TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 4, 8, 0xFE008);
}
void TestExecutor::testGetPhoneticallySortableCodePointAscii() {
@@ -282,20 +293,18 @@
}
}
-#define EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, i) \
+#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected) \
({ \
- index = i; \
- if (!GetUtf8FromCodePoint(codepoint, dst, 10, &index)) { \
+ char32_t codepoints[1] = {codepoint}; \
+ status_t ret = string8.setTo(codepoints, 1); \
+ if (ret != NO_ERROR) { \
printf("GetUtf8FromCodePoint() returned false at 0x%04X\n", codepoint); \
m_success = false; \
- } else if (index >= 10) { \
- printf("index (%d) >= 10\n", index); \
- m_success = false; \
} else { \
- dst[index] = '\0'; \
- if (strcmp(dst + i, expected) != 0) { \
+ const char* string = string8.string(); \
+ if (strcmp(string, expected) != 0) { \
printf("Failed at codepoint 0x%04X\n", codepoint); \
- for (const char *ch = dst; *ch != '\0'; ++ch) { \
+ for (const char *ch = string; *ch != '\0'; ++ch) { \
printf("0x%X ", *ch); \
} \
printf("!= "); \
@@ -308,14 +317,9 @@
} \
})
-#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected) \
- EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, 0)
-
-
-void TestExecutor::testGetUtf8FromCodePoint() {
- printf("testGetUtf8FromCodePoint()\n");
- size_t index = 0;
- char dst[10];
+void TestExecutor::testGetUtf8FromUtf32() {
+ printf("testGetUtf8FromUtf32()\n");
+ String8 string8;
EXPECT_EQ_CODEPOINT_UTF8('a', "\x61");
// Armenian capital letter AYB (2 bytes in UTF8)
@@ -327,15 +331,6 @@
// PUA (4 byets in UTF8)
EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96");
EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2");
-
- EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(0x058F, "\xD6\x8F", 3);
-
- index = 0;
- if (GetUtf8FromCodePoint(0x3043, dst, 2, &index)) {
- printf("GetUtf8FromCodePont() returned true even when destination length"
- "is not enough\n");
- m_success = false;
- }
}
#define EXPECT_EQ_UTF8_UTF8(src, expected) \
diff --git a/dist/Android.mk b/dist/Android.mk
index 1d8e1eb..b7aad30 100644
--- a/dist/Android.mk
+++ b/dist/Android.mk
@@ -29,7 +29,8 @@
LOCAL_C_INCLUDES += $(call include-path-for, system-core)/cutils
LOCAL_SHARED_LIBRARIES += liblog \
libicuuc \
- libicui18n
+ libicui18n \
+ libutils
# include android specific methods
LOCAL_WHOLE_STATIC_LIBRARIES := libsqlite3_android