Merge commit 'goog/readonly-korg-master' into merge_korg_master
diff --git a/android/Android.mk b/android/Android.mk
index 9a6efc7..44d77b6 100644
--- a/android/Android.mk
+++ b/android/Android.mk
@@ -11,7 +11,6 @@
external/icu4c/i18n \
external/icu4c/common
-
LOCAL_MODULE:= libsqlite3_android
include $(BUILD_STATIC_LIBRARY)
@@ -27,6 +26,33 @@
PhoneticStringUtils.cpp \
PhoneticStringUtilsTest.cpp
-LOCAL_MODULE_TAGS := tests optional
+LOCAL_MODULE_TAGS := optional
+
+LOCAL_SHARED_LIBRARIES := \
+ libutils
+
+include $(BUILD_EXECUTABLE)
+
+# Test for PhoneNumberUtils
+#
+# You can also test this in Unix, like this:
+# > g++ -Wall external/sqlite/android/PhoneNumberUtils.cpp \
+# external/sqlite/android/PhoneNumberUtilsTest.cpp
+# > ./a.out
+#
+# Note: tests related to PHONE_NUMBERS_EQUAL also exist in AndroidTests in
+# Java space. Add tests if you modify this.
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE:= libsqlite3_phone_number_utils_test
+
+LOCAL_CFLAGS += -Wall -Werror
+
+LOCAL_SRC_FILES := \
+ PhoneNumberUtils.cpp \
+ PhoneNumberUtilsTest.cpp
+
+LOCAL_MODULE_TAGS := optional
include $(BUILD_EXECUTABLE)
diff --git a/android/PhoneNumberUtils.cpp b/android/PhoneNumberUtils.cpp
index 9e5e470..cb8552e 100644
--- a/android/PhoneNumberUtils.cpp
+++ b/android/PhoneNumberUtils.cpp
@@ -1,293 +1,383 @@
-/* //device/vmlibs-android/com.android.internal.telephony/PhoneNumberUtils.java
-**
-** Copyright 2006, The Android Open Source Project
-**
-** Licensed under the Apache License, Version 2.0 (the "License");
-** you may not use this file except in compliance with the License.
-** You may obtain a copy of the License at
-**
-** http://www.apache.org/licenses/LICENSE-2.0
-**
-** Unless required by applicable law or agreed to in writing, software
-** distributed under the License is distributed on an "AS IS" BASIS,
-** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-** See the License for the specific language governing permissions and
-** limitations under the License.
-*/
+/*
+ * Copyright 2009, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
#include <string.h>
namespace android {
-static int MIN_MATCH = 5;
+/* Generated by the following Python script. Values of country calling codes
+ are from http://en.wikipedia.org/wiki/List_of_country_calling_codes
-/** True if c is ISO-LATIN characters 0-9 */
-static bool isISODigit (char c)
-{
- return c >= '0' && c <= '9';
-}
+#!/usr/bin/python
+import sys
+ccc_set_2digits = set([0, 1, 7,
+ 20, 27, 28, 30, 31, 32, 33, 34, 36, 39, 40, 43, 44, 45,
+ 46, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61,
+ 62, 63, 64, 65, 66, 81, 82, 83, 84, 86, 89, 90, 91, 92,
+ 93, 94, 95, 98])
-/** True if c is ISO-LATIN characters 0-9, *, # , + */
-static bool isNonSeparator(char c)
-{
- return (c >= '0' && c <= '9') || c == '*' || c == '#' || c == '+';
+ONE_LINE_NUM = 10
+
+for i in xrange(100):
+ if i % ONE_LINE_NUM == 0:
+ sys.stdout.write(' ')
+ if i in ccc_set_2digits:
+ included = 'true'
+ else:
+ included = 'false'
+ sys.stdout.write(included + ',')
+ if ((i + 1) % ONE_LINE_NUM) == 0:
+ sys.stdout.write('\n')
+ else:
+ sys.stdout.write(' ')
+*/
+static bool two_length_country_code_map[100] = {
+ true, true, false, false, false, false, false, true, false, false,
+ false, false, false, false, false, false, false, false, false, false,
+ true, false, false, false, false, false, false, true, true, false,
+ true, true, true, true, true, false, true, false, false, true,
+ true, false, false, true, true, true, true, true, true, true,
+ false, true, true, true, true, true, true, true, true, false,
+ true, true, true, true, true, true, true, false, false, false,
+ false, false, false, false, false, false, false, false, false, false,
+ false, true, true, true, true, false, true, false, false, true,
+ true, true, true, true, true, true, false, false, true, false,
+};
+
+#define ARRAY_SIZE(a) (sizeof(a)/sizeof((a)[0]))
+
+/**
+ * Returns true if "ccc_candidate" expresses (part of) some country calling
+ * code.
+ * Returns false otherwise.
+ */
+static bool isCountryCallingCode(int ccc_candidate) {
+ return ccc_candidate > 0 &&
+ ccc_candidate < (int)ARRAY_SIZE(two_length_country_code_map) &&
+ two_length_country_code_map[ccc_candidate];
}
/**
- * Phone numbers are stored in "lookup" form in the database
- * as reversed strings to allow for caller ID lookup
- *
- * This method takes a phone number and makes a valid SQL "LIKE"
- * string that will match the lookup form
- *
+ * Returns the integer corresponding to the input if input "ch" is one of
+ * the ISO-LATIN characters 0-9.
+ * Returns -1 otherwise.
*/
-/** all of a up to len must be an international prefix or
- * separators/non-dialing digits
- */
-static bool matchIntlPrefix(const char* a, int len)
+static int tryGetISODigit (char ch)
{
- /* '([^0-9*#+]\+[^0-9*#+] | [^0-9*#+]0(0|11)[^0-9*#+] )$' */
- /* 0 1 2 3 45 */
-
- int state = 0;
- for (int i = 0 ; i < len ; i++) {
- char c = a[i];
-
- switch (state) {
- case 0:
- if (c == '+') state = 1;
- else if (c == '0') state = 2;
- else if (isNonSeparator(c)) return false;
- break;
-
- case 2:
- if (c == '0') state = 3;
- else if (c == '1') state = 4;
- else if (isNonSeparator(c)) return false;
- break;
-
- case 4:
- if (c == '1') state = 5;
- else if (isNonSeparator(c)) return false;
- break;
-
- default:
- if (isNonSeparator(c)) return false;
- break;
-
- }
+ if ('0' <= ch && ch <= '9') {
+ return ch - '0';
+ } else {
+ return -1;
}
-
- return state == 1 || state == 3 || state == 5;
}
-/** all of 'a' up to len must match non-US trunk prefix ('0') */
-static bool matchTrunkPrefix(const char* a, int len)
+/** True if c is ISO-LATIN characters 0-9, *, # , + */
+static bool isNonSeparator(char ch)
{
- bool found;
+ return ('0' <= ch && ch <= '9') || ch == '*' || ch == '#' || ch == '+';
+}
- found = false;
-
- for (int i = 0 ; i < len ; i++) {
- char c = a[i];
-
- if (c == '0' && !found) {
- found = true;
- } else if (isNonSeparator(c)) {
+/**
+ * Tries to store in "new_ptr" a pointer into "str" just past the trunk prefix.
+ *
+ * Currently this function simply skips the first digit, assuming it is the
+ * trunk prefix. In reality the trunk prefix differs from country to country.
+ *
+ * e.g.
+ * "+79161234567" equals "89161234567" (Russian trunk digit is 8)
+ * "+33123456789" equals "0123456789" (French trunk digit is 0)
+ *
+ */
+static bool tryGetTrunkPrefixOmittedStr(const char *str, size_t len,
+ const char **new_ptr, size_t *new_len)
+{
+ for (size_t i = 0 ; i < len ; i++) {
+ char ch = str[i];
+ if (tryGetISODigit(ch) >= 0) {
+ if (new_ptr != NULL) {
+ *new_ptr = str + i + 1;
+ }
+ if (new_len != NULL) {
+ *new_len = len - (i + 1);
+ }
+ return true;
+ } else if (isNonSeparator(ch)) {
return false;
}
}
-
- return found;
+
+ return false;
}
-/** all of 'a' up to len must be a (+|00|011)country code)
- * We're fast and loose with the country code. Any \d{1,3} matches */
-static bool matchIntlPrefixAndCC(const char* a, int len)
+/*
+ * Note that this function does not strictly handle country calling codes of
+ * length 3 (like Morocco: +212), assuming it is enough to use the first two
+ * digits to compare two phone numbers.
+ */
+static int tryGetCountryCallingCode(const char *str, size_t len,
+ const char **new_ptr, size_t *new_len)
{
- /* [^0-9*#+]*(\+|0(0|11)\d\d?\d? [^0-9*#+] $ */
- /* 0 1 2 3 45 6 7 8 */
+ // Rough regexp:
+ // ^[^0-9*#+]*((\+|0(0|11)\d\d?|166) [^0-9*#+] $
+ // 0 1 2 3 45 6 7 89
+ //
+ // In all the states, this function ignores separator characters.
+ // "166" is the special case for the call from Thailand to the US. Ugh!
int state = 0;
- for (int i = 0 ; i < len ; i++ ) {
- char c = a[i];
-
+ int ccc = 0;
+ for (size_t i = 0 ; i < len ; i++ ) {
+ char ch = str[i];
switch (state) {
case 0:
- if (c == '+') state = 1;
- else if (c == '0') state = 2;
- else if (isNonSeparator(c)) return false;
+ if (ch == '+') state = 1;
+ else if (ch == '0') state = 2;
+ else if (ch == '1') state = 8;
+ else if (isNonSeparator(ch)) return -1;
break;
case 2:
- if (c == '0') state = 3;
- else if (c == '1') state = 4;
- else if (isNonSeparator(c)) return false;
+ if (ch == '0') state = 3;
+ else if (ch == '1') state = 4;
+ else if (isNonSeparator(ch)) return -1;
break;
case 4:
- if (c == '1') state = 5;
- else if (isNonSeparator(c)) return false;
+ if (ch == '1') state = 5;
+ else if (isNonSeparator(ch)) return -1;
break;
case 1:
case 3:
case 5:
- if (isISODigit(c)) state = 6;
- else if (isNonSeparator(c)) return false;
- break;
-
case 6:
case 7:
- if (isISODigit(c)) state++;
- else if (isNonSeparator(c)) return false;
- break;
-
+ {
+ int ret = tryGetISODigit(ch);
+ if (ret > 0) {
+ ccc = ccc * 10 + ret;
+ if (ccc >= 100 || isCountryCallingCode(ccc)) {
+ if (new_ptr != NULL) {
+ *new_ptr = str + i + 1;
+ }
+ if (new_len != NULL) {
+ *new_len = len - (i + 1);
+ }
+ return ccc;
+ }
+ if (state == 1 || state == 3 || state == 5) {
+ state = 6;
+ } else {
+ state++;
+ }
+ } else if (isNonSeparator(ch)) {
+ return -1;
+ }
+ }
+ break;
+ case 8:
+ if (ch == '6') state = 9;
+ else if (isNonSeparator(ch)) return -1;
+ break;
+ case 9:
+ if (ch == '6') {
+ if (new_ptr != NULL) {
+ *new_ptr = str + i + 1;
+ }
+ if (new_len != NULL) {
+ *new_len = len - (i + 1);
+ }
+ return 66;
+ }
+ break;
default:
- if (isNonSeparator(c)) return false;
+ return -1;
}
}
- return state == 6 || state == 7 || state == 8;
-}
-
-/** or -1 if both are negative */
-static int minPositive(int a, int b)
-{
- if (a >= 0 && b >= 0) {
- return (a < b) ? a : b;
- } else if (a >= 0) { /* && b < 0 */
- return a;
- } else if (b >= 0) { /* && a < 0 */
- return b;
- } else { /* a < 0 && b < 0 */
- return -1;
- }
+ return -1;
}
/**
- * Return the offset into a of the first appearance of b, or -1 if there
- * is no such character in a.
+ * Return true if the prefix of "ch" is "ignorable". Here, "ignorable" means
+ * that "ch" has at most one digit plus separator characters. The one digit is
+ * assumed to be the trunk prefix.
*/
-static int indexOf(const char *a, char b) {
- char *ix = strchr(a, b);
+static bool checkPrefixIsIgnorable(const char* ch, int i) {
+ bool trunk_prefix_was_read = false;
+ while (i >= 0) {
+ if (tryGetISODigit(ch[i]) >= 0) {
+ if (trunk_prefix_was_read) {
+ // More than one digit appeared, meaning that "a" and "b"
+ // are different.
+ return false;
+ } else {
+ // Ignore just one digit, assuming it is trunk prefix.
+ trunk_prefix_was_read = true;
+ }
+ } else if (isNonSeparator(ch[i])) {
+ // Trunk prefix is a digit, not "*", "#"...
+ return false;
+ }
+ i--;
+ }
- if (ix == NULL)
- return -1;
- else
- return ix - a;
+ return true;
}
/**
* Compare phone numbers a and b, return true if they're identical
* enough for caller ID purposes.
*
- * - Compares from right to left
- * - requires MIN_MATCH (5) characters to match
- * - handles common trunk prefixes and international prefixes
- * (basically, everything except the Russian trunk prefix)
+ * Assume NULL as 0-length string.
*
- * Tolerates nulls
+ * Detailed information:
+ * Currently (as of 2009-06-12), we cannot depend on the locale given from the
+ * OS. For example, current Android does not accept "en_JP", meaning
+ * "the display language is English but the phone should be in Japan", but
+ * en_US, es_US, etc. So we cannot identify which digit is valid trunk prefix
+ * in the country where the phone is used. More specifically, "880-1234-1234"
+ * is not valid phone number in Japan since the trunk prefix in Japan is not 8
+ * but 0 (correct number should be "080-1234-1234"), while Russian trunk prefix
+ * is 8. Also, we cannot know whether the country where users live has trunk
+ * prefix itself. So, we cannot determine whether "+81-80-1234-1234" is NOT
+ * same as "880-1234-1234" (while "+81-80-1234-1234" is same as "080-1234-1234"
+ * and we can determine "880-1234-1234" is different from "080-1234-1234").
+ *
+ * In the future, we should handle trunk prefix more correctly, but as of now,
+ * we just ignore it...
*/
bool phone_number_compare(const char* a, const char* b)
{
- int ia, ib;
- int matched;
-
- if (a == NULL || b == NULL) {
- return false;
+ size_t len_a = 0;
+ size_t len_b = 0;
+ if (a == NULL) {
+ a = "";
+ } else {
+ len_a = strlen(a);
+ }
+ if (b == NULL) {
+ b = "";
+ } else {
+ len_b = strlen(b);
}
- ia = strlen(a);
- ib = strlen(b);
- if (ia == 0 || ib == 0) {
- return false;
+ const char* tmp_a = NULL;
+ const char* tmp_b = NULL;
+ size_t tmp_len_a = len_a;
+ size_t tmp_len_b = len_b;
+
+ int ccc_a = tryGetCountryCallingCode(a, len_a, &tmp_a, &tmp_len_a);
+ int ccc_b = tryGetCountryCallingCode(b, len_b, &tmp_b, &tmp_len_b);
+ bool ok_to_ignore_prefix = true;
+ if (ccc_a >= 0 && ccc_b >= 0) {
+ if (ccc_a != ccc_b) {
+ // Different Country Calling Code. Must be different phone number.
+ return false;
+ }
+ // When both have ccc, do not ignore trunk prefix. Without this,
+ // "+81123123" becomes same as "+810123123" (+81 == Japan)
+ ok_to_ignore_prefix = false;
+ } else if (ccc_a < 0 && ccc_b < 0) {
+ // When both do not have ccc, do not ignore trunk prefix. Without this,
+ // "123123" becomes same as "0123123"
+ ok_to_ignore_prefix = false;
+ } else {
+ if (ccc_a < 0) {
+ tryGetTrunkPrefixOmittedStr(a, len_a, &tmp_a, &tmp_len_a);
+ }
+ if (ccc_b < 0) {
+ tryGetTrunkPrefixOmittedStr(b, len_b, &tmp_b, &tmp_len_b);
+ }
}
- // Compare from right to left
- ia--;
- ib--;
+ if (tmp_a != NULL) {
+ a = tmp_a;
+ len_a = tmp_len_a;
+ }
+ if (tmp_b != NULL) {
+ b = tmp_b;
+ len_b = tmp_len_b;
+ }
- matched = 0;
-
- while (ia >= 0 && ib >=0) {
- char ca, cb;
- bool skipCmp = false;
-
- ca = a[ia];
-
- if (!isNonSeparator(ca)) {
- ia--;
- skipCmp = true;
+ int i_a = len_a - 1;
+ int i_b = len_b - 1;
+ while (i_a >= 0 && i_b >= 0) {
+ bool skip_compare = false;
+ char ch_a = a[i_a];
+ char ch_b = b[i_b];
+ if (!isNonSeparator(ch_a)) {
+ i_a--;
+ skip_compare = true;
+ }
+ if (!isNonSeparator(ch_b)) {
+ i_b--;
+ skip_compare = true;
}
- cb = b[ib];
-
- if (!isNonSeparator(cb)) {
- ib--;
- skipCmp = true;
- }
-
- if (!skipCmp) {
- if (cb != ca) {
- break;
+ if (!skip_compare) {
+ if (ch_a != ch_b) {
+ return false;
}
- ia--; ib--; matched++;
+ i_a--;
+ i_b--;
}
}
- if (matched < MIN_MATCH) {
- int aLen = strlen(a);
-
- // if the input strings match, but their lengths < MIN_MATCH,
- // treat them as equal.
- if (aLen == (int)strlen(b) && aLen == matched) {
- return true;
+ if (ok_to_ignore_prefix) {
+ if (!checkPrefixIsIgnorable(a, i_a)) {
+ return false;
}
- return false;
+ if (!checkPrefixIsIgnorable(b, i_b)) {
+ return false;
+ }
+ } else {
+ // In the US, 1-650-555-1234 must be equal to 650-555-1234,
+ // while 090-1234-1234 must not be equal to 90-1234-1234 in Japan.
+ // This requirement exists only in the US (with 1 as trunk (NDD) prefix).
+ //
+ // At least, in this "rough" comparison, we should ignore the prefix
+ // '1', so if the remaining non-separator character is '1', we ignore it
+ // just once.
+ bool may_be_namp = true;
+ while (i_a >= 0) {
+ const char ch_a = a[i_a];
+ if (isNonSeparator(ch_a)) {
+ if (may_be_namp && tryGetISODigit(ch_a) == 1) {
+ may_be_namp = false;
+ } else {
+ return false;
+ }
+ }
+ i_a--;
+ }
+ while (i_b >= 0) {
+ const char ch_b = b[i_b];
+ if (isNonSeparator(ch_b)) {
+ if (may_be_namp && tryGetISODigit(ch_b) == 1) {
+ may_be_namp = false;
+ } else {
+ return false;
+ }
+ }
+ i_b--;
+ }
}
- // At least one string has matched completely;
- if (matched >= MIN_MATCH && (ia < 0 || ib < 0)) {
- return true;
- }
-
- /*
- * Now, what remains must be one of the following for a
- * match:
- *
- * - a '+' on one and a '00' or a '011' on the other
- * - a '0' on one and a (+,00)<country code> on the other
- * (for this, a '0' and a '00' prefix would have succeeded above)
- */
-
- if (matchIntlPrefix(a, ia + 1) && matchIntlPrefix(b, ib +1)) {
- return true;
- }
-
- if (matchTrunkPrefix(a, ia + 1) && matchIntlPrefixAndCC(b, ib +1)) {
- return true;
- }
-
- if (matchTrunkPrefix(b, ib + 1) && matchIntlPrefixAndCC(a, ia +1)) {
- return true;
- }
-
- /*
- * Last resort: if the number of unmatched characters on both sides is less than or equal
- * to the length of the longest country code and only one number starts with a + accept
- * the match. This is because some countries like France and Russia have an extra prefix
- * digit that is used when dialing locally in country that does not show up when you dial
- * the number using the country code. In France this prefix digit is used to determine
- * which land line carrier to route the call over.
- */
- bool aPlusFirst = (*a == '+');
- bool bPlusFirst = (*b == '+');
- if (ia < 4 && ib < 4 && (aPlusFirst || bPlusFirst) && !(aPlusFirst && bPlusFirst)) {
- return true;
- }
-
- return false;
+ return true;
}
} // namespace android
diff --git a/android/PhoneNumberUtils.h b/android/PhoneNumberUtils.h
index 1a5720f..8f350a7 100644
--- a/android/PhoneNumberUtils.h
+++ b/android/PhoneNumberUtils.h
@@ -19,7 +19,7 @@
#define _ANDROID_PHONE_NUMBER_UTILS_H
namespace android {
-
+
bool phone_number_compare(const char* a, const char* b);
}
diff --git a/android/PhoneNumberUtilsTest.cpp b/android/PhoneNumberUtilsTest.cpp
new file mode 100644
index 0000000..6772fe5
--- /dev/null
+++ b/android/PhoneNumberUtilsTest.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Note that similar (or almost same) tests exist in Java side (See
+ * DatabaseGeneralTest.java in AndroidTests). The differences are:
+ * - this test is quite easy to do (You can do it in your Unix PC)
+ * - this test is not automatically executed by build servers
+ *
+ * You should also execute the test before submitting this.
+ */
+
+#include "PhoneNumberUtils.h"
+
+#include <stdio.h>
+#include <string.h>
+
+using namespace android;
+
+#define EXPECT(function, input1, input2, expected, total, error) \
+ ({ \
+ const char *i1_cache = input1; \
+ const char *i2_cache = input2; \
+ (total)++; \
+ if ((expected) != (function)((i1_cache), (i2_cache))) { \
+ if (expected) { \
+ printf("%s != %s while we expect %s == %s\n", \
+ (i1_cache), (i2_cache), (i1_cache), (i2_cache)); \
+ } else { \
+ printf("%s == %s while we expect %s != %s\n", \
+ (i1_cache), (i2_cache), (i1_cache), (i2_cache)); \
+ } \
+ (error)++; \
+ } \
+ })
+
+#define EXPECT_EQ(input1, input2) \
+ EXPECT(phone_number_compare, (input1), (input2), true, \
+ (total), (error))
+
+
+#define EXPECT_NE(input1, input2) \
+ EXPECT(phone_number_compare, (input1), (input2), false, \
+ (total), (error))
+
+int main() {
+ int total = 0;
+ int error = 0;
+
+ EXPECT_EQ(NULL, NULL);
+ EXPECT_EQ("", NULL);
+ EXPECT_EQ(NULL, "");
+ EXPECT_EQ("", "");
+
+ EXPECT_EQ("999", "999");
+ EXPECT_EQ("119", "119");
+
+ EXPECT_NE("123456789", "923456789");
+ EXPECT_NE("123456789", "123456781");
+ EXPECT_NE("123456789", "1234567890");
+ EXPECT_NE("123456789", "0123456789");
+
+ // Google, Inc.
+ EXPECT_EQ("650-253-0000", "6502530000");
+ EXPECT_EQ("650-253-0000", "650 253 0000");
+ EXPECT_EQ("650 253 0000", "6502530000");
+
+ // trunk (NDD) prefix must be properly handled in US
+ EXPECT_EQ("650-253-0000", "1-650-253-0000");
+ EXPECT_EQ("650-253-0000", " 1-650-253-0000");
+ EXPECT_NE("650-253-0000", "11-650-253-0000");
+ EXPECT_NE("650-253-0000", "0-650-253-0000");
+
+ EXPECT_EQ("+1 650-253-0000", "6502530000");
+ EXPECT_EQ("001 650-253-0000", "6502530000");
+ EXPECT_EQ("0111 650-253-0000", "6502530000");
+
+ // Country code is different.
+ EXPECT_NE("+19012345678", "+819012345678");
+
+ // Russian trunk digit
+ EXPECT_EQ("+79161234567", "89161234567");
+
+ // French trunk digit
+ EXPECT_EQ("+33123456789", "0123456789");
+
+ // Trunk digit for city codes in the Netherlands
+ EXPECT_EQ("+31771234567", "0771234567");
+
+ // Japanese dial
+ EXPECT_EQ("090-1234-5678", "+819012345678");
+ EXPECT_EQ("090(1234)5678", "+819012345678");
+ EXPECT_EQ("090-1234-5678", "+81-90-1234-5678");
+
+ // Trunk prefix must not be ignored in Japan
+ EXPECT_NE("090-1234-5678", "90-1234-5678");
+
+ EXPECT_NE("090-1234-5678", "080-1234-5678");
+ EXPECT_NE("090-1234-5678", "190-1234-5678");
+ EXPECT_NE("090-1234-5678", "890-1234-5678");
+ EXPECT_NE("+81-90-1234-5678", "+81-090-1234-5678");
+
+ EXPECT_EQ("+593(800)123-1234", "8001231234");
+
+ // Two consecutive 0s at the beginning of the phone string should not be
+ // treated as trunk prefix.
+ EXPECT_NE("008001231234", "8001231234");
+
+ // Test broken caller ID seen on call from Thailand to the US
+ EXPECT_EQ("+66811234567", "166811234567");
+
+ // Confirm that the bug found before does not re-appear.
+ EXPECT_NE("080-1234-5678", "+819012345678");
+
+ // Currently we cannot make this test pass (the Japanese trunk prefix is 0,
+ // but there is no sensible way to know that now (as of 2009-6-12))...
+ // EXPECT_NE("290-1234-5678", "+819012345678");
+
+ printf("total: %d, error: %d\n\n", total, error);
+ if (error == 0) {
+ printf("Success!\n");
+ } else {
+ printf("Failure... :(\n");
+ }
+}
diff --git a/android/PhoneticStringUtils.cpp b/android/PhoneticStringUtils.cpp
index 5f8781c..da5767f 100644
--- a/android/PhoneticStringUtils.cpp
+++ b/android/PhoneticStringUtils.cpp
@@ -18,6 +18,7 @@
#include <stdlib.h>
#include "PhoneticStringUtils.h"
+#include <utils/String8.h>
// We'd like 0 length string last of sorted list. So when input string is NULL
// or 0 length string, we use these instead.
@@ -30,59 +31,158 @@
namespace android {
-int GetCodePointFromUtf8(const char *src, size_t len, size_t index, int *next) {
- if (src == NULL || len <= index) {
- return -1;
+// Get hiragana from halfwidth katakana.
+static int GetHiraganaFromHalfwidthKatakana(char32_t codepoint,
+ char32_t next_codepoint,
+ bool *next_is_consumed) {
+ if (codepoint < 0xFF66 || 0xFF9F < codepoint) {
+ return codepoint;
}
- if ((src[index] >> 7) == 0) {
- if (next != NULL) {
- *next = index + 1;
- }
- return src[index];
- }
- if ((src[index] & 64) == 0) {
- return -1;
- }
- int mask;
- size_t num_to_read;
- for (num_to_read = 1, mask = 64; // 01000000
- num_to_read < 7 && (src[index] & mask) == mask;
- num_to_read++, mask >>= 1) {
- }
- if (num_to_read == 7) {
- return -1;
- }
-
- if (num_to_read + index > len) {
- return -1;
- }
-
- {
- size_t i;
- for (i = 0, mask = 0; i < (7 - num_to_read); i++) {
- mask = (mask << 1) + 1;
- }
- }
-
- int codepoint = mask & src[index];
-
- for (size_t i = 1; i < num_to_read; i++) {
- if ((src[i + index] & 192) != 128) { // must be 10xxxxxx
- return -1;
- }
- codepoint = (codepoint << 6) + (src[i + index] & 63);
- }
-
- if (next != NULL) {
- *next = index + num_to_read;
+ switch (codepoint) {
+ case 0xFF66: // wo
+ return 0x3092;
+ case 0xFF67: // xa
+ return 0x3041;
+ case 0xFF68: // xi
+ return 0x3043;
+ case 0xFF69: // xu
+ return 0x3045;
+ case 0xFF6A: // xe
+ return 0x3047;
+ case 0xFF6B: // xo
+ return 0x3049;
+ case 0xFF6C: // xya
+ return 0x3083;
+ case 0xFF6D: // xyu
+ return 0x3085;
+ case 0xFF6E: // xyo
+ return 0x3087;
+ case 0xFF6F: // xtsu
+ return 0x3063;
+ case 0xFF70: // -
+ return 0x30FC;
+ case 0xFF9C: // wa
+ return 0x308F;
+ case 0xFF9D: // n
+ return 0x3093;
+ break;
+ default: {
+ if (0xFF71 <= codepoint && codepoint <= 0xFF75) {
+ // a, i, u, e, o
+ if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) {
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = true;
+ }
+ return 0x3094; // vu
+ } else {
+ return 0x3042 + (codepoint - 0xFF71) * 2;
+ }
+ } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) {
+ // ka - chi
+ if (next_codepoint == 0xFF9E) {
+ // "dakuten" (voiced mark)
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = true;
+ }
+ return 0x304B + (codepoint - 0xFF76) * 2 + 1;
+ } else {
+ return 0x304B + (codepoint - 0xFF76) * 2;
+ }
+ } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) {
+ // tsu, te, to (skip xtsu)
+ if (next_codepoint == 0xFF9E) {
+ // "dakuten" (voiced mark)
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = true;
+ }
+ return 0x3064 + (codepoint - 0xFF82) * 2 + 1;
+ } else {
+ return 0x3064 + (codepoint - 0xFF82) * 2;
+ }
+ } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) {
+ // na, ni, nu, ne, no
+ return 0x306A + (codepoint - 0xFF85);
+ } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) {
+ // ha, hi, hu, he, ho
+ if (next_codepoint == 0xFF9E) {
+ // "dakuten" (voiced mark)
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = true;
+ }
+ return 0x306F + (codepoint - 0xFF8A) * 3 + 1;
+ } else if (next_codepoint == 0xFF9F) {
+ // "han-dakuten" (half voiced mark)
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = true;
+ }
+ return 0x306F + (codepoint - 0xFF8A) * 3 + 2;
+ } else {
+ return 0x306F + (codepoint - 0xFF8A) * 3;
+ }
+ } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) {
+ // ma, mi, mu, me, mo
+ return 0x307E + (codepoint - 0xFF8F);
+ } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) {
+ // ya, yu, yo
+ return 0x3084 + (codepoint - 0xFF94) * 2;
+ } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) {
+ // ra, ri, ru, re, ro
+ return 0x3089 + (codepoint - 0xFF97);
+ }
+ // Note: 0xFF9C, 0xFF9D are handled above
+ } // end of default
}
return codepoint;
}
-int GetPhoneticallySortableCodePoint(int codepoint,
- int next_codepoint,
+// Assuming input is hiragana, convert the hiragana to "normalized" hiragana.
+static int GetNormalizedHiragana(int codepoint) {
+ if (codepoint < 0x3040 || 0x309F < codepoint) {
+ return codepoint;
+ }
+
+ // TODO: should care (semi-)voiced mark (0x3099, 0x309A).
+
+ // Trivial kana conversions.
+ // e.g. xa => a
+ switch (codepoint) {
+ case 0x3041:
+ case 0x3043:
+ case 0x3045:
+ case 0x3047:
+ case 0x3049:
+ case 0x308E: // xwa
+ return codepoint + 1;
+ case 0x3095: // xka
+ return 0x304B;
+ case 0x3096: // xku
+ return 0x304F;
+ default:
+ return codepoint;
+ }
+}
+
+static int GetNormalizedKana(char32_t codepoint,
+ char32_t next_codepoint,
+ bool *next_is_consumed) {
+ // First, convert fullwidth katakana and halfwidth katakana to hiragana.
+ if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
+ // Make fullwidth katakana same as hiragana.
+ // 96 == 0x30A1 - 0x3041
+ codepoint = codepoint - 96;
+ } else {
+ codepoint = GetHiraganaFromHalfwidthKatakana(
+ codepoint, next_codepoint, next_is_consumed);
+ }
+
+ // Normalize Hiragana.
+ return GetNormalizedHiragana(codepoint);
+}
+
+int GetPhoneticallySortableCodePoint(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed) {
if (next_is_consumed != NULL) {
*next_is_consumed = false;
@@ -149,208 +249,42 @@
// Below is Kana-related handling.
- // First, convert fullwidth katakana and halfwidth katakana to hiragana
- if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
- // Make fullwidth katakana same as hiragana.
- // 96 == 0x30A1 - 0x3041c
- codepoint = codepoint - 96;
- } else if (0xFF66 <= codepoint && codepoint <= 0xFF9F) {
- // Make halfwidth katakana same as hiragana
- switch (codepoint) {
- case 0xFF66: // wo
- codepoint = 0x3092;
- break;
- case 0xFF67: // xa
- codepoint = 0x3041;
- break;
- case 0xFF68: // xi
- codepoint = 0x3043;
- break;
- case 0xFF69: // xu
- codepoint = 0x3045;
- break;
- case 0xFF6A: // xe
- codepoint = 0x3047;
- break;
- case 0xFF6B: // xo
- codepoint = 0x3049;
- break;
- case 0xFF6C: // xya
- codepoint = 0x3083;
- break;
- case 0xFF6D: // xyu
- codepoint = 0x3085;
- break;
- case 0xFF6E: // xyo
- codepoint = 0x3087;
- break;
- case 0xFF6F: // xtsu
- codepoint = 0x3063;
- break;
- case 0xFF70: // -
- codepoint = 0x30FC;
- break;
- case 0xFF9C: // wa
- codepoint = 0x308F;
- break;
- case 0xFF9D: // n
- codepoint = 0x3093;
- break;
- default:
- {
- if (0xFF71 <= codepoint && codepoint <= 0xFF75) {
- // a, i, u, e, o
- if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) {
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- codepoint = 0x3094; // vu
- } else {
- codepoint = 0x3042 + (codepoint - 0xFF71) * 2;
- }
- } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) {
- // ka - chi
- if (next_codepoint == 0xFF9E) {
- // "dakuten" (voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- codepoint = 0x304B + (codepoint - 0xFF76) * 2 + 1;
- } else {
- codepoint = 0x304B + (codepoint - 0xFF76) * 2;
- }
- } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) {
- // tsu, te, to (skip xtsu)
- if (next_codepoint == 0xFF9E) {
- // "dakuten" (voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- codepoint = 0x3064 + (codepoint - 0xFF82) * 2 + 1;
- } else {
- codepoint = 0x3064 + (codepoint - 0xFF82) * 2;
- }
- } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) {
- // na, ni, nu, ne, no
- codepoint = 0x306A + (codepoint - 0xFF85);
- } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) {
- // ha, hi, hu, he, ho
- if (next_codepoint == 0xFF9E) {
- // "dakuten" (voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- codepoint = 0x306F + (codepoint - 0xFF8A) * 3 + 1;
- } else if (next_codepoint == 0xFF9F) {
- // "han-dakuten" (half voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- codepoint = 0x306F + (codepoint - 0xFF8A) * 3 + 2;
- } else {
- codepoint = 0x306F + (codepoint - 0xFF8A) * 3;
- }
- } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) {
- // ma, mi, mu, me, mo
- codepoint = 0x307E + (codepoint - 0xFF8F);
- } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) {
- // ya, yu, yo
- codepoint = 0x3084 + (codepoint - 0xFF94) * 2;
- } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) {
- // ra, ri, ru, re, ro
- codepoint = 0x3089 + (codepoint - 0xFF97);
- }
- // Note: 0xFF9C, 0xFF9D are handled above
- } // end of default
- } // end of case
- }
-
- // Trivial kana conversions.
- // e.g. xa => a
- switch (codepoint) {
- case 0x3041:
- case 0x3043:
- case 0x3045:
- case 0x3047:
- case 0x3049:
- case 0x308E: // xwa
- codepoint++;
- break;
- case 0x3095: // xka
- codepoint = 0x304B;
- break;
- case 0x3096: // xku
- codepoint = 0x304F;
- break;
- }
-
- return codepoint;
+ return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
}
-bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index) {
- if (codepoint < 128) { // 1 << 7
- if (*index >= len) {
- return false;
- }
- // 0xxxxxxx
- dst[*index] = static_cast<char>(codepoint);
- (*index)++;
- } else if (codepoint < 2048) { // 1 << (6 + 5)
- if (*index + 1 >= len) {
- return false;
- }
- // 110xxxxx
- dst[(*index)++] = static_cast<char>(192 | (codepoint >> 6));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else if (codepoint < 65536) { // 1 << (6 * 2 + 4)
- if (*index + 2 >= len) {
- return false;
- }
- // 1110xxxx
- dst[(*index)++] = static_cast<char>(224 | (codepoint >> 12));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else if (codepoint < 2097152) { // 1 << (6 * 3 + 3)
- if (*index + 3 >= len) {
- return false;
- }
- // 11110xxx
- dst[(*index)++] = static_cast<char>(240 | (codepoint >> 18));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else if (codepoint < 67108864) { // 1 << (6 * 2 + 2)
- if (*index + 4 >= len) {
- return false;
- }
- // 111110xx
- dst[(*index)++] = static_cast<char>(248 | (codepoint >> 24));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else {
- if (*index + 5 >= len) {
- return false;
- }
- // 1111110x
- dst[(*index)++] = static_cast<char>(252 | (codepoint >> 30));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 24) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
+int GetNormalizedCodePoint(char32_t codepoint,
+ char32_t next_codepoint,
+ bool *next_is_consumed) {
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = false;
}
- return true;
+
+ if (codepoint <= 0x0020 || codepoint == 0x3000) {
+ // Whitespaces. Keep it as is.
+ return codepoint;
+ } else if ((0x0021 <= codepoint && codepoint <= 0x007E) ||
+ (0xFF01 <= codepoint && codepoint <= 0xFF5E)) {
+ // Ascii and fullwidth ascii. Keep it as is
+ return codepoint;
+ } else if (codepoint == 0x02DC || codepoint == 0x223C) {
+ // tilde
+ return 0xFF5E;
+ } else if (codepoint <= 0x3040 ||
+ (0x3100 <= codepoint && codepoint < 0xFF00) ||
+ codepoint == CODEPOINT_FOR_NULL_STR) {
+ // Keep it as is.
+ return codepoint;
+ }
+
+ // Below is Kana-related handling.
+
+ return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
}
-bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len){
- if (dst == NULL || len == NULL) {
+static bool GetExpectedString(
+ const char *src, char **dst, size_t *dst_len,
+ int (*get_codepoint_function)(char32_t, char32_t, bool*)) {
+ if (dst == NULL || dst_len == NULL) {
return false;
}
@@ -358,100 +292,63 @@
src = STR_FOR_NULL_STR;
}
- size_t src_len = strlen(src);
- int codepoints[MAX_CODEPOINTS];
- size_t new_len = 0;
+ char32_t codepoints[MAX_CODEPOINTS];
- size_t codepoint_index;
- {
- int i, next;
- for (codepoint_index = 0, i = 0, next = 0;
- static_cast<size_t>(i) < src_len &&
- codepoint_index < MAX_CODEPOINTS;
- i = next) {
- int codepoint = GetCodePointFromUtf8(src, src_len, i, &next);
- if (codepoint <= 0) {
- return false;
- }
- int tmp_next;
- int next_codepoint = GetCodePointFromUtf8(src, src_len,
- next, &tmp_next);
- bool next_is_consumed = false;
-
- // It is ok even if next_codepoint is negative.
- codepoints[codepoint_index] =
- GetPhoneticallySortableCodePoint(codepoint,
- next_codepoint,
- &next_is_consumed);
- // dakuten (voiced mark) or han-dakuten (half-voiced mark) existed.
- if (next_is_consumed) {
- next = tmp_next;
- }
-
- if (codepoints[codepoint_index] < 0) {
- // Do not increment codepoint_index.
- continue;
- }
-
- if (codepoints[codepoint_index] < 128) { // 1 << 7
- new_len++;
- } else if (codepoints[codepoint_index] < 2048) {
- // 1 << (6 + 5)
- new_len += 2;
- } else if (codepoints[codepoint_index] < 65536) {
- // 1 << (6 * 2 + 4)
- new_len += 3;
- } else if (codepoints[codepoint_index] < 2097152) {
- // 1 << (6 * 3 + 3)
- new_len += 4;
- } else if (codepoints[codepoint_index] < 67108864) {
- // 1 << (6 * 2 + 2)
- new_len += 5;
- } else {
- new_len += 6;
- }
-
- codepoint_index++;
+    size_t src_len = utf8_length(src);
+    if (src_len == 0) {
+        return false;
+    }
+    size_t j = 0;
+    // "i" is a byte offset into "src"; "j" counts the code points stored so
+    // far. Stop as soon as codepoints[] is full so that long input is
+    // truncated instead of being written past the end of the stack buffer.
+    for (size_t i = 0; i < src_len && j < MAX_CODEPOINTS;) {
+        int32_t ret = utf32_at(src, src_len, i, &i);
+        if (ret < 0) {
+            // failed to parse UTF-8
+            return false;
+        }
+
+        // Peek at the character that follows in "src" (0 when there is none).
+        // It must be decoded from the input: codepoints[] only holds the
+        // already-converted output and is not indexed by byte offset.
+        size_t after_next = i;
+        int32_t next = 0;
+        if (i < src_len) {
+            next = utf32_at(src, src_len, i, &after_next);
+            if (next < 0) {
+                // A malformed follower is reported by the next iteration;
+                // here it is simply treated as "no following character".
+                next = 0;
+                after_next = i;
+            }
+        }
+
+        bool next_is_consumed = false;
+        ret = get_codepoint_function(
+                static_cast<char32_t>(ret),
+                static_cast<char32_t>(next),
+                &next_is_consumed);
+        if (ret > 0) {
+            codepoints[j] = static_cast<char32_t>(ret);
+            j++;
+        }
+        if (next_is_consumed) {
+            // Skip the whole following character, not just its first byte.
+            i = after_next;
         }
     }
+ size_t length = j;
- if (codepoint_index == 0) {
+ if (length == 0) {
// If all of codepoints are invalid, we place the string at the end of
// the list.
codepoints[0] = 0x10000 + CODEPOINT_FOR_NULL_STR;
- codepoint_index = 1;
- new_len = 4;
+ length = 1;
}
- new_len += 1; // For '\0'.
-
- *dst = static_cast<char *>(malloc(sizeof(char) * new_len));
+ size_t new_len = utf8_length_from_utf32(codepoints, length);
+ *dst = static_cast<char *>(malloc(new_len + 1));
if (*dst == NULL) {
return false;
}
- size_t ch_index;
- {
- size_t i;
- for (i = 0, ch_index = 0; i < codepoint_index; i++) {
- if (!GetUtf8FromCodePoint(codepoints[i], *dst,
- new_len, &ch_index)) {
- free(*dst);
- *dst = NULL;
- return false;
- }
- }
- }
-
- if (ch_index != new_len - 1) {
+ if (utf32_to_utf8(codepoints, length, *dst, new_len + 1) != new_len) {
free(*dst);
*dst = NULL;
return false;
}
- (*dst)[new_len - 1] = '\0';
- *len = new_len;
+ *dst_len = new_len;
return true;
}
+bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len) {  // Public wrapper around GetExpectedString().
+ return GetExpectedString(src, dst, len, GetPhoneticallySortableCodePoint);  // Per-code-point mapping: GetPhoneticallySortableCodePoint; result is returned unchanged.
+}
+
+bool GetNormalizedString(const char *src, char **dst, size_t *len) {  // Public wrapper around GetExpectedString().
+ return GetExpectedString(src, dst, len, GetNormalizedCodePoint);  // Per-code-point mapping: GetNormalizedCodePoint; result is returned unchanged.
+}
+
} // namespace android
diff --git a/android/PhoneticStringUtils.h b/android/PhoneticStringUtils.h
index 7ebf9e0..9da7d29 100644
--- a/android/PhoneticStringUtils.h
+++ b/android/PhoneticStringUtils.h
@@ -18,6 +18,7 @@
#define _ANDROID_PHONETIC_STRING_UTILS_H
#include <string.h> // For size_t.
+#include <utils/String8.h>
namespace android {
@@ -31,10 +32,23 @@
// is "consumed" (e.g. Japanese halfwidth katakana's voiced mark is consumed
// when previous "codepoint" is appropriate). If the codepoint should not be
// considered when sorting (e.g. whitespaces), -1 is returned.
-int GetPhoneticallySortableCodePoint(int codepoint,
- int next_codepoint,
+int GetPhoneticallySortableCodePoint(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed);
+// Returns codepoint which is "normalized", whose definition depends on each
+// Locale. Note that currently this function normalizes only Japanese; all
+// other characters are left as is.
+// The variable "next_is_consumed" is set to true if "next_codepoint"
+// is "consumed" (e.g. Japanese halfwidth katakana's voiced mark is consumed
+// when previous "codepoint" is appropriate, like half-width "ka").
+//
+// In Japanese, "normalized" means that half-width and full-width katakana are
+// appropriately converted to hiragana.
+int GetNormalizedCodePoint(char32_t codepoint,
+ char32_t next_codepoint,
+ bool *next_is_consumed);
+
// Pushes Utf8 expression of "codepoint" to "dst". Returns true when successful.
// If input is invalid or the length of the destination is not enough,
// returns false.
@@ -47,6 +61,13 @@
// Note that currently this function considers only Japanese.
bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len);
+// Creates a "normalized" Utf8 string and stores it in "dst". *dst must be
+// freed by the caller after use.
+// If "src" is NULL or its length is 0, "dst" is set to \uFFFF.
+//
+// Note that currently this function considers only Japanese.
+bool GetNormalizedString(const char *src, char **dst, size_t *len);
+
} // namespace android
#endif
diff --git a/android/PhoneticStringUtilsTest.cpp b/android/PhoneticStringUtilsTest.cpp
index 0541007..356342e 100644
--- a/android/PhoneticStringUtilsTest.cpp
+++ b/android/PhoneticStringUtilsTest.cpp
@@ -20,6 +20,8 @@
#include <stdlib.h>
#include <string.h>
+#include <utils/String8.h>
+
using namespace android;
class TestExecutor {
@@ -29,13 +31,14 @@
private:
void DoOneTest(void (TestExecutor::*test)());
- void testGetCodePointFromUtf8();
+ void testUtf32At();
void testGetPhoneticallySortableCodePointAscii();
void testGetPhoneticallySortableCodePointKana();
void testGetPhoneticallySortableCodePointWhitespaceOnly();
void testGetPhoneticallySortableCodePointSimpleCompare();
- void testGetUtf8FromCodePoint();
+ void testGetUtf8FromUtf32();
void testGetPhoneticallySortableString();
+ void testGetNormalizedString();
// Note: When adding a test, do not forget to add it to DoOneTest().
@@ -64,13 +67,14 @@
bool TestExecutor::DoAllTests() {
- DoOneTest(&TestExecutor::testGetCodePointFromUtf8);
+ DoOneTest(&TestExecutor::testUtf32At);
DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointAscii);
DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointKana);
DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly);
DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare);
- DoOneTest(&TestExecutor::testGetUtf8FromCodePoint);
+ DoOneTest(&TestExecutor::testGetUtf8FromUtf32);
DoOneTest(&TestExecutor::testGetPhoneticallySortableString);
+ DoOneTest(&TestExecutor::testGetNormalizedString);
printf("Test total: %d\nSuccess: %d\nFailure: %d\n",
m_total_count, m_success_count, m_total_count - m_success_count);
@@ -90,36 +94,45 @@
m_success_count += m_success ? 1 : 0;
}
-void TestExecutor::testGetCodePointFromUtf8() {
- printf("testGetCodePointFromUtf8()\n");
- int next;
+// Test helper: decodes the UTF-8 character that starts at byte offset "index"
+// of "src" with utf32_at() and checks both the offset reported for the
+// following character ("expected_next") and the decoded code point
+// ("expected_value"). A decode failure or a wrong "next" offset clears
+// m_success; the code point itself is compared with EXPECT_EQ_VALUE.
+#define TEST_GET_UTF32AT(src, index, expected_next, expected_value) \
+    ({ \
+        size_t next; \
+        int32_t ret = utf32_at(src, strlen(src), index, &next); \
+        if (ret < 0) { \
+            printf("getUtf32At() returned negative value (src: %s, index: %d)\n", \
+                   (src), (index)); \
+            m_success = false; \
+        } else if (next != static_cast<size_t>(expected_next)) { \
+            /* %u expects unsigned int, so narrow the size_t explicitly. */ \
+            printf("next is unexpected value (src: %s, actual: %u, expected: %u)\n", \
+                   (src), static_cast<unsigned>(next), \
+                   static_cast<unsigned>(expected_next)); \
+            /* A wrong offset is a test failure, not just a log line. */ \
+            m_success = false; \
+        } else { \
+            EXPECT_EQ_VALUE(ret, (expected_value)); \
+        } \
+    })
- EXPECT_EQ_VALUE(GetCodePointFromUtf8("a", 1, 0, &next), 97);
- EXPECT_EQ_VALUE(next, 1);
+void TestExecutor::testUtf32At() {  // Checks utf32_at() on 1-, 3- and 4-byte UTF-8 sequences.
+ printf("testUtf32At()\n");
+
+ TEST_GET_UTF32AT("a", 0, 1, 97);  // 1-byte ASCII: next offset 1, value 97 ('a').
// Japanese hiragana "a"
- EXPECT_EQ_VALUE(GetCodePointFromUtf8("\xE3\x81\x82", 3, 0, &next), 0x3042);
- EXPECT_EQ_VALUE(next, 3);
+ TEST_GET_UTF32AT("\xE3\x81\x82", 0, 3, 0x3042);  // 3-byte sequence: next offset 3.
// Japanese fullwidth katakana "a" with ascii a
- EXPECT_EQ_VALUE(GetCodePointFromUtf8("a\xE3\x82\xA2", 4, 1, &next), 0x30A2);
- EXPECT_EQ_VALUE(next, 4);
+ TEST_GET_UTF32AT("a\xE3\x82\xA2", 1, 4, 0x30A2);  // Decoding starts at byte 1, right after the ASCII 'a'.
// 2 PUA
- ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88",
- 8, 0, &next), 0xFE000);
- ASSERT_EQ_VALUE(next, 4);
- ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88",
- 8, next, &next), 0xFE008);
- ASSERT_EQ_VALUE(next, 8);
+ TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 0, 4, 0xFE000);  // First 4-byte character: bytes 0..3.
+ TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 4, 8, 0xFE008);  // Second 4-byte character: bytes 4..7.
}
void TestExecutor::testGetPhoneticallySortableCodePointAscii() {
printf("testGetPhoneticallySortableCodePoint()\n");
int halfwidth[94];
int fullwidth[94];
- int i, codepoint;
+ int i;
+ char32_t codepoint;
bool next_is_consumed;
for (i = 0, codepoint = 0x0021; codepoint <= 0x007E; ++i, ++codepoint) {
- halfwidth[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
+ halfwidth[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
&next_is_consumed);
if (halfwidth[i] < 0) {
printf("returned value become negative at 0x%04X", codepoint);
@@ -133,7 +146,7 @@
}
}
for (i = 0, codepoint = 0xFF01; codepoint <= 0xFF5E; ++i, ++codepoint) {
- fullwidth[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
+ fullwidth[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
&next_is_consumed);
if (fullwidth[i] < 0) {
printf("returned value become negative at 0x%04X", codepoint);
@@ -156,11 +169,12 @@
printf("testGetPhoneticallySortableCodePointKana()\n");
int hiragana[86];
int fullwidth_katakana[86];
- int i, codepoint;
+ int i;
+ char32_t codepoint;
bool next_is_consumed;
for (i = 0, codepoint = 0x3041; codepoint <= 0x3096; ++i, ++codepoint) {
- hiragana[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
+ hiragana[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
&next_is_consumed);
if (hiragana[i] < 0) {
printf("returned value become negative at 0x%04X", codepoint);
@@ -175,7 +189,7 @@
}
for (i = 0, codepoint = 0x30A1; codepoint <= 0x30F6; ++i, ++codepoint) {
- fullwidth_katakana[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
+ fullwidth_katakana[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
&next_is_consumed);
if (fullwidth_katakana[i] < 0) {
printf("returned value become negative at 0x%04X", codepoint);
@@ -192,7 +206,7 @@
// hankaku-katakana space do not have some characters corresponding to
// zenkaku-hiragana (e.g. xwa, xka, xku). To make test easier, insert
// zenkaku-katakana version of them into this array (See the value 0x30??).
- int halfwidth_katakana[] = {
+ char32_t halfwidth_katakana[] = {
0xFF67, 0xFF71, 0xFF68, 0xFF72, 0xFF69, 0xFF73, 0xFF6A, 0xFF74, 0xFF6B,
0xFF75, 0xFF76, 0xFF76, 0xFF9E, 0xFF77, 0xFF77, 0xFF9E, 0xFF78, 0xFF78,
0xFF9E, 0xFF79, 0xFF79, 0xFF9E, 0xFF7A, 0xFF7A, 0xFF9E, 0xFF7B, 0xFF7B,
@@ -212,8 +226,8 @@
int j;
for (i = 0, j = 0; i < len && j < 86; ++i, ++j) {
- int codepoint = halfwidth_katakana[i];
- int next_codepoint = i + 1 < len ? halfwidth_katakana[i + 1] : -1;
+ char32_t codepoint = halfwidth_katakana[i];
+ char32_t next_codepoint = i + 1 < len ? halfwidth_katakana[i + 1] : 0;
halfwidth_katakana_result[j] =
GetPhoneticallySortableCodePoint(codepoint, next_codepoint,
&next_is_consumed);
@@ -232,7 +246,7 @@
}
void TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly() {
- printf("testGetPhoneticallySortableCodePointWhitespaceOnly");
+ printf("testGetPhoneticallySortableCodePointWhitespaceOnly()\n");
// Halfwidth space
int result = GetPhoneticallySortableCodePoint(0x0020, 0x0061, NULL);
ASSERT_EQ_VALUE(result, -1);
@@ -247,7 +261,7 @@
void TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare() {
printf("testGetPhoneticallySortableCodePointSimpleCompare()\n");
- int codepoints[] = {
+ char32_t codepoints[] = {
0x3042, 0x30AB, 0xFF7B, 0x305F, 0x30CA, 0xFF8A, 0x30D0, 0x3071,
0x307E, 0x30E4, 0xFF97, 0x308F, 0x3093, 0x3094, 'A', 'Z',
'0', '9', '!', '/', ':', '?', '[', '`', '{', '~'};
@@ -255,7 +269,7 @@
bool next_is_consumed;
for (size_t i = 0; i < len - 1; ++i) {
int codepoint_a =
- GetPhoneticallySortableCodePoint(codepoints[i], -1,
+ GetPhoneticallySortableCodePoint(codepoints[i], 0,
&next_is_consumed);
if (next_is_consumed) {
printf("next_is_consumed become true at 0x%04X", codepoint_a);
@@ -263,7 +277,7 @@
return;
}
int codepoint_b =
- GetPhoneticallySortableCodePoint(codepoints[i + 1], -1,
+ GetPhoneticallySortableCodePoint(codepoints[i + 1], 0,
&next_is_consumed);
if (next_is_consumed) {
printf("next_is_consumed become true at 0x%04X", codepoint_b);
@@ -280,20 +294,18 @@
}
}
-#define EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, i) \
+#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected) \
({ \
- index = i; \
- if (!GetUtf8FromCodePoint(codepoint, dst, 10, &index)) { \
+ char32_t codepoints[1] = {codepoint}; \
+ status_t ret = string8.setTo(codepoints, 1); \
+ if (ret != NO_ERROR) { \
printf("GetUtf8FromCodePoint() returned false at 0x%04X\n", codepoint); \
m_success = false; \
- } else if (index >= 10) { \
- printf("index (%d) >= 10\n", index); \
- m_success = false; \
} else { \
- dst[index] = '\0'; \
- if (strcmp(dst + i, expected) != 0) { \
+ const char* string = string8.string(); \
+ if (strcmp(string, expected) != 0) { \
printf("Failed at codepoint 0x%04X\n", codepoint); \
- for (const char *ch = dst; *ch != '\0'; ++ch) { \
+ for (const char *ch = string; *ch != '\0'; ++ch) { \
printf("0x%X ", *ch); \
} \
printf("!= "); \
@@ -306,14 +318,9 @@
} \
})
-#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected) \
- EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, 0)
-
-
-void TestExecutor::testGetUtf8FromCodePoint() {
- printf("testGetUtf8FromCodePoint()\n");
- size_t index = 0;
- char dst[10];
+void TestExecutor::testGetUtf8FromUtf32() {
+ printf("testGetUtf8FromUtf32()\n");
+ String8 string8;
EXPECT_EQ_CODEPOINT_UTF8('a', "\x61");
// Armenian capital letter AYB (2 bytes in UTF8)
@@ -325,15 +332,6 @@
// PUA (4 byets in UTF8)
EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96");
EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2");
-
- EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(0x058F, "\xD6\x8F", 3);
-
- index = 0;
- if (GetUtf8FromCodePoint(0x3043, dst, 2, &index)) {
- printf("GetUtf8FromCodePont() returned true even when destination length"
- "is not enough\n");
- m_success = false;
- }
}
#define EXPECT_EQ_UTF8_UTF8(src, expected) \
@@ -358,6 +356,7 @@
})
void TestExecutor::testGetPhoneticallySortableString() {
+ printf("testGetPhoneticallySortableString()\n");
char *dst;
size_t len;
@@ -373,6 +372,49 @@
EXPECT_EQ_UTF8_UTF8(" \t", "\xF0\x9F\xBF\xBD");
}
+#undef EXPECT_EQ_UTF8_UTF8
+
+// Test helper for GetNormalizedString(): normalizes "src", compares the
+// result with "expected" using strcmp(), dumps both strings as hex bytes on a
+// mismatch, and frees the returned buffer. It relies on "dst" and "len" being
+// declared in the calling scope; any failure clears m_success.
+#define EXPECT_EQ_UTF8_UTF8(src, expected) \
+    ({ \
+        if (!GetNormalizedString(src, &dst, &len)) { \
+            printf("GetNormalizedString() returned false.\n"); \
+            m_success = false; \
+        } else { \
+            if (strcmp(dst, expected) != 0) { \
+                /* Print bytes as unsigned so values >= 0x80 are not */ \
+                /* sign-extended to 0xFFFFFFxx. */ \
+                for (const char *ch = dst; *ch != '\0'; ++ch) { \
+                    printf("0x%X ", static_cast<unsigned char>(*ch)); \
+                } \
+                printf("!= "); \
+                for (const char *ch = expected; *ch != '\0'; ++ch) { \
+                    printf("0x%X ", static_cast<unsigned char>(*ch)); \
+                } \
+                printf("\n"); \
+                m_success = false; \
+            } \
+            free(dst); \
+        } \
+    })
+
+void TestExecutor::testGetNormalizedString() {  // Checks GetNormalizedString() through EXPECT_EQ_UTF8_UTF8(src, expected).
+ printf("testGetNormalizedString()\n");
+ char *dst;  // Used by EXPECT_EQ_UTF8_UTF8: receives the normalized string and is freed inside the macro.
+ size_t len;  // Used by EXPECT_EQ_UTF8_UTF8: receives the length reported by GetNormalizedString().
+
+ // Halfwidth (ASCII) letters and symbols must come back unchanged.
+ EXPECT_EQ_UTF8_UTF8("ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()",
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()");
+ EXPECT_EQ_UTF8_UTF8("abcdefghijklmnopqrstuvwxyz[]{}\\@/",
+ "abcdefghijklmnopqrstuvwxyz[]{}\\@/");
+
+ // Hiragana is kept; fullwidth katakana (U+30A4, U+30AA) and halfwidth katakana (U+FF74) must become hiragana.
+ EXPECT_EQ_UTF8_UTF8(
+ "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA",  // Input: U+3042 U+30A4 U+3046 U+FF74 U+30AA.
+ "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A");  // Expected: U+3042 U+3044 U+3046 U+3048 U+304A.
+
+ // Whitespace must be preserved by normalization, not dropped.
+ EXPECT_EQ_UTF8_UTF8(" \t", " \t");
+}
+
int main() {
TestExecutor executor;
if(executor.DoAllTests()) {
diff --git a/android/sqlite3_android.cpp b/android/sqlite3_android.cpp
index 27334ef..252a0c5 100644
--- a/android/sqlite3_android.cpp
+++ b/android/sqlite3_android.cpp
@@ -90,6 +90,25 @@
}
}
+// SQL scalar function taking exactly one text argument. The argument is run
+// through android::GetNormalizedString() and the normalized text becomes the
+// result. A wrong argument count yields SQL NULL; a failed normalization
+// yields an empty string.
+static void get_normalized_string(
+        sqlite3_context * context, int argc, sqlite3_value ** argv)
+{
+    if (argc == 1) {
+        const char * input =
+                reinterpret_cast<const char *>(sqlite3_value_text(argv[0]));
+        char * normalized = NULL;
+        size_t normalized_len = 0;
+
+        if (android::GetNormalizedString(input, &normalized, &normalized_len)) {
+            // free() is passed as the destructor for the returned buffer.
+            sqlite3_result_text(context, normalized, normalized_len, free);
+            return;
+        }
+
+        // Probably a broken string: report a zero-length string instead.
+        sqlite3_result_text(context, "", -1, SQLITE_STATIC);
+        return;
+    }
+
+    sqlite3_result_null(context);
+}
+
static void phone_numbers_equal(sqlite3_context * context, int argc, sqlite3_value ** argv)
{
if (argc != 2) {
@@ -161,7 +180,11 @@
sqlite3_result_null(context);
return;
}
-
+ if (strstr(path, "/../") != NULL) {
+ sqlite3_result_null(context);
+ return;
+ }
+
int err = unlink(path);
if (err != -1) {
// No error occured, return true
@@ -196,23 +219,53 @@
/**
* This function is invoked as:
*
- * _TOKENIZE('<token_table>', <data_row_id>, <data>, <delimiter>)
+ * _TOKENIZE('<token_table>', <data_row_id>, <data>, <delimiter>,
+ * <use_token_index>, <data_tag>)
*
- * It will then split data on each instance of delimiter and insert each token
- * into token_table's 'token' column with data_row_id in the 'source' column.
+ * If <use_token_index> is omitted, it is treated as 0.
+ * If <data_tag> is omitted, it is treated as NULL.
+ *
+ * It will split <data> on each instance of <delimiter> and insert each token
+ * into <token_table>. The following columns in <token_table> are used:
+ * token TEXT, source INTEGER, token_index INTEGER, tag (any type)
+ * The token_index column is not required if <use_token_index> is 0.
+ * The tag column is not required if <data_tag> is NULL.
+ *
+ * One row is inserted for each token in <data>.
+ * In each inserted row, 'source' is <data_row_id>.
+ * In the first inserted row, 'token' is the hex collation key of
+ * the entire <data> string, and 'token_index' is 0.
+ * In each row I (where 1 <= I < N, and N is the number of tokens in <data>)
+ * 'token' will be set to the hex collation key of the I:th token (0-based).
+ * If <use_token_index> != 0, 'token_index' is set to I.
+ * If <data_tag> is not NULL, 'tag' is set to <data_tag>.
+ *
+ * In other words, there will be one row for the entire string,
+ * and one row for each token except the first one.
+ *
* The function returns the number of tokens generated.
*/
static void tokenize(sqlite3_context * context, int argc, sqlite3_value ** argv)
{
//LOGD("enter tokenize");
int err;
+ int useTokenIndex = 0;
+ int useDataTag = 0;
- if (argc != 4) {
- LOGE("Tokenize requires 4 arguments");
+    if (argc < 4 || argc > 6) {  // Reject anything outside 4..6; the old "||" form could never be true.
+        LOGE("Tokenize requires 4 to 6 arguments");
         sqlite3_result_null(context);
         return;
     }
+ if (argc > 4) {
+ useTokenIndex = sqlite3_value_int(argv[4]);
+ }
+
+ if (argc > 5) {
+ useDataTag = (sqlite3_value_type(argv[5]) != SQLITE_NULL);
+ }
+
sqlite3 * handle = sqlite3_context_db_handle(context);
UCollator* collator = (UCollator*)sqlite3_user_data(context);
char const * tokenTable = (char const *)sqlite3_value_text(argv[0]);
@@ -225,7 +278,12 @@
// Get or create the prepared statement for the insertions
sqlite3_stmt * statement = (sqlite3_stmt *)sqlite3_get_auxdata(context, 0);
if (!statement) {
- char * sql = sqlite3_mprintf("INSERT INTO %s (token, source) VALUES (?, ?);", tokenTable);
+ char const * tokenIndexCol = useTokenIndex ? ", token_index" : "";
+ char const * tokenIndexParam = useTokenIndex ? ", ?" : "";
+ char const * dataTagCol = useDataTag ? ", tag" : "";
+ char const * dataTagParam = useDataTag ? ", ?" : "";
+ char * sql = sqlite3_mprintf("INSERT INTO %s (token, source%s%s) VALUES (?, ?%s%s);",
+ tokenTable, tokenIndexCol, dataTagCol, tokenIndexParam, dataTagParam);
err = sqlite3_prepare_v2(handle, sql, -1, &statement, NULL);
sqlite3_free(sql);
if (err) {
@@ -251,6 +309,17 @@
return;
}
+ // Bind <data_tag> to the tag column
+ if (useDataTag) {
+ int dataTagParamIndex = useTokenIndex ? 4 : 3;
+ err = sqlite3_bind_value(statement, dataTagParamIndex, argv[5]);
+ if (err != SQLITE_OK) {
+ LOGE("bind failed");
+ sqlite3_result_null(context);
+ return;
+ }
+ }
+
// Get the raw bytes for the string to tokenize
// the string will be modified by following code
// however, sqlite did not reuse the string, so it is safe to not dup it
@@ -299,6 +368,15 @@
break;
}
+ if (useTokenIndex) {
+ err = sqlite3_bind_int(statement, 3, numTokens);
+ if (err != SQLITE_OK) {
+ LOGE(" sqlite3_bind_int error %d", err);
+ free(base16buf);
+ break;
+ }
+ }
+
err = sqlite3_step(statement);
free(base16buf);
@@ -357,7 +435,15 @@
err = sqlite3_create_function(handle, "_TOKENIZE", 4, SQLITE_UTF16, collator, tokenize, NULL, NULL);
if (err != SQLITE_OK) {
return err;
- }
+ }
+ err = sqlite3_create_function(handle, "_TOKENIZE", 5, SQLITE_UTF16, collator, tokenize, NULL, NULL);
+ if (err != SQLITE_OK) {
+ return err;
+ }
+ err = sqlite3_create_function(handle, "_TOKENIZE", 6, SQLITE_UTF16, collator, tokenize, NULL, NULL);
+ if (err != SQLITE_OK) {
+ return err;
+ }
return SQLITE_OK;
}
@@ -422,5 +508,15 @@
return err;
}
+ // Register the GET_NORMALIZED_STRING function
+ err = sqlite3_create_function(handle,
+ "GET_NORMALIZED_STRING",
+ 1, SQLITE_UTF8, NULL,
+ get_normalized_string,
+ NULL, NULL);
+ if (err != SQLITE_OK) {
+ return err;
+ }
+
return SQLITE_OK;
}
diff --git a/dist/Android.mk b/dist/Android.mk
index 431533f..0200276 100644
--- a/dist/Android.mk
+++ b/dist/Android.mk
@@ -29,7 +29,8 @@
LOCAL_C_INCLUDES += $(call include-path-for, system-core)/cutils
LOCAL_SHARED_LIBRARIES += liblog \
libicuuc \
- libicui18n
+ libicui18n \
+ libutils
# include android specific methods
LOCAL_WHOLE_STATIC_LIBRARIES := libsqlite3_android