Merge korg/donut into korg/master
diff --git a/android/Android.mk b/android/Android.mk
index 9a6efc7..2ec4701 100644
--- a/android/Android.mk
+++ b/android/Android.mk
@@ -27,6 +27,6 @@
PhoneticStringUtils.cpp \
PhoneticStringUtilsTest.cpp
-LOCAL_MODULE_TAGS := tests optional
+LOCAL_MODULE_TAGS := optional
include $(BUILD_EXECUTABLE)
diff --git a/android/PhoneticStringUtils.cpp b/android/PhoneticStringUtils.cpp
index 5f8781c..24b1647 100644
--- a/android/PhoneticStringUtils.cpp
+++ b/android/PhoneticStringUtils.cpp
@@ -81,6 +81,156 @@
return codepoint;
}
+// Get hiragana from halfwidth katakana.
+static int GetHiraganaFromHalfwidthKatakana(int codepoint,
+ int next_codepoint,
+ bool *next_is_consumed) {
+ if (codepoint < 0xFF66 || 0xFF9F < codepoint) {
+ return codepoint;
+ }
+
+ switch (codepoint) {
+ case 0xFF66: // wo
+ return 0x3092;
+ case 0xFF67: // xa
+ return 0x3041;
+ case 0xFF68: // xi
+ return 0x3043;
+ case 0xFF69: // xu
+ return 0x3045;
+ case 0xFF6A: // xe
+ return 0x3047;
+ case 0xFF6B: // xo
+ return 0x3049;
+ case 0xFF6C: // xya
+ return 0x3083;
+ case 0xFF6D: // xyu
+ return 0x3085;
+ case 0xFF6E: // xyo
+ return 0x3087;
+ case 0xFF6F: // xtsu
+ return 0x3063;
+ case 0xFF70: // -
+ return 0x30FC;
+ case 0xFF9C: // wa
+ return 0x308F;
+ case 0xFF9D: // n
+ return 0x3093;
+ break;
+ default: {
+ if (0xFF71 <= codepoint && codepoint <= 0xFF75) {
+ // a, i, u, e, o
+ if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) {
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = true;
+ }
+ return 0x3094; // vu
+ } else {
+ return 0x3042 + (codepoint - 0xFF71) * 2;
+ }
+ } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) {
+ // ka - chi
+ if (next_codepoint == 0xFF9E) {
+ // "dakuten" (voiced mark)
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = true;
+ }
+ return 0x304B + (codepoint - 0xFF76) * 2 + 1;
+ } else {
+ return 0x304B + (codepoint - 0xFF76) * 2;
+ }
+ } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) {
+ // tsu, te, to (skip xtsu)
+ if (next_codepoint == 0xFF9E) {
+ // "dakuten" (voiced mark)
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = true;
+ }
+ return 0x3064 + (codepoint - 0xFF82) * 2 + 1;
+ } else {
+ return 0x3064 + (codepoint - 0xFF82) * 2;
+ }
+ } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) {
+ // na, ni, nu, ne, no
+ return 0x306A + (codepoint - 0xFF85);
+ } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) {
+ // ha, hi, hu, he, ho
+ if (next_codepoint == 0xFF9E) {
+ // "dakuten" (voiced mark)
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = true;
+ }
+ return 0x306F + (codepoint - 0xFF8A) * 3 + 1;
+ } else if (next_codepoint == 0xFF9F) {
+ // "han-dakuten" (half voiced mark)
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = true;
+ }
+ return 0x306F + (codepoint - 0xFF8A) * 3 + 2;
+ } else {
+ return 0x306F + (codepoint - 0xFF8A) * 3;
+ }
+ } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) {
+ // ma, mi, mu, me, mo
+ return 0x307E + (codepoint - 0xFF8F);
+ } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) {
+ // ya, yu, yo
+ return 0x3084 + (codepoint - 0xFF94) * 2;
+ } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) {
+ // ra, ri, ru, re, ro
+ return 0x3089 + (codepoint - 0xFF97);
+ }
+ // Note: 0xFF9C, 0xFF9D are handled above
+ } // end of default
+ }
+
+ return codepoint;
+}
+
+// Assuming input is hiragana, convert the hiragana to "normalized" hiragana.
+static int GetNormalizedHiragana(int codepoint) {
+ if (codepoint < 0x3040 || 0x309F < codepoint) {
+ return codepoint;
+ }
+
+ // TODO: should handle combining (semi-)voiced marks (0x3099, 0x309A).
+
+ // Trivial kana conversions.
+ // e.g. xa => a
+ switch (codepoint) {
+ case 0x3041:
+ case 0x3043:
+ case 0x3045:
+ case 0x3047:
+ case 0x3049:
+ case 0x308E: // xwa
+ return codepoint + 1;
+ case 0x3095: // xka
+ return 0x304B;
+ case 0x3096: // xku
+ return 0x304F;
+ default:
+ return codepoint;
+ }
+}
+
+static int GetNormalizedKana(int codepoint,
+ int next_codepoint,
+ bool *next_is_consumed) {
+ // First, convert fullwidth katakana and halfwidth katakana to hiragana.
+ if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
+ // Make fullwidth katakana same as hiragana.
+ // 96 == 0x30A1 - 0x3041
+ codepoint = codepoint - 96;
+ } else {
+ codepoint = GetHiraganaFromHalfwidthKatakana(
+ codepoint, next_codepoint, next_is_consumed);
+ }
+
+ // Normalize Hiragana.
+ return GetNormalizedHiragana(codepoint);
+}
+
int GetPhoneticallySortableCodePoint(int codepoint,
int next_codepoint,
bool *next_is_consumed) {
@@ -149,144 +299,39 @@
// Below is Kana-related handling.
- // First, convert fullwidth katakana and halfwidth katakana to hiragana
- if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
- // Make fullwidth katakana same as hiragana.
- // 96 == 0x30A1 - 0x3041c
- codepoint = codepoint - 96;
- } else if (0xFF66 <= codepoint && codepoint <= 0xFF9F) {
- // Make halfwidth katakana same as hiragana
- switch (codepoint) {
- case 0xFF66: // wo
- codepoint = 0x3092;
- break;
- case 0xFF67: // xa
- codepoint = 0x3041;
- break;
- case 0xFF68: // xi
- codepoint = 0x3043;
- break;
- case 0xFF69: // xu
- codepoint = 0x3045;
- break;
- case 0xFF6A: // xe
- codepoint = 0x3047;
- break;
- case 0xFF6B: // xo
- codepoint = 0x3049;
- break;
- case 0xFF6C: // xya
- codepoint = 0x3083;
- break;
- case 0xFF6D: // xyu
- codepoint = 0x3085;
- break;
- case 0xFF6E: // xyo
- codepoint = 0x3087;
- break;
- case 0xFF6F: // xtsu
- codepoint = 0x3063;
- break;
- case 0xFF70: // -
- codepoint = 0x30FC;
- break;
- case 0xFF9C: // wa
- codepoint = 0x308F;
- break;
- case 0xFF9D: // n
- codepoint = 0x3093;
- break;
- default:
- {
- if (0xFF71 <= codepoint && codepoint <= 0xFF75) {
- // a, i, u, e, o
- if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) {
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- codepoint = 0x3094; // vu
- } else {
- codepoint = 0x3042 + (codepoint - 0xFF71) * 2;
- }
- } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) {
- // ka - chi
- if (next_codepoint == 0xFF9E) {
- // "dakuten" (voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- codepoint = 0x304B + (codepoint - 0xFF76) * 2 + 1;
- } else {
- codepoint = 0x304B + (codepoint - 0xFF76) * 2;
- }
- } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) {
- // tsu, te, to (skip xtsu)
- if (next_codepoint == 0xFF9E) {
- // "dakuten" (voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- codepoint = 0x3064 + (codepoint - 0xFF82) * 2 + 1;
- } else {
- codepoint = 0x3064 + (codepoint - 0xFF82) * 2;
- }
- } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) {
- // na, ni, nu, ne, no
- codepoint = 0x306A + (codepoint - 0xFF85);
- } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) {
- // ha, hi, hu, he, ho
- if (next_codepoint == 0xFF9E) {
- // "dakuten" (voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- codepoint = 0x306F + (codepoint - 0xFF8A) * 3 + 1;
- } else if (next_codepoint == 0xFF9F) {
- // "han-dakuten" (half voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- codepoint = 0x306F + (codepoint - 0xFF8A) * 3 + 2;
- } else {
- codepoint = 0x306F + (codepoint - 0xFF8A) * 3;
- }
- } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) {
- // ma, mi, mu, me, mo
- codepoint = 0x307E + (codepoint - 0xFF8F);
- } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) {
- // ya, yu, yo
- codepoint = 0x3084 + (codepoint - 0xFF94) * 2;
- } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) {
- // ra, ri, ru, re, ro
- codepoint = 0x3089 + (codepoint - 0xFF97);
- }
- // Note: 0xFF9C, 0xFF9D are handled above
- } // end of default
- } // end of case
- }
-
- // Trivial kana conversions.
- // e.g. xa => a
- switch (codepoint) {
- case 0x3041:
- case 0x3043:
- case 0x3045:
- case 0x3047:
- case 0x3049:
- case 0x308E: // xwa
- codepoint++;
- break;
- case 0x3095: // xka
- codepoint = 0x304B;
- break;
- case 0x3096: // xku
- codepoint = 0x304F;
- break;
- }
-
- return codepoint;
+ return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
}
+int GetNormalizedCodePoint(int codepoint,
+ int next_codepoint,
+ bool *next_is_consumed) {
+ if (next_is_consumed != NULL) {
+ *next_is_consumed = false;
+ }
+
+ if (codepoint <= 0x0020 || codepoint == 0x3000) {
+ // Whitespaces. Keep it as is.
+ return codepoint;
+ } else if ((0x0021 <= codepoint && codepoint <= 0x007E) ||
+ (0xFF01 <= codepoint && codepoint <= 0xFF5E)) {
+ // Ascii and fullwidth ascii. Keep it as is
+ return codepoint;
+ } else if (codepoint == 0x02DC || codepoint == 0x223C) {
+ // tilde
+ return 0xFF5E;
+ } else if (codepoint <= 0x3040 ||
+ (0x3100 <= codepoint && codepoint < 0xFF00) ||
+ codepoint == CODEPOINT_FOR_NULL_STR) {
+ // Keep it as is.
+ return codepoint;
+ }
+
+ // Below is Kana-related handling.
+
+ return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
+}
+
+
bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index) {
if (codepoint < 128) { // 1 << 7
if (*index >= len) {
@@ -349,7 +394,9 @@
return true;
}
-bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len){
+static bool GetExpectedString(
+ const char *src, char **dst, size_t *len,
+ int (*get_codepoint_function)(int, int, bool*)) {
if (dst == NULL || len == NULL) {
return false;
}
@@ -380,9 +427,9 @@
// It is ok even if next_codepoint is negative.
codepoints[codepoint_index] =
- GetPhoneticallySortableCodePoint(codepoint,
- next_codepoint,
- &next_is_consumed);
+ get_codepoint_function(codepoint,
+ next_codepoint,
+ &next_is_consumed);
// dakuten (voiced mark) or han-dakuten (half-voiced mark) existed.
if (next_is_consumed) {
next = tmp_next;
@@ -454,4 +501,12 @@
return true;
}
+bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len) {
+ return GetExpectedString(src, dst, len, GetPhoneticallySortableCodePoint);
+}
+
+bool GetNormalizedString(const char *src, char **dst, size_t *len) {
+ return GetExpectedString(src, dst, len, GetNormalizedCodePoint);
+}
+
} // namespace android
diff --git a/android/PhoneticStringUtils.h b/android/PhoneticStringUtils.h
index 7ebf9e0..68a4928 100644
--- a/android/PhoneticStringUtils.h
+++ b/android/PhoneticStringUtils.h
@@ -35,6 +35,19 @@
int next_codepoint,
bool *next_is_consumed);
+// Returns the "normalized" form of "codepoint"; what that means depends on the
+// locale. Note that currently this function normalizes only Japanese; all
+// other characters are left as is.
+// The variable "next_is_consumed" is set to true if "next_codepoint"
+// is "consumed" (e.g. Japanese halfwidth katakana's voiced mark is consumed
+// when previous "codepoint" is appropriate, like half-width "ka").
+//
+// In Japanese, "normalized" means that half-width and full-width katakana are
+// appropriately converted to hiragana.
+int GetNormalizedCodePoint(int codepoint,
+ int next_codepoint,
+ bool *next_is_consumed);
+
// Pushes Utf8 expression of "codepoint" to "dst". Returns true when successful.
// If input is invalid or the length of the destination is not enough,
// returns false.
@@ -47,6 +60,13 @@
// Note that currently this function considers only Japanese.
bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len);
+// Creates a "normalized" Utf8 string and stores it in "*dst". The caller must
+// free "*dst" after use.
+// If "src" is NULL or its length is 0, "dst" is set to \uFFFF.
+//
+// Note that currently this function considers only Japanese.
+bool GetNormalizedString(const char *src, char **dst, size_t *len);
+
} // namespace android
#endif
diff --git a/android/PhoneticStringUtilsTest.cpp b/android/PhoneticStringUtilsTest.cpp
index 0541007..e74f67f 100644
--- a/android/PhoneticStringUtilsTest.cpp
+++ b/android/PhoneticStringUtilsTest.cpp
@@ -36,6 +36,7 @@
void testGetPhoneticallySortableCodePointSimpleCompare();
void testGetUtf8FromCodePoint();
void testGetPhoneticallySortableString();
+ void testGetNormalizedString();
// Note: When adding a test, do not forget to add it to DoOneTest().
@@ -71,6 +72,7 @@
DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare);
DoOneTest(&TestExecutor::testGetUtf8FromCodePoint);
DoOneTest(&TestExecutor::testGetPhoneticallySortableString);
+ DoOneTest(&TestExecutor::testGetNormalizedString);
printf("Test total: %d\nSuccess: %d\nFailure: %d\n",
m_total_count, m_success_count, m_total_count - m_success_count);
@@ -232,7 +234,7 @@
}
void TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly() {
- printf("testGetPhoneticallySortableCodePointWhitespaceOnly");
+ printf("testGetPhoneticallySortableCodePointWhitespaceOnly()\n");
// Halfwidth space
int result = GetPhoneticallySortableCodePoint(0x0020, 0x0061, NULL);
ASSERT_EQ_VALUE(result, -1);
@@ -358,6 +360,7 @@
})
void TestExecutor::testGetPhoneticallySortableString() {
+ printf("testGetPhoneticallySortableString()\n");
char *dst;
size_t len;
@@ -373,6 +376,49 @@
EXPECT_EQ_UTF8_UTF8(" \t", "\xF0\x9F\xBF\xBD");
}
+#undef EXPECT_EQ_UTF8_UTF8
+// Fails the test unless GetNormalizedString(src) yields exactly "expected".
+#define EXPECT_EQ_UTF8_UTF8(src, expected) \
+    ({ \
+        if (!GetNormalizedString(src, &dst, &len)) { \
+            printf("GetNormalizedString() returned false.\n"); \
+            m_success = false; \
+        } else { \
+            if (strcmp(dst, expected) != 0) { \
+                for (const char *ch = dst; *ch != '\0'; ++ch) { \
+                    printf("0x%X ", (unsigned char)*ch); /* no sign-extension */ \
+                } \
+                printf("!= "); \
+                for (const char *ch = expected; *ch != '\0'; ++ch) { \
+                    printf("0x%X ", (unsigned char)*ch); \
+                } \
+                printf("\n"); \
+                m_success = false; \
+            } \
+            free(dst); \
+        } \
+    })
+
+void TestExecutor::testGetNormalizedString() {
+ printf("testGetNormalizedString()\n");
+ char *dst;
+ size_t len;
+
+ // halfwidth alphabets/symbols -> keep it as is.
+ EXPECT_EQ_UTF8_UTF8("ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()",
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()");
+ EXPECT_EQ_UTF8_UTF8("abcdefghijklmnopqrstuvwxyz[]{}\\@/",
+ "abcdefghijklmnopqrstuvwxyz[]{}\\@/");
+
+ // halfwidth/fullwidth-katakana -> hiragana
+ EXPECT_EQ_UTF8_UTF8(
+ "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA",
+ "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A");
+
+ // whitespace -> keep it as is.
+ EXPECT_EQ_UTF8_UTF8(" \t", " \t");
+}
+
int main() {
TestExecutor executor;
if(executor.DoAllTests()) {
diff --git a/android/sqlite3_android.cpp b/android/sqlite3_android.cpp
index 27334ef..252a0c5 100644
--- a/android/sqlite3_android.cpp
+++ b/android/sqlite3_android.cpp
@@ -90,6 +90,25 @@
}
}
+static void get_normalized_string(
+ sqlite3_context * context, int argc, sqlite3_value ** argv)
+{
+ if (argc != 1) {
+ sqlite3_result_null(context);
+ return;
+ }
+ char const * src = (char const *)sqlite3_value_text(argv[0]);
+ char * ret;
+ size_t len;
+
+ if (!android::GetNormalizedString(src, &ret, &len)) {
+ // Probably broken string. Return 0 length string.
+ sqlite3_result_text(context, "", -1, SQLITE_STATIC);
+ } else {
+ sqlite3_result_text(context, ret, len, free);
+ }
+}
+
static void phone_numbers_equal(sqlite3_context * context, int argc, sqlite3_value ** argv)
{
if (argc != 2) {
@@ -161,7 +180,11 @@
sqlite3_result_null(context);
return;
}
-
+ if (strstr(path, "/../") != NULL) {
+ sqlite3_result_null(context);
+ return;
+ }
+
int err = unlink(path);
if (err != -1) {
// No error occured, return true
@@ -196,23 +219,53 @@
/**
* This function is invoked as:
*
- * _TOKENIZE('<token_table>', <data_row_id>, <data>, <delimiter>)
+ * _TOKENIZE('<token_table>', <data_row_id>, <data>, <delimiter>,
+ * <use_token_index>, <data_tag>)
*
- * It will then split data on each instance of delimiter and insert each token
- * into token_table's 'token' column with data_row_id in the 'source' column.
+ * If <use_token_index> is omitted, it is treated as 0.
+ * If <data_tag> is omitted, it is treated as NULL.
+ *
+ * It will split <data> on each instance of <delimiter> and insert each token
+ * into <token_table>. The following columns in <token_table> are used:
+ * token TEXT, source INTEGER, token_index INTEGER, tag (any type)
+ * The token_index column is not required if <use_token_index> is 0.
+ * The tag column is not required if <data_tag> is NULL.
+ *
+ * One row is inserted for each token in <data>.
+ * In each inserted row, 'source' is <data_row_id>.
+ * In the first inserted row, 'token' is the hex collation key of
+ * the entire <data> string, and 'token_index' is 0.
+ * In each row I (where 1 <= I < N, and N is the number of tokens in <data>)
+ * 'token' will be set to the hex collation key of the I:th token (0-based).
+ * If <use_token_index> != 0, 'token_index' is set to I.
+ * If <data_tag> is not NULL, 'tag' is set to <data_tag>.
+ *
+ * In other words, there will be one row for the entire string,
+ * and one row for each token except the first one.
+ *
* The function returns the number of tokens generated.
*/
static void tokenize(sqlite3_context * context, int argc, sqlite3_value ** argv)
{
//LOGD("enter tokenize");
int err;
+ int useTokenIndex = 0;
+ int useDataTag = 0;
- if (argc != 4) {
- LOGE("Tokenize requires 4 arguments");
+ if (argc < 4 || argc > 6) {  // accepts 4, 5 or 6 arguments
+ LOGE("Tokenize requires 4 to 6 arguments");
sqlite3_result_null(context);
return;
}
+ if (argc > 4) {
+ useTokenIndex = sqlite3_value_int(argv[4]);
+ }
+
+ if (argc > 5) {
+ useDataTag = (sqlite3_value_type(argv[5]) != SQLITE_NULL);
+ }
+
sqlite3 * handle = sqlite3_context_db_handle(context);
UCollator* collator = (UCollator*)sqlite3_user_data(context);
char const * tokenTable = (char const *)sqlite3_value_text(argv[0]);
@@ -225,7 +278,12 @@
// Get or create the prepared statement for the insertions
sqlite3_stmt * statement = (sqlite3_stmt *)sqlite3_get_auxdata(context, 0);
if (!statement) {
- char * sql = sqlite3_mprintf("INSERT INTO %s (token, source) VALUES (?, ?);", tokenTable);
+ char const * tokenIndexCol = useTokenIndex ? ", token_index" : "";
+ char const * tokenIndexParam = useTokenIndex ? ", ?" : "";
+ char const * dataTagCol = useDataTag ? ", tag" : "";
+ char const * dataTagParam = useDataTag ? ", ?" : "";
+ char * sql = sqlite3_mprintf("INSERT INTO %s (token, source%s%s) VALUES (?, ?%s%s);",
+ tokenTable, tokenIndexCol, dataTagCol, tokenIndexParam, dataTagParam);
err = sqlite3_prepare_v2(handle, sql, -1, &statement, NULL);
sqlite3_free(sql);
if (err) {
@@ -251,6 +309,17 @@
return;
}
+ // Bind <data_tag> to the tag column
+ if (useDataTag) {
+ int dataTagParamIndex = useTokenIndex ? 4 : 3;
+ err = sqlite3_bind_value(statement, dataTagParamIndex, argv[5]);
+ if (err != SQLITE_OK) {
+ LOGE("bind failed");
+ sqlite3_result_null(context);
+ return;
+ }
+ }
+
// Get the raw bytes for the string to tokenize
// the string will be modified by following code
// however, sqlite did not reuse the string, so it is safe to not dup it
@@ -299,6 +368,15 @@
break;
}
+ if (useTokenIndex) {
+ err = sqlite3_bind_int(statement, 3, numTokens);
+ if (err != SQLITE_OK) {
+ LOGE(" sqlite3_bind_int error %d", err);
+ free(base16buf);
+ break;
+ }
+ }
+
err = sqlite3_step(statement);
free(base16buf);
@@ -357,7 +435,15 @@
err = sqlite3_create_function(handle, "_TOKENIZE", 4, SQLITE_UTF16, collator, tokenize, NULL, NULL);
if (err != SQLITE_OK) {
return err;
- }
+ }
+ err = sqlite3_create_function(handle, "_TOKENIZE", 5, SQLITE_UTF16, collator, tokenize, NULL, NULL);
+ if (err != SQLITE_OK) {
+ return err;
+ }
+ err = sqlite3_create_function(handle, "_TOKENIZE", 6, SQLITE_UTF16, collator, tokenize, NULL, NULL);
+ if (err != SQLITE_OK) {
+ return err;
+ }
return SQLITE_OK;
}
@@ -422,5 +508,15 @@
return err;
}
+ // Register the GET_NORMALIZED_STRING function
+ err = sqlite3_create_function(handle,
+ "GET_NORMALIZED_STRING",
+ 1, SQLITE_UTF8, NULL,
+ get_normalized_string,
+ NULL, NULL);
+ if (err != SQLITE_OK) {
+ return err;
+ }
+
return SQLITE_OK;
}