| /* |
| * Copyright (C) 2009 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "PhoneticStringUtils.h" |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include <utils/String8.h> |
| |
| using namespace android; |
| |
// Small self-contained harness that runs the PhoneticStringUtils tests.
//
// Every test is a private, parameterless member function. A test signals
// failure by clearing m_success (normally through the ASSERT_EQ_VALUE /
// EXPECT_EQ_VALUE macros); DoOneTest() folds that flag into the counters
// that DoAllTests() reports.
class TestExecutor {
 public:
    TestExecutor()
        : m_total_count(0),
          m_success_count(0),
          m_success(true) {
    }

    // Runs all tests, prints a summary and returns true when none failed.
    bool DoAllTests();

 private:
    // Pointer to one of the test methods declared below.
    typedef void (TestExecutor::*TestMethod)();

    // Runs a single test and updates the total/success counters.
    void DoOneTest(TestMethod test);

    void testUtf32At();
    void testGetPhoneticallySortableCodePointAscii();
    void testGetPhoneticallySortableCodePointKana();
    void testGetPhoneticallySortableCodePointWhitespaceOnly();
    void testGetPhoneticallySortableCodePointSimpleCompare();
    void testGetUtf8FromUtf32();
    void testGetPhoneticallySortableString();
    void testGetNormalizedString();

    // Note: a newly added test must also be invoked from DoAllTests().

    int m_total_count;    // Number of tests executed so far.
    int m_success_count;  // Number of executed tests that passed.

    bool m_success;       // Outcome of the test that is currently running.
};
| |
// Fatal equality check for use inside TestExecutor test methods.
//
// Compares (input) with (expected). On mismatch it prints both values in
// hex, clears m_success and returns from the enclosing void test method, so
// the remaining checks of that test are skipped.
//
// Written as do { } while (0) so it is standard C++ (no GNU statement
// expression) and behaves as one statement, e.g. in an unbraced if/else.
// Both arguments are evaluated again when a failure is reported, so they
// must not have side effects. The values are converted to unsigned int
// because that is the type "%X" expects.
#define ASSERT_EQ_VALUE(input, expected)                                    \
    do {                                                                    \
        if ((expected) != (input)) {                                        \
            printf("0x%X(result) != 0x%X(expected)\n",                      \
                   static_cast<unsigned int>(input),                        \
                   static_cast<unsigned int>(expected));                    \
            m_success = false;                                              \
            return;                                                         \
        }                                                                   \
    } while (0)
| |
// Non-fatal equality check for use inside TestExecutor test methods.
//
// Compares (input) with (expected). On mismatch it prints both values in
// hex and clears m_success, but lets the enclosing test continue so that
// further mismatches are reported as well.
//
// Written as do { } while (0) so it is standard C++ (no GNU statement
// expression) and behaves as one statement. Both arguments are evaluated
// again when a failure is reported, so they must not have side effects.
// The values are converted to unsigned int because that is the type "%X"
// expects.
#define EXPECT_EQ_VALUE(input, expected)                                    \
    do {                                                                    \
        if ((expected) != (input)) {                                        \
            printf("0x%X(result) != 0x%X(expected)\n",                      \
                   static_cast<unsigned int>(input),                        \
                   static_cast<unsigned int>(expected));                    \
            m_success = false;                                              \
        }                                                                   \
    } while (0)
| |
| |
| bool TestExecutor::DoAllTests() { |
| DoOneTest(&TestExecutor::testUtf32At); |
| DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointAscii); |
| DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointKana); |
| DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly); |
| DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare); |
| DoOneTest(&TestExecutor::testGetUtf8FromUtf32); |
| DoOneTest(&TestExecutor::testGetPhoneticallySortableString); |
| DoOneTest(&TestExecutor::testGetNormalizedString); |
| |
| printf("Test total: %d\nSuccess: %d\nFailure: %d\n", |
| m_total_count, m_success_count, m_total_count - m_success_count); |
| |
| bool success = m_total_count == m_success_count; |
| printf("\n%s\n", success ? "Success" : "Failure"); |
| |
| return success; |
| } |
| |
| void TestExecutor::DoOneTest(void (TestExecutor::*test)()) { |
| m_success = true; |
| |
| (this->*test)(); |
| |
| ++m_total_count; |
| m_success_count += m_success ? 1 : 0; |
| } |
| |
// Decodes the code point of `src` that starts at byte offset `index` with
// utf32_at() and checks both the decoded value and the reported offset of
// the following character.
//
// The test is marked as failed (m_success = false) when
//   - utf32_at() returns a negative value,
//   - the reported next offset differs from `expected_next`, or
//   - the decoded value differs from `expected_value` (via EXPECT_EQ_VALUE).
// A wrong next offset used to be printed without failing the test; it now
// clears m_success like the other two cases.
//
// `src` must be a NUL-terminated string and is evaluated more than once.
// Offsets are printed with "%zu" since they are size_t values.
#define TEST_GET_UTF32AT(src, index, expected_next, expected_value)         \
    do {                                                                    \
        size_t next_index_ = 0;                                             \
        const int32_t code_point_ =                                         \
            utf32_at((src), strlen(src), (index), &next_index_);            \
        if (code_point_ < 0) {                                              \
            printf("getUtf32At() returned negative value "                  \
                   "(src: %s, index: %d)\n",                                \
                   (src), static_cast<int>(index));                         \
            m_success = false;                                              \
        } else if (next_index_ != static_cast<size_t>(expected_next)) {     \
            printf("next is unexpected value "                              \
                   "(src: %s, actual: %zu, expected: %zu)\n",               \
                   (src), next_index_,                                      \
                   static_cast<size_t>(expected_next));                     \
            m_success = false;                                              \
        } else {                                                            \
            EXPECT_EQ_VALUE(code_point_, (expected_value));                 \
        }                                                                   \
    } while (0)
| |
| void TestExecutor::testUtf32At() { |
| printf("testUtf32At()\n"); |
| |
| TEST_GET_UTF32AT("a", 0, 1, 97); |
| // Japanese hiragana "a" |
| TEST_GET_UTF32AT("\xE3\x81\x82", 0, 3, 0x3042); |
| // Japanese fullwidth katakana "a" with ascii a |
| TEST_GET_UTF32AT("a\xE3\x82\xA2", 1, 4, 0x30A2); |
| |
| // 2 PUA |
| TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 0, 4, 0xFE000); |
| TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 4, 8, 0xFE008); |
| } |
| |
| void TestExecutor::testGetPhoneticallySortableCodePointAscii() { |
| printf("testGetPhoneticallySortableCodePoint()\n"); |
| int halfwidth[94]; |
| int fullwidth[94]; |
| int i; |
| char32_t codepoint; |
| bool next_is_consumed; |
| for (i = 0, codepoint = 0x0021; codepoint <= 0x007E; ++i, ++codepoint) { |
| halfwidth[i] = GetPhoneticallySortableCodePoint(codepoint, -1, |
| &next_is_consumed); |
| if (halfwidth[i] < 0) { |
| printf("returned value become negative at 0x%04X", codepoint); |
| m_success = false; |
| return; |
| } |
| if (next_is_consumed) { |
| printf("next_is_consumed become true at 0x%04X", codepoint); |
| m_success = false; |
| return; |
| } |
| } |
| for (i = 0, codepoint = 0xFF01; codepoint <= 0xFF5E; ++i, ++codepoint) { |
| fullwidth[i] = GetPhoneticallySortableCodePoint(codepoint, -1, |
| &next_is_consumed); |
| if (fullwidth[i] < 0) { |
| printf("returned value become negative at 0x%04X", codepoint); |
| m_success = false; |
| return; |
| } |
| if (next_is_consumed) { |
| printf("next_is_consumed become true at 0x%04X", codepoint); |
| m_success = false; |
| return; |
| } |
| } |
| |
| for (i = 0; i < 94; i++) { |
| EXPECT_EQ_VALUE(halfwidth[i], fullwidth[i]); |
| } |
| } |
| |
| void TestExecutor::testGetPhoneticallySortableCodePointKana() { |
| printf("testGetPhoneticallySortableCodePointKana()\n"); |
| int hiragana[86]; |
| int fullwidth_katakana[86]; |
| int i; |
| char32_t codepoint; |
| bool next_is_consumed; |
| |
| for (i = 0, codepoint = 0x3041; codepoint <= 0x3096; ++i, ++codepoint) { |
| hiragana[i] = GetPhoneticallySortableCodePoint(codepoint, -1, |
| &next_is_consumed); |
| if (hiragana[i] < 0) { |
| printf("returned value become negative at 0x%04X", codepoint); |
| m_success = false; |
| return; |
| } |
| if (next_is_consumed) { |
| printf("next_is_consumed become true at 0x%04X", codepoint); |
| m_success = false; |
| return; |
| } |
| } |
| |
| for (i = 0, codepoint = 0x30A1; codepoint <= 0x30F6; ++i, ++codepoint) { |
| fullwidth_katakana[i] = GetPhoneticallySortableCodePoint(codepoint, -1, |
| &next_is_consumed); |
| if (fullwidth_katakana[i] < 0) { |
| printf("returned value become negative at 0x%04X", codepoint); |
| m_success = false; |
| return; |
| } |
| if (next_is_consumed) { |
| printf("next_is_consumed become true at 0x%04X", codepoint); |
| m_success = false; |
| return; |
| } |
| } |
| |
| // hankaku-katakana space do not have some characters corresponding to |
| // zenkaku-hiragana (e.g. xwa, xka, xku). To make test easier, insert |
| // zenkaku-katakana version of them into this array (See the value 0x30??). |
| int halfwidth_katakana[] = { |
| 0xFF67, 0xFF71, 0xFF68, 0xFF72, 0xFF69, 0xFF73, 0xFF6A, 0xFF74, 0xFF6B, |
| 0xFF75, 0xFF76, 0xFF76, 0xFF9E, 0xFF77, 0xFF77, 0xFF9E, 0xFF78, 0xFF78, |
| 0xFF9E, 0xFF79, 0xFF79, 0xFF9E, 0xFF7A, 0xFF7A, 0xFF9E, 0xFF7B, 0xFF7B, |
| 0xFF9E, 0xFF7C, 0xFF7C, 0xFF9E, 0xFF7D, 0xFF7D, 0xFF9E, 0xFF7E, 0xFF7E, |
| 0xFF9E, 0xFF7F, 0xFF7F, 0xFF9E, 0xFF80, 0xFF80, 0xFF9E, 0xFF81, 0xFF81, |
| 0xFF9E, 0xFF6F, 0xFF82, 0xFF82, 0xFF9E, 0xFF83, 0xFF83, 0xFF9E, 0xFF84, |
| 0xFF84, 0xFF9E, 0xFF85, 0xFF86, 0xFF87, 0xFF88, 0xFF89, 0xFF8A, 0xFF8A, |
| 0xFF9E, 0xFF8A, 0xFF9F, 0xFF8B, 0xFF8B, 0xFF9E, 0xFF8B, 0xFF9F, 0xFF8C, |
| 0xFF8C, 0xFF9E, 0xFF8C, 0xFF9F, 0xFF8D, 0xFF8D, 0xFF9E, 0xFF8D, 0xFF9F, |
| 0xFF8E, 0xFF8E, 0xFF9E, 0xFF8E, 0xFF9F, 0xFF8F, 0xFF90, 0xFF91, 0xFF92, |
| 0xFF93, 0xFF6C, 0xFF94, 0xFF6D, 0xFF95, 0xFF6E, 0xFF96, 0xFF97, 0xFF98, |
| 0xFF99, 0xFF9A, 0xFF9B, 0x30EE, 0xFF9C, 0x30F0, 0x30F1, 0xFF66, 0xFF9D, |
| 0xFF73, 0xFF9E, 0x30F5, 0x30F6}; |
| int len = sizeof(halfwidth_katakana)/sizeof(int); |
| |
| int halfwidth_katakana_result[86]; |
| |
| int j; |
| for (i = 0, j = 0; i < len && j < 86; ++i, ++j) { |
| int codepoint = halfwidth_katakana[i]; |
| int next_codepoint = i + 1 < len ? halfwidth_katakana[i + 1] : -1; |
| halfwidth_katakana_result[j] = |
| GetPhoneticallySortableCodePoint(codepoint, next_codepoint, |
| &next_is_consumed); |
| // Consume voiced mark/half-voiced mark. |
| if (next_is_consumed) { |
| ++i; |
| } |
| } |
| ASSERT_EQ_VALUE(i, len); |
| ASSERT_EQ_VALUE(j, 86); |
| |
| for (i = 0; i < 86; ++i) { |
| EXPECT_EQ_VALUE(fullwidth_katakana[i], hiragana[i]); |
| EXPECT_EQ_VALUE(halfwidth_katakana_result[i], hiragana[i]); |
| } |
| } |
| |
| void TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly() { |
| printf("testGetPhoneticallySortableCodePointWhitespaceOnly()\n"); |
| // Halfwidth space |
| int result = GetPhoneticallySortableCodePoint(0x0020, 0x0061, NULL); |
| ASSERT_EQ_VALUE(result, -1); |
| // Fullwidth space |
| result = GetPhoneticallySortableCodePoint(0x3000, 0x0062, NULL); |
| ASSERT_EQ_VALUE(result, -1); |
| // tab |
| result = GetPhoneticallySortableCodePoint(0x0009, 0x0062, NULL); |
| ASSERT_EQ_VALUE(result, -1); |
| } |
| |
| void TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare() { |
| printf("testGetPhoneticallySortableCodePointSimpleCompare()\n"); |
| |
| char32_t codepoints[] = { |
| 0x3042, 0x30AB, 0xFF7B, 0x305F, 0x30CA, 0xFF8A, 0x30D0, 0x3071, |
| 0x307E, 0x30E4, 0xFF97, 0x308F, 0x3093, 0x3094, 'A', 'Z', |
| '0', '9', '!', '/', ':', '?', '[', '`', '{', '~'}; |
| size_t len = sizeof(codepoints)/sizeof(int); |
| bool next_is_consumed; |
| for (size_t i = 0; i < len - 1; ++i) { |
| int codepoint_a = |
| GetPhoneticallySortableCodePoint(codepoints[i], -1, |
| &next_is_consumed); |
| if (next_is_consumed) { |
| printf("next_is_consumed become true at 0x%04X", codepoint_a); |
| m_success = false; |
| return; |
| } |
| int codepoint_b = |
| GetPhoneticallySortableCodePoint(codepoints[i + 1], -1, |
| &next_is_consumed); |
| if (next_is_consumed) { |
| printf("next_is_consumed become true at 0x%04X", codepoint_b); |
| m_success = false; |
| return; |
| } |
| |
| if (codepoint_a >= codepoint_b) { |
| printf("0x%04X (from 0x%04X) >= 0x%04X (from 0x%04X)\n", |
| codepoint_a, codepoints[i], codepoint_b, codepoints[i + 1]); |
| m_success = false; |
| return; |
| } |
| } |
| } |
| |
// Converts the single UTF-32 `codepoint` to UTF-8 with String8::setTo() and
// compares the result with the NUL-terminated string `expected`.
//
// Requirements on the expansion site: a String8 named `string8` and the
// TestExecutor member `m_success` must be in scope.
//
// On a conversion error or a mismatch, m_success is cleared; for a mismatch
// both byte sequences are dumped in hex. Bytes are converted to unsigned
// char before printing so that values >= 0x80 appear as 0xE3 instead of a
// sign-extended 0xFFFFFFE3.
//
// NOTE(review): the error text names GetUtf8FromCodePoint() although the
// call made here is String8::setTo(); the text is kept unchanged.
#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected)                       \
    do {                                                                    \
        const char32_t utf32_input_[1] = {                                  \
            static_cast<char32_t>(codepoint) };                             \
        const status_t status_ = string8.setTo(utf32_input_, 1);            \
        if (status_ != NO_ERROR) {                                          \
            printf("GetUtf8FromCodePoint() returned false at 0x%04X\n",     \
                   static_cast<unsigned int>(codepoint));                   \
            m_success = false;                                              \
        } else {                                                            \
            const char* const actual_utf8_ = string8.string();              \
            if (strcmp(actual_utf8_, (expected)) != 0) {                    \
                printf("Failed at codepoint 0x%04X\n",                      \
                       static_cast<unsigned int>(codepoint));               \
                for (const char* byte_ = actual_utf8_; *byte_ != '\0';      \
                     ++byte_) {                                             \
                    printf("0x%X ", static_cast<unsigned int>(              \
                            static_cast<unsigned char>(*byte_)));           \
                }                                                           \
                printf("!= ");                                              \
                for (const char* byte_ = (expected); *byte_ != '\0';        \
                     ++byte_) {                                             \
                    printf("0x%X ", static_cast<unsigned int>(              \
                            static_cast<unsigned char>(*byte_)));           \
                }                                                           \
                printf("\n");                                               \
                m_success = false;                                          \
            }                                                               \
        }                                                                   \
    } while (0)
| |
| void TestExecutor::testGetUtf8FromUtf32() { |
| printf("testGetUtf8FromUtf32()\n"); |
| String8 string8; |
| |
| EXPECT_EQ_CODEPOINT_UTF8('a', "\x61"); |
| // Armenian capital letter AYB (2 bytes in UTF8) |
| EXPECT_EQ_CODEPOINT_UTF8(0x0530, "\xD4\xB0"); |
| // Japanese 'a' (3 bytes in UTF8) |
| EXPECT_EQ_CODEPOINT_UTF8(0x3042, "\xE3\x81\x82"); |
| // Kanji |
| EXPECT_EQ_CODEPOINT_UTF8(0x65E5, "\xE6\x97\xA5"); |
| // PUA (4 byets in UTF8) |
| EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96"); |
| EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2"); |
| } |
| |
// Converts the UTF-8 string `src` with GetPhoneticallySortableString() and
// compares the output with the NUL-terminated string `expected`.
//
// Requirements on the expansion site: `char *dst`, `size_t len` and the
// TestExecutor member `m_success` must be in scope. The buffer returned in
// `dst` is released with free() after the comparison.
//
// On a conversion failure or a mismatch, m_success is cleared; for a
// mismatch both byte sequences are dumped in hex. Bytes are converted to
// unsigned char before printing so that values >= 0x80 appear as 0xE3
// instead of a sign-extended 0xFFFFFFE3.
#define EXPECT_EQ_UTF8_UTF8(src, expected)                                  \
    do {                                                                    \
        if (!GetPhoneticallySortableString((src), &dst, &len)) {            \
            printf("GetPhoneticallySortableString() returned false.\n");    \
            m_success = false;                                              \
        } else {                                                            \
            if (strcmp(dst, (expected)) != 0) {                             \
                for (const char* byte_ = dst; *byte_ != '\0'; ++byte_) {    \
                    printf("0x%X ", static_cast<unsigned int>(              \
                            static_cast<unsigned char>(*byte_)));           \
                }                                                           \
                printf("!= ");                                              \
                for (const char* byte_ = (expected); *byte_ != '\0';        \
                     ++byte_) {                                             \
                    printf("0x%X ", static_cast<unsigned int>(              \
                            static_cast<unsigned char>(*byte_)));           \
                }                                                           \
                printf("\n");                                               \
                m_success = false;                                          \
            }                                                               \
            free(dst);                                                      \
        }                                                                   \
    } while (0)
| |
| void TestExecutor::testGetPhoneticallySortableString() { |
| printf("testGetPhoneticallySortableString()\n"); |
| char *dst; |
| size_t len; |
| |
| // halfwidth alphabets -> fullwidth alphabets. |
| EXPECT_EQ_UTF8_UTF8("ABCD", |
| "\xEF\xBC\xA1\xEF\xBC\xA2\xEF\xBC\xA3\xEF\xBC\xA4"); |
| // halfwidth/fullwidth-katakana -> hiragana |
| EXPECT_EQ_UTF8_UTF8( |
| "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA", |
| "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A"); |
| |
| // whitespace -> string which should be placed at last |
| EXPECT_EQ_UTF8_UTF8(" \t", "\xF0\x9F\xBF\xBD"); |
| } |
| |
#undef EXPECT_EQ_UTF8_UTF8

// Redefinition for the GetNormalizedString() tests: converts the UTF-8
// string `src` with GetNormalizedString() and compares the output with the
// NUL-terminated string `expected`.
//
// Requirements on the expansion site: `char *dst`, `size_t len` and the
// TestExecutor member `m_success` must be in scope. The buffer returned in
// `dst` is released with free() after the comparison.
//
// On a conversion failure or a mismatch, m_success is cleared; for a
// mismatch both byte sequences are dumped in hex. The failure message names
// GetNormalizedString(), the function actually called here. Bytes are
// converted to unsigned char before printing so that values >= 0x80 appear
// as 0xE3 instead of a sign-extended 0xFFFFFFE3.
#define EXPECT_EQ_UTF8_UTF8(src, expected)                                  \
    do {                                                                    \
        if (!GetNormalizedString((src), &dst, &len)) {                      \
            printf("GetNormalizedString() returned false.\n");              \
            m_success = false;                                              \
        } else {                                                            \
            if (strcmp(dst, (expected)) != 0) {                             \
                for (const char* byte_ = dst; *byte_ != '\0'; ++byte_) {    \
                    printf("0x%X ", static_cast<unsigned int>(              \
                            static_cast<unsigned char>(*byte_)));           \
                }                                                           \
                printf("!= ");                                              \
                for (const char* byte_ = (expected); *byte_ != '\0';        \
                     ++byte_) {                                             \
                    printf("0x%X ", static_cast<unsigned int>(              \
                            static_cast<unsigned char>(*byte_)));           \
                }                                                           \
                printf("\n");                                               \
                m_success = false;                                          \
            }                                                               \
            free(dst);                                                      \
        }                                                                   \
    } while (0)
| |
| void TestExecutor::testGetNormalizedString() { |
| printf("testGetNormalizedString()\n"); |
| char *dst; |
| size_t len; |
| |
| // halfwidth alphabets/symbols -> keep it as is. |
| EXPECT_EQ_UTF8_UTF8("ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()", |
| "ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()"); |
| EXPECT_EQ_UTF8_UTF8("abcdefghijklmnopqrstuvwxyz[]{}\\@/", |
| "abcdefghijklmnopqrstuvwxyz[]{}\\@/"); |
| |
| // halfwidth/fullwidth-katakana -> hiragana |
| EXPECT_EQ_UTF8_UTF8( |
| "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA", |
| "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A"); |
| |
| // whitespace -> keep it as is. |
| EXPECT_EQ_UTF8_UTF8(" \t", " \t"); |
| } |
| |
| int main() { |
| TestExecutor executor; |
| if(executor.DoAllTests()) { |
| return 0; |
| } else { |
| return 1; |
| } |
| } |