// unicode_test.cc - Don Yang (uguu.org) // // 06/22/11 #include"unicode.h" #include"util.h" namespace { // Wrapper to ParseUtf8Char, calls the function with a string, and // verifies that it's fully parsed and returns the expected value. // Returns true on success. static bool CheckParseUtf8Char(const char *input, unsigned int expected) { const string input_str(input); string::const_iterator i = input_str.begin(); const unsigned int actual = ParseUtf8Char(input_str, &i); if( expected != actual ) { printf("ERROR: decoded values mismatched: %04X vs %04X", expected, actual); return false; } if( expected != 0xfffd ) { ++i; if( i != input_str.end() ) { puts("ERROR: string not fully parsed"); return false; } } return true; } // Test UTF-8 parsing static void TestParseUtf8() { CHECK(CheckParseUtf8Char("\xd0\x80", 1 << 10), "2 bytes"); CHECK(CheckParseUtf8Char("\xe8\x80\x80", 1 << 15), "3 bytes"); CHECK(CheckParseUtf8Char("\xf4\x80\x80\x80", 1 << 20), "4 bytes"); CHECK(CheckParseUtf8Char("\xfa\x80\x80\x80\x80", 1 << 25), "5 bytes"); CHECK(CheckParseUtf8Char("\xfd\x80\x80\x80\x80\x80", 1 << 30), "6 bytes"); CHECK(CheckParseUtf8Char("\xc0", 0xfffd), "bad 2 bytes"); CHECK(CheckParseUtf8Char("\xe0\x80", 0xfffd), "bad 3 bytes"); CHECK(CheckParseUtf8Char("\xf0\x80\x80", 0xfffd), "bad 4 bytes"); CHECK(CheckParseUtf8Char("\xf8\x80\x80\x80", 0xfffd), "bad 5 bytes"); CHECK(CheckParseUtf8Char("\xfc\x80\x80\x80\x80", 0xfffd), "bad 6 bytes"); CHECK(CheckParseUtf8Char("\xdf\xb7", 0x7f7), "2 bytes"); CHECK(CheckParseUtf8Char("\xe2\x98\x83", 0x2603), "3 bytes"); CHECK(CheckParseUtf8Char("\xf0\x90\x83\xa1", 0x100e1), "4 bytes"); CHECK(CheckParseUtf8Char("\xfa\x80\x80\x80\x80", 0x2000000), "5 bytes"); CHECK(CheckParseUtf8Char("\xfd\x80\x80\x80\x80\x80", 0x40000000), "6 bytes"); } // Try skipping to the next character static void TestNextChar() { string text = "a" "\xd0\x80" "\xe8\x80\x80" "\xf4\x80\x80\x80" "\xfa\x80\x80\x80\x80" "\xfd\x80\x80\x80\x80\x80" "z"; string::iterator i = text.begin(); #define CHECK_ITERATOR(expected_char, label) \ NextChar(&text, &i); \ CHECK(i != text.end(), label); \ CHECK(*i == expected_char, label) CHECK_ITERATOR('\xd0', "1 byte"); CHECK_ITERATOR('\xe8', "2 bytes"); CHECK_ITERATOR('\xf4', "3 bytes"); CHECK_ITERATOR('\xfa', "4 bytes"); CHECK_ITERATOR('\xfd', "5 bytes"); CHECK_ITERATOR('z', "6 bytes"); #undef CHECK_ITERATOR NextChar(&text, &i); CHECK(i == text.end(), "1 byte"); } } // namespace int main() { TestParseUtf8(); TestNextChar(); return 0; }