diff options
Diffstat (limited to 'xbmc/utils/Utf8Utils.cpp')
| -rw-r--r-- | xbmc/utils/Utf8Utils.cpp | 148 |
1 files changed, 148 insertions, 0 deletions
diff --git a/xbmc/utils/Utf8Utils.cpp b/xbmc/utils/Utf8Utils.cpp new file mode 100644 index 0000000..a45002a --- /dev/null +++ b/xbmc/utils/Utf8Utils.cpp | |||
| @@ -0,0 +1,148 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2013-2018 Team Kodi | ||
| 3 | * This file is part of Kodi - https://kodi.tv | ||
| 4 | * | ||
| 5 | * SPDX-License-Identifier: GPL-2.0-or-later | ||
| 6 | * See LICENSES/README.md for more information. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include "Utf8Utils.h" | ||
| 10 | |||
| 11 | |||
| 12 | CUtf8Utils::utf8CheckResult CUtf8Utils::checkStrForUtf8(const std::string& str) | ||
| 13 | { | ||
| 14 | const char* const strC = str.c_str(); | ||
| 15 | const size_t len = str.length(); | ||
| 16 | size_t pos = 0; | ||
| 17 | bool isPlainAscii = true; | ||
| 18 | |||
| 19 | while (pos < len) | ||
| 20 | { | ||
| 21 | const size_t chrLen = SizeOfUtf8Char(strC + pos); | ||
| 22 | if (chrLen == 0) | ||
| 23 | return hiAscii; // non valid UTF-8 sequence | ||
| 24 | else if (chrLen > 1) | ||
| 25 | isPlainAscii = false; | ||
| 26 | |||
| 27 | pos += chrLen; | ||
| 28 | } | ||
| 29 | |||
| 30 | if (isPlainAscii) | ||
| 31 | return plainAscii; // only single-byte characters (valid for US-ASCII and for UTF-8) | ||
| 32 | |||
| 33 | return utf8string; // valid UTF-8 with at least one valid UTF-8 multi-byte sequence | ||
| 34 | } | ||
| 35 | |||
| 36 | |||
| 37 | |||
| 38 | size_t CUtf8Utils::FindValidUtf8Char(const std::string& str, const size_t startPos /*= 0*/) | ||
| 39 | { | ||
| 40 | const char* strC = str.c_str(); | ||
| 41 | const size_t len = str.length(); | ||
| 42 | |||
| 43 | size_t pos = startPos; | ||
| 44 | while (pos < len) | ||
| 45 | { | ||
| 46 | if (SizeOfUtf8Char(strC + pos)) | ||
| 47 | return pos; | ||
| 48 | |||
| 49 | pos++; | ||
| 50 | } | ||
| 51 | |||
| 52 | return std::string::npos; | ||
| 53 | } | ||
| 54 | |||
| 55 | size_t CUtf8Utils::RFindValidUtf8Char(const std::string& str, const size_t startPos) | ||
| 56 | { | ||
| 57 | const size_t len = str.length(); | ||
| 58 | if (!len) | ||
| 59 | return std::string::npos; | ||
| 60 | |||
| 61 | const char* strC = str.c_str(); | ||
| 62 | size_t pos = (startPos >= len) ? len - 1 : startPos; | ||
| 63 | while (pos < len) // pos is unsigned, after zero pos becomes large then len | ||
| 64 | { | ||
| 65 | if (SizeOfUtf8Char(strC + pos)) | ||
| 66 | return pos; | ||
| 67 | |||
| 68 | pos--; | ||
| 69 | } | ||
| 70 | |||
| 71 | return std::string::npos; | ||
| 72 | } | ||
| 73 | |||
| 74 | inline size_t CUtf8Utils::SizeOfUtf8Char(const std::string& str, const size_t charStart /*= 0*/) | ||
| 75 | { | ||
| 76 | if (charStart >= str.length()) | ||
| 77 | return std::string::npos; | ||
| 78 | |||
| 79 | return SizeOfUtf8Char(str.c_str() + charStart); | ||
| 80 | } | ||
| 81 | |||
| 82 | // must be used only internally in class! | ||
| 83 | // str must be null-terminated | ||
| 84 | inline size_t CUtf8Utils::SizeOfUtf8Char(const char* const str) | ||
| 85 | { | ||
| 86 | if (!str) | ||
| 87 | return 0; | ||
| 88 | |||
| 89 | const unsigned char* const strU = (const unsigned char*)str; | ||
| 90 | const unsigned char chr = strU[0]; | ||
| 91 | |||
| 92 | /* this is an implementation of http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf#G27506 */ | ||
| 93 | |||
| 94 | /* U+0000 - U+007F in UTF-8 */ | ||
| 95 | if (chr <= 0x7F) | ||
| 96 | return 1; | ||
| 97 | |||
| 98 | /* U+0080 - U+07FF in UTF-8 */ /* binary representation and range */ | ||
| 99 | if (chr >= 0xC2 && chr <= 0xDF /* C2=1100 0010 - DF=1101 1111 */ | ||
| 100 | // as str is null terminated, | ||
| 101 | && ((strU[1] & 0xC0) == 0x80)) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ | ||
| 102 | return 2; // valid UTF-8 2 bytes sequence | ||
| 103 | |||
| 104 | /* U+0800 - U+0FFF in UTF-8 */ | ||
| 105 | if (chr == 0xE0 /* E0=1110 0000 */ | ||
| 106 | && (strU[1] & 0xE0) == 0xA0 /* E0=1110 0000, A0=1010 0000 - BF=1011 1111 */ | ||
| 107 | && (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ | ||
| 108 | return 3; // valid UTF-8 3 bytes sequence | ||
| 109 | |||
| 110 | /* U+1000 - U+CFFF in UTF-8 */ | ||
| 111 | /* skip U+D000 - U+DFFF (handled later) */ | ||
| 112 | /* U+E000 - U+FFFF in UTF-8 */ | ||
| 113 | if (((chr >= 0xE1 && chr <= 0xEC) /* E1=1110 0001 - EC=1110 1100 */ | ||
| 114 | || chr == 0xEE || chr == 0xEF) /* EE=1110 1110 - EF=1110 1111 */ | ||
| 115 | && (strU[1] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ | ||
| 116 | && (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ | ||
| 117 | return 3; // valid UTF-8 3 bytes sequence | ||
| 118 | |||
| 119 | /* U+D000 - U+D7FF in UTF-8 */ | ||
| 120 | /* note: range U+D800 - U+DFFF is reserved and invalid */ | ||
| 121 | if (chr == 0xED /* ED=1110 1101 */ | ||
| 122 | && (strU[1] & 0xE0) == 0x80 /* E0=1110 0000, 80=1000 0000 - 9F=1001 1111 */ | ||
| 123 | && (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ | ||
| 124 | return 3; // valid UTF-8 3 bytes sequence | ||
| 125 | |||
| 126 | /* U+10000 - U+3FFFF in UTF-8 */ | ||
| 127 | if (chr == 0xF0 /* F0=1111 0000 */ | ||
| 128 | && (strU[1] & 0xE0) == 0x80 /* E0=1110 0000, 80=1000 0000 - 9F=1001 1111 */ | ||
| 129 | && strU[2] >= 0x90 && strU[2] <= 0xBF /* 90=1001 0000 - BF=1011 1111 */ | ||
| 130 | && (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ | ||
| 131 | return 4; // valid UTF-8 4 bytes sequence | ||
| 132 | |||
| 133 | /* U+40000 - U+FFFFF in UTF-8 */ | ||
| 134 | if (chr >= 0xF1 && chr <= 0xF3 /* F1=1111 0001 - F3=1111 0011 */ | ||
| 135 | && (strU[1] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ | ||
| 136 | && (strU[2] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ | ||
| 137 | && (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ | ||
| 138 | return 4; // valid UTF-8 4 bytes sequence | ||
| 139 | |||
| 140 | /* U+100000 - U+10FFFF in UTF-8 */ | ||
| 141 | if (chr == 0xF4 /* F4=1111 0100 */ | ||
| 142 | && (strU[1] & 0xF0) == 0x80 /* F0=1111 0000, 80=1000 0000 - 8F=1000 1111 */ | ||
| 143 | && (strU[2] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ | ||
| 144 | && (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */ | ||
| 145 | return 4; // valid UTF-8 4 bytes sequence | ||
| 146 | |||
| 147 | return 0; // invalid UTF-8 char sequence | ||
| 148 | } | ||
