1 files changed, 148 insertions, 0 deletions
diff --git a/xbmc/utils/Utf8Utils.cpp b/xbmc/utils/Utf8Utils.cpp
new file mode 100644
index 0000000..a45002a
--- /dev/null
+++ b/xbmc/utils/Utf8Utils.cpp
@@ -0,0 +1,148 @@
+/*
+ *  Copyright (C) 2013-2018 Team Kodi
+ *  This file is part of Kodi - https://kodi.tv
+ *
+ *  SPDX-License-Identifier: GPL-2.0-or-later
+ *  See LICENSES/README.md for more information.
+ */
+#include "Utf8Utils.h"
+// Classifies str by walking it one UTF-8 sequence at a time.
+// Returns hiAscii as soon as a byte that does not start a valid sequence is
+// met, plainAscii when every character occupied a single byte (this includes
+// the empty string) and utf8string when the whole string is valid and holds
+// at least one multi-byte sequence.
+CUtf8Utils::utf8CheckResult CUtf8Utils::checkStrForUtf8(const std::string& str)
+{
+  const char* const data = str.c_str();
+  const size_t total = str.length();
+  bool sawMultiByte = false;
+
+  for (size_t offset = 0; offset < total;)
+  {
+    const size_t seqLen = SizeOfUtf8Char(data + offset);
+    if (!seqLen)
+      return hiAscii; // not a valid UTF-8 sequence
+
+    if (seqLen != 1)
+      sawMultiByte = true;
+    offset += seqLen; // jump over the whole sequence, not just one byte
+  }
+
+  if (!sawMultiByte)
+    return plainAscii; // single-byte characters only (valid US-ASCII and UTF-8)
+  return utf8string;   // valid UTF-8 containing a multi-byte sequence
+}
+// Returns the index of the first byte at or after startPos that begins a
+// valid UTF-8 sequence, or std::string::npos when there is none (which is
+// also the result when startPos is not inside str).
+size_t CUtf8Utils::FindValidUtf8Char(const std::string& str, const size_t startPos /*= 0*/)
+{
+  const char* const data = str.c_str();
+  const size_t total = str.length();
+
+  for (size_t idx = startPos; idx < total; ++idx)
+  {
+    if (SizeOfUtf8Char(data + idx) != 0)
+      return idx;
+  }
+
+  return std::string::npos;
+}
+// Searches backwards for the last byte at or before startPos that begins a
+// valid UTF-8 sequence. A startPos beyond the end of str is clamped to the
+// last byte. Returns std::string::npos for an empty string or when no such
+// byte exists.
+size_t CUtf8Utils::RFindValidUtf8Char(const std::string& str, const size_t startPos)
+{
+  const size_t total = str.length();
+  if (total == 0)
+    return std::string::npos;
+
+  const char* const data = str.c_str();
+  const size_t first = (startPos < total) ? startPos : total - 1;
+
+  // 'left' counts the candidates still to be tested, so the loop ends at zero
+  // without relying on unsigned wrap-around of the index.
+  for (size_t left = first + 1; left != 0; --left)
+  {
+    const size_t idx = left - 1;
+    if (SizeOfUtf8Char(data + idx) != 0)
+      return idx;
+  }
+
+  return std::string::npos;
+}
+// Returns the byte length of the UTF-8 sequence starting at str[charStart],
+// 0 when those bytes do not form a valid sequence, and std::string::npos
+// when charStart is not inside str.
+// NOTE(review): the definition is marked inline in a .cpp file - confirm no
+// other translation unit needs to link against it.
+inline size_t CUtf8Utils::SizeOfUtf8Char(const std::string& str, const size_t charStart /*= 0*/)
+{
+  const bool inside = charStart < str.length();
+  return inside ? SizeOfUtf8Char(str.c_str() + charStart) : std::string::npos;
+}
+// must be used only internally in class!
+// str must be null-terminated
+//
+// Returns the length in bytes (1-4) of the well-formed UTF-8 sequence that
+// starts at str, or 0 when str is null or the bytes are not well-formed.
+// A NUL byte counts as a one-byte character. Trailing bytes are tested
+// strictly left to right and every test fails on the terminating NUL, so
+// evaluation stops before anything behind the terminator is read.
+inline size_t CUtf8Utils::SizeOfUtf8Char(const char* const str)
+{
+  if (!str)
+    return 0;
+
+  // work on unsigned bytes so that the range comparisons below are meaningful
+  const unsigned char* const strU = reinterpret_cast<const unsigned char*>(str);
+  const unsigned char chr = strU[0];
+
+  /* this is an implementation of http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf#G27506 */
+
+  /* U+0000 - U+007F in UTF-8 */
+  if (chr <= 0x7F)
+    return 1;
+
+  /* U+0080 - U+07FF in UTF-8 */                    /* binary representation and range */
+  if (chr >= 0xC2 && chr <= 0xDF                    /* C2=1100 0010 - DF=1101 1111 */
+      // as str is null terminated, strU[1] can always be read here
+      && ((strU[1] & 0xC0) == 0x80))  /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+    return 2;  // valid UTF-8 2 bytes sequence
+
+  /* U+0800 - U+0FFF in UTF-8 */
+  if (chr == 0xE0                                   /* E0=1110 0000 */
+      && (strU[1] & 0xE0) == 0xA0     /* E0=1110 0000, A0=1010 0000 - BF=1011 1111 */
+      && (strU[2] & 0xC0) == 0x80)    /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+    return 3; // valid UTF-8 3 bytes sequence
+
+  /* U+1000 - U+CFFF in UTF-8 */
+  /* skip U+D000 - U+DFFF (handled later) */
+  /* U+E000 - U+FFFF in UTF-8 */
+  if (((chr >= 0xE1 && chr <= 0xEC)                 /* E1=1110 0001 - EC=1110 1100 */
+        || chr == 0xEE || chr == 0xEF)              /* EE=1110 1110 - EF=1110 1111 */
+        && (strU[1] & 0xC0) == 0x80   /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+        && (strU[2] & 0xC0) == 0x80)  /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+    return 3; // valid UTF-8 3 bytes sequence
+
+  /* U+D000 - U+D7FF in UTF-8 */
+  /* note: range U+D800 - U+DFFF is reserved and invalid */
+  if (chr == 0xED                                   /* ED=1110 1101 */
+      && (strU[1] & 0xE0) == 0x80     /* E0=1110 0000, 80=1000 0000 - 9F=1001 1111 */
+      && (strU[2] & 0xC0) == 0x80)    /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+    return 3; // valid UTF-8 3 bytes sequence
+
+  /* U+10000 - U+3FFFF in UTF-8 */
+  /* the restricted range belongs to the SECOND byte: F0 80..8F would be an */
+  /* overlong encoding of a code point below U+10000                        */
+  if (chr == 0xF0                                   /* F0=1111 0000 */
+      && strU[1] >= 0x90 && strU[1] <= 0xBF         /* 90=1001 0000 - BF=1011 1111 */
+      && (strU[2] & 0xC0) == 0x80     /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+      && (strU[3] & 0xC0) == 0x80)    /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+    return 4; // valid UTF-8 4 bytes sequence
+
+  /* U+40000 - U+FFFFF in UTF-8 */
+  if (chr >= 0xF1 && chr <= 0xF3                    /* F1=1111 0001 - F3=1111 0011 */
+      && (strU[1] & 0xC0) == 0x80     /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+      && (strU[2] & 0xC0) == 0x80     /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+      && (strU[3] & 0xC0) == 0x80)    /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+    return 4; // valid UTF-8 4 bytes sequence
+
+  /* U+100000 - U+10FFFF in UTF-8 */
+  if (chr == 0xF4                                   /* F4=1111 0100 */
+      && (strU[1] & 0xF0) == 0x80     /* F0=1111 0000, 80=1000 0000 - 8F=1000 1111 */
+      && (strU[2] & 0xC0) == 0x80     /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+      && (strU[3] & 0xC0) == 0x80)    /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
+    return 4; // valid UTF-8 4 bytes sequence
+
+  return 0; // invalid UTF-8 char sequence
+}

diff --git a/xbmc/utils/Utf8Utils.cpp b/xbmc/utils/Utf8Utils.cpp new file mode 100644 index 0000000..a45002a --- /dev/null +++ b/xbmc/utils/Utf8Utils.cpp
@@ -0,0 +1,148 @@
	1	/*
	2	* Copyright (C) 2013-2018 Team Kodi
	3	* This file is part of Kodi - https://kodi.tv
	4	*
	5	* SPDX-License-Identifier: GPL-2.0-or-later
	6	* See LICENSES/README.md for more information.
	7	*/
	8
	9	#include "Utf8Utils.h"
	10
	11
	12	CUtf8Utils::utf8CheckResult CUtf8Utils::checkStrForUtf8(const std::string& str)
	13	{
	14	const char* const strC = str.c_str();
	15	const size_t len = str.length();
	16	size_t pos = 0;
	17	bool isPlainAscii = true;
	18
	19	while (pos < len)
	20	{
	21	const size_t chrLen = SizeOfUtf8Char(strC + pos);
	22	if (chrLen == 0)
	23	return hiAscii; // non valid UTF-8 sequence
	24	else if (chrLen > 1)
	25	isPlainAscii = false;
	26
	27	pos += chrLen;
	28	}
	29
	30	if (isPlainAscii)
	31	return plainAscii; // only single-byte characters (valid for US-ASCII and for UTF-8)
	32
	33	return utf8string; // valid UTF-8 with at least one valid UTF-8 multi-byte sequence
	34	}
	35
	36
	37
	38	size_t CUtf8Utils::FindValidUtf8Char(const std::string& str, const size_t startPos /= 0/)
	39	{
	40	const char* strC = str.c_str();
	41	const size_t len = str.length();
	42
	43	size_t pos = startPos;
	44	while (pos < len)
	45	{
	46	if (SizeOfUtf8Char(strC + pos))
	47	return pos;
	48
	49	pos++;
	50	}
	51
	52	return std::string::npos;
	53	}
	54
	55	size_t CUtf8Utils::RFindValidUtf8Char(const std::string& str, const size_t startPos)
	56	{
	57	const size_t len = str.length();
	58	if (!len)
	59	return std::string::npos;
	60
	61	const char* strC = str.c_str();
	62	size_t pos = (startPos >= len) ? len - 1 : startPos;
	63	while (pos < len) // pos is unsigned, after zero pos becomes large then len
	64	{
	65	if (SizeOfUtf8Char(strC + pos))
	66	return pos;
	67
	68	pos--;
	69	}
	70
	71	return std::string::npos;
	72	}
	73
	74	inline size_t CUtf8Utils::SizeOfUtf8Char(const std::string& str, const size_t charStart /= 0/)
	75	{
	76	if (charStart >= str.length())
	77	return std::string::npos;
	78
	79	return SizeOfUtf8Char(str.c_str() + charStart);
	80	}
	81
	82	// must be used only internally in class!
	83	// str must be null-terminated
	84	inline size_t CUtf8Utils::SizeOfUtf8Char(const char* const str)
	85	{
	86	if (!str)
	87	return 0;
	88
	89	const unsigned char* const strU = (const unsigned char*)str;
	90	const unsigned char chr = strU[0];
	91
	92	/* this is an implementation of http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf#G27506 */
	93
	94	/* U+0000 - U+007F in UTF-8 */
	95	if (chr <= 0x7F)
	96	return 1;
	97
	98	/* U+0080 - U+07FF in UTF-8 / / binary representation and range */
	99	if (chr >= 0xC2 && chr <= 0xDF /* C2=1100 0010 - DF=1101 1111 */
	100	// as str is null terminated,
	101	&& ((strU[1] & 0xC0) == 0x80)) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
	102	return 2; // valid UTF-8 2 bytes sequence
	103
	104	/* U+0800 - U+0FFF in UTF-8 */
	105	if (chr == 0xE0 /* E0=1110 0000 */
	106	&& (strU[1] & 0xE0) == 0xA0 /* E0=1110 0000, A0=1010 0000 - BF=1011 1111 */
	107	&& (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
	108	return 3; // valid UTF-8 3 bytes sequence
	109
	110	/* U+1000 - U+CFFF in UTF-8 */
	111	/* skip U+D000 - U+DFFF (handled later) */
	112	/* U+E000 - U+FFFF in UTF-8 */
	113	if (((chr >= 0xE1 && chr <= 0xEC) /* E1=1110 0001 - EC=1110 1100 */
	114	\|\| chr == 0xEE \|\| chr == 0xEF) /* EE=1110 1110 - EF=1110 1111 */
	115	&& (strU[1] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
	116	&& (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
	117	return 3; // valid UTF-8 3 bytes sequence
	118
	119	/* U+D000 - U+D7FF in UTF-8 */
	120	/* note: range U+D800 - U+DFFF is reserved and invalid */
	121	if (chr == 0xED /* ED=1110 1101 */
	122	&& (strU[1] & 0xE0) == 0x80 /* E0=1110 0000, 80=1000 0000 - 9F=1001 1111 */
	123	&& (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
	124	return 3; // valid UTF-8 3 bytes sequence
	125
	126	/* U+10000 - U+3FFFF in UTF-8 */
	127	if (chr == 0xF0 /* F0=1111 0000 */
	128	&& (strU[1] & 0xE0) == 0x80 /* E0=1110 0000, 80=1000 0000 - 9F=1001 1111 */
	129	&& strU[2] >= 0x90 && strU[2] <= 0xBF /* 90=1001 0000 - BF=1011 1111 */
	130	&& (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
	131	return 4; // valid UTF-8 4 bytes sequence
	132
	133	/* U+40000 - U+FFFFF in UTF-8 */
	134	if (chr >= 0xF1 && chr <= 0xF3 /* F1=1111 0001 - F3=1111 0011 */
	135	&& (strU[1] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
	136	&& (strU[2] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
	137	&& (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
	138	return 4; // valid UTF-8 4 bytes sequence
	139
	140	/* U+100000 - U+10FFFF in UTF-8 */
	141	if (chr == 0xF4 /* F4=1111 0100 */
	142	&& (strU[1] & 0xF0) == 0x80 /* F0=1111 0000, 80=1000 0000 - 8F=1000 1111 */
	143	&& (strU[2] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
	144	&& (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
	145	return 4; // valid UTF-8 4 bytes sequence
	146
	147	return 0; // invalid UTF-8 char sequence
	148	}