diff options
| author | manuel <manuel@mausz.at> | 2020-10-19 00:52:24 +0200 |
|---|---|---|
| committer | manuel <manuel@mausz.at> | 2020-10-19 00:52:24 +0200 |
| commit | be933ef2241d79558f91796cc5b3a161f72ebf9c (patch) | |
| tree | fe3ab2f130e20c99001f2d7a81d610c78c96a3f4 /xbmc/utils/CharsetDetection.cpp | |
| parent | 5f8335c1e49ce108ef3481863833c98efa00411b (diff) | |
| download | kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.gz kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.bz2 kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.zip | |
sync with upstream
Diffstat (limited to 'xbmc/utils/CharsetDetection.cpp')
| -rw-r--r-- | xbmc/utils/CharsetDetection.cpp | 639 |
1 files changed, 639 insertions, 0 deletions
diff --git a/xbmc/utils/CharsetDetection.cpp b/xbmc/utils/CharsetDetection.cpp new file mode 100644 index 0000000..06a0416 --- /dev/null +++ b/xbmc/utils/CharsetDetection.cpp | |||
| @@ -0,0 +1,639 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2013-2018 Team Kodi | ||
| 3 | * This file is part of Kodi - https://kodi.tv | ||
| 4 | * | ||
| 5 | * SPDX-License-Identifier: GPL-2.0-or-later | ||
| 6 | * See LICENSES/README.md for more information. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include "CharsetDetection.h" | ||
| 10 | |||
| 11 | #include "LangInfo.h" | ||
| 12 | #include "utils/CharsetConverter.h" | ||
| 13 | #include "utils/StringUtils.h" | ||
| 14 | #include "utils/Utf8Utils.h" | ||
| 15 | #include "utils/log.h" | ||
| 16 | |||
| 17 | #include <algorithm> | ||
| 18 | |||
| 19 | /* XML declaration can be virtually any size (with many-many whitespaces) | ||
| 20 | * but for in real world we don't need to process megabytes of data | ||
| 21 | * so limit search for XML declaration to reasonable value */ | ||
| 22 | const size_t CCharsetDetection::m_XmlDeclarationMaxLength = 250; | ||
| 23 | |||
| 24 | /* According to http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#charset | ||
| 25 | * encoding must be placed in first 1024 bytes of document */ | ||
| 26 | const size_t CCharsetDetection::m_HtmlCharsetEndSearchPos = 1024; | ||
| 27 | |||
| 28 | /* According to http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#space-character | ||
| 29 | * tab, LF, FF, CR or space can be used as whitespace */ | ||
| 30 | const std::string CCharsetDetection::m_HtmlWhitespaceChars("\x09\x0A\x0C\x0D\x20"); // tab, LF, FF, CR and space | ||
| 31 | |||
| 32 | std::string CCharsetDetection::GetBomEncoding(const char* const content, const size_t contentLength) | ||
| 33 | { | ||
| 34 | if (contentLength < 2) | ||
| 35 | return ""; | ||
| 36 | if (content[0] == (char)0xFE && content[1] == (char)0xFF) | ||
| 37 | return "UTF-16BE"; | ||
| 38 | if (contentLength >= 4 && content[0] == (char)0xFF && content[1] == (char)0xFE && content[2] == (char)0x00 && content[3] == (char)0x00) | ||
| 39 | return "UTF-32LE"; /* first two bytes are same for UTF-16LE and UTF-32LE, so first check for full UTF-32LE mark */ | ||
| 40 | if (content[0] == (char)0xFF && content[1] == (char)0xFE) | ||
| 41 | return "UTF-16LE"; | ||
| 42 | if (contentLength < 3) | ||
| 43 | return ""; | ||
| 44 | if (content[0] == (char)0xEF && content[1] == (char)0xBB && content[2] == (char)0xBF) | ||
| 45 | return "UTF-8"; | ||
| 46 | if (contentLength < 4) | ||
| 47 | return ""; | ||
| 48 | if (content[0] == (char)0x00 && content[1] == (char)0x00 && content[2] == (char)0xFE && content[3] == (char)0xFF) | ||
| 49 | return "UTF-32BE"; | ||
| 50 | if (contentLength >= 5 && content[0] == (char)0x2B && content[1] == (char)0x2F && content[2] == (char)0x76 && | ||
| 51 | (content[4] == (char)0x32 || content[4] == (char)0x39 || content[4] == (char)0x2B || content[4] == (char)0x2F)) | ||
| 52 | return "UTF-7"; | ||
| 53 | if (content[0] == (char)0x84 && content[1] == (char)0x31 && content[2] == (char)0x95 && content[3] == (char)0x33) | ||
| 54 | return "GB18030"; | ||
| 55 | |||
| 56 | return ""; | ||
| 57 | } | ||
| 58 | |||
| 59 | bool CCharsetDetection::DetectXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& detectedEncoding) | ||
| 60 | { | ||
| 61 | detectedEncoding.clear(); | ||
| 62 | |||
| 63 | if (contentLength < 2) | ||
| 64 | return false; // too short for any detection | ||
| 65 | |||
| 66 | /* Byte Order Mark has priority over "encoding=" parameter */ | ||
| 67 | detectedEncoding = GetBomEncoding(xmlContent, contentLength); | ||
| 68 | if (!detectedEncoding.empty()) | ||
| 69 | return true; | ||
| 70 | |||
| 71 | /* try to read encoding from XML declaration */ | ||
| 72 | if (GetXmlEncodingFromDeclaration(xmlContent, contentLength, detectedEncoding)) | ||
| 73 | { | ||
| 74 | StringUtils::ToUpper(detectedEncoding); | ||
| 75 | |||
| 76 | /* make some safety checks */ | ||
| 77 | if (detectedEncoding == "UTF-8") | ||
| 78 | return true; // fast track for most common case | ||
| 79 | |||
| 80 | if (StringUtils::StartsWith(detectedEncoding, "UCS-") || StringUtils::StartsWith(detectedEncoding, "UTF-")) | ||
| 81 | { | ||
| 82 | if (detectedEncoding == "UTF-7") | ||
| 83 | return true; | ||
| 84 | |||
| 85 | /* XML declaration was detected in UTF-8 mode (by 'GetXmlEncodingFromDeclaration') so we know | ||
| 86 | * that text in single byte encoding, but declaration itself wrongly specify multibyte encoding */ | ||
| 87 | detectedEncoding.clear(); | ||
| 88 | return false; | ||
| 89 | } | ||
| 90 | return true; | ||
| 91 | } | ||
| 92 | |||
| 93 | /* try to detect basic encoding */ | ||
| 94 | std::string guessedEncoding; | ||
| 95 | if (!GuessXmlEncoding(xmlContent, contentLength, guessedEncoding)) | ||
| 96 | return false; /* can't detect any encoding */ | ||
| 97 | |||
| 98 | /* have some guessed encoding, try to use it */ | ||
| 99 | std::string convertedXml; | ||
| 100 | /* use 'm_XmlDeclarationMaxLength * 4' below for UTF-32-like encodings */ | ||
| 101 | if (!g_charsetConverter.ToUtf8(guessedEncoding, std::string(xmlContent, std::min(contentLength, m_XmlDeclarationMaxLength * 4)), convertedXml) | ||
| 102 | || convertedXml.empty()) | ||
| 103 | return false; /* can't convert, guessed encoding is wrong */ | ||
| 104 | |||
| 105 | /* text converted, hopefully at least XML declaration is in UTF-8 now */ | ||
| 106 | std::string declaredEncoding; | ||
| 107 | /* try to read real encoding from converted XML declaration */ | ||
| 108 | if (!GetXmlEncodingFromDeclaration(convertedXml.c_str(), convertedXml.length(), declaredEncoding)) | ||
| 109 | { /* did not find real encoding in XML declaration, use guessed encoding */ | ||
| 110 | detectedEncoding = guessedEncoding; | ||
| 111 | return true; | ||
| 112 | } | ||
| 113 | |||
| 114 | /* found encoding in converted XML declaration, we know correct endianness and number of bytes per char */ | ||
| 115 | /* make some safety checks */ | ||
| 116 | StringUtils::ToUpper(declaredEncoding); | ||
| 117 | if (declaredEncoding == guessedEncoding) | ||
| 118 | return true; | ||
| 119 | |||
| 120 | if (StringUtils::StartsWith(guessedEncoding, "UCS-4")) | ||
| 121 | { | ||
| 122 | if (declaredEncoding.length() < 5 || | ||
| 123 | (!StringUtils::StartsWith(declaredEncoding, "UTF-32") && !StringUtils::StartsWith(declaredEncoding, "UCS-4"))) | ||
| 124 | { /* Guessed encoding was correct because we can convert and read XML declaration, but declaration itself is wrong (not 4-bytes encoding) */ | ||
| 125 | detectedEncoding = guessedEncoding; | ||
| 126 | return true; | ||
| 127 | } | ||
| 128 | } | ||
| 129 | else if (StringUtils::StartsWith(guessedEncoding, "UTF-16")) | ||
| 130 | { | ||
| 131 | if (declaredEncoding.length() < 5 || | ||
| 132 | (!StringUtils::StartsWith(declaredEncoding, "UTF-16") && !StringUtils::StartsWith(declaredEncoding, "UCS-2"))) | ||
| 133 | { /* Guessed encoding was correct because we can read XML declaration, but declaration is wrong (not 2-bytes encoding) */ | ||
| 134 | detectedEncoding = guessedEncoding; | ||
| 135 | return true; | ||
| 136 | } | ||
| 137 | } | ||
| 138 | |||
| 139 | if (StringUtils::StartsWith(guessedEncoding, "UCS-4") || StringUtils::StartsWith(guessedEncoding, "UTF-16")) | ||
| 140 | { | ||
| 141 | /* Check endianness in declared encoding. We already know correct endianness as XML declaration was detected after conversion. */ | ||
| 142 | /* Guessed UTF/UCS encoding always ends with endianness */ | ||
| 143 | std::string guessedEndianness(guessedEncoding, guessedEncoding.length() - 2); | ||
| 144 | |||
| 145 | if (!StringUtils::EndsWith(declaredEncoding, "BE") && !StringUtils::EndsWith(declaredEncoding, "LE")) /* Declared encoding without endianness */ | ||
| 146 | detectedEncoding = declaredEncoding + guessedEndianness; /* add guessed endianness */ | ||
| 147 | else if (!StringUtils::EndsWith(declaredEncoding, guessedEndianness)) /* Wrong endianness in declared encoding */ | ||
| 148 | detectedEncoding = declaredEncoding.substr(0, declaredEncoding.length() - 2) + guessedEndianness; /* replace endianness by guessed endianness */ | ||
| 149 | else | ||
| 150 | detectedEncoding = declaredEncoding; /* declared encoding with correct endianness */ | ||
| 151 | |||
| 152 | return true; | ||
| 153 | } | ||
| 154 | else if (StringUtils::StartsWith(guessedEncoding, "EBCDIC")) | ||
| 155 | { | ||
| 156 | if (declaredEncoding.find("EBCDIC") != std::string::npos) | ||
| 157 | detectedEncoding = declaredEncoding; /* Declared encoding is some specific EBCDIC encoding */ | ||
| 158 | else | ||
| 159 | detectedEncoding = guessedEncoding; | ||
| 160 | |||
| 161 | return true; | ||
| 162 | } | ||
| 163 | |||
| 164 | /* should be unreachable */ | ||
| 165 | return false; | ||
| 166 | } | ||
| 167 | |||
| 168 | bool CCharsetDetection::GetXmlEncodingFromDeclaration(const char* const xmlContent, const size_t contentLength, std::string& declaredEncoding) | ||
| 169 | { | ||
| 170 | // following code is std::string-processing analog of regular expression-processing | ||
| 171 | // regular expression: "<\\?xml([ \n\r\t]+[^ \n\t\r>]+)*[ \n\r\t]+encoding[ \n\r\t]*=[ \n\r\t]*('[^ \n\t\r>']+'|\"[^ \n\t\r>\"]+\")" | ||
| 172 | // on win32 x86 machine regular expression is slower that std::string 20-40 times and can slowdown XML processing for several times | ||
| 173 | // seems that this regular expression is too slow due to many variable length parts, regexp for '&'-fixing is much faster | ||
| 174 | |||
| 175 | declaredEncoding.clear(); | ||
| 176 | |||
| 177 | // avoid extra large search | ||
| 178 | std::string strXml(xmlContent, std::min(contentLength, m_XmlDeclarationMaxLength)); | ||
| 179 | |||
| 180 | size_t pos = strXml.find("<?xml"); | ||
| 181 | if (pos == std::string::npos || pos + 6 > strXml.length() || pos > strXml.find('<')) | ||
| 182 | return false; // no "<?xml" declaration, "<?xml" is not first element or "<?xml" is incomplete | ||
| 183 | |||
| 184 | pos += 5; // 5 is length of "<?xml" | ||
| 185 | |||
| 186 | const size_t declLength = std::min(std::min(m_XmlDeclarationMaxLength, contentLength - pos), strXml.find('>', pos) - pos); | ||
| 187 | const std::string xmlDecl(xmlContent + pos, declLength); | ||
| 188 | const char* const xmlDeclC = xmlDecl.c_str(); // for faster processing of [] and for null-termination | ||
| 189 | |||
| 190 | static const char* const whiteSpaceChars = " \n\r\t"; // according to W3C Recommendation for XML, any of them can be used as separator | ||
| 191 | pos = 0; | ||
| 192 | |||
| 193 | while (pos + 12 <= declLength) // 12 is minimal length of "encoding='x'" | ||
| 194 | { | ||
| 195 | pos = xmlDecl.find_first_of(whiteSpaceChars, pos); | ||
| 196 | if (pos == std::string::npos) | ||
| 197 | return false; // no " encoding=" in declaration | ||
| 198 | |||
| 199 | pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos); | ||
| 200 | if (pos == std::string::npos) | ||
| 201 | return false; // no "encoding=" in declaration | ||
| 202 | |||
| 203 | if (xmlDecl.compare(pos, 8, "encoding", 8) != 0) | ||
| 204 | continue; // not "encoding" parameter | ||
| 205 | pos += 8; // length of "encoding" | ||
| 206 | |||
| 207 | if (xmlDeclC[pos] == ' ' || xmlDeclC[pos] == '\n' || xmlDeclC[pos] == '\r' || xmlDeclC[pos] == '\t') // no buffer overrun as string is null-terminated | ||
| 208 | { | ||
| 209 | pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos); | ||
| 210 | if (pos == std::string::npos) | ||
| 211 | return false; // this " encoding" is incomplete, only whitespace chars remains | ||
| 212 | } | ||
| 213 | if (xmlDeclC[pos] != '=') | ||
| 214 | { // "encoding" without "=", try to find other | ||
| 215 | pos--; // step back to whitespace | ||
| 216 | continue; | ||
| 217 | } | ||
| 218 | |||
| 219 | pos++; // skip '=' | ||
| 220 | if (xmlDeclC[pos] == ' ' || xmlDeclC[pos] == '\n' || xmlDeclC[pos] == '\r' || xmlDeclC[pos] == '\t') // no buffer overrun as string is null-terminated | ||
| 221 | { | ||
| 222 | pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos); | ||
| 223 | if (pos == std::string::npos) | ||
| 224 | return false; // this " encoding" is incomplete, only whitespace chars remains | ||
| 225 | } | ||
| 226 | size_t encNameEndPos; | ||
| 227 | if (xmlDeclC[pos] == '"') | ||
| 228 | encNameEndPos = xmlDecl.find('"', ++pos); | ||
| 229 | else if (xmlDeclC[pos] == '\'') | ||
| 230 | encNameEndPos = xmlDecl.find('\'', ++pos); | ||
| 231 | else | ||
| 232 | continue; // no quote or double quote after 'encoding=', try to find other | ||
| 233 | |||
| 234 | if (encNameEndPos != std::string::npos) | ||
| 235 | { | ||
| 236 | declaredEncoding.assign(xmlDecl, pos, encNameEndPos - pos); | ||
| 237 | return true; | ||
| 238 | } | ||
| 239 | // no closing quote or double quote after 'encoding="x', try to find other | ||
| 240 | } | ||
| 241 | |||
| 242 | return false; | ||
| 243 | } | ||
| 244 | |||
| 245 | bool CCharsetDetection::GuessXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& supposedEncoding) | ||
| 246 | { | ||
| 247 | supposedEncoding.clear(); | ||
| 248 | if (contentLength < 4) | ||
| 249 | return false; // too little data to guess | ||
| 250 | |||
| 251 | if (xmlContent[0] == 0 && xmlContent[1] == 0 && xmlContent[2] == 0 && xmlContent[3] == (char)0x3C) // '<' == '00 00 00 3C' in UCS-4 (UTF-32) big-endian | ||
| 252 | supposedEncoding = "UCS-4BE"; // use UCS-4 according to W3C recommendation | ||
| 253 | else if (xmlContent[0] == (char)0x3C && xmlContent[1] == 0 && xmlContent[2] == 0 && xmlContent[3] == 0) // '<' == '3C 00 00 00' in UCS-4 (UTF-32) little-endian | ||
| 254 | supposedEncoding = "UCS-4LE"; // use UCS-4 according to W3C recommendation | ||
| 255 | else if (xmlContent[0] == 0 && xmlContent[1] == (char)0x3C && xmlContent[2] == 0 && xmlContent[3] == (char)0x3F) // "<?" == "00 3C 00 3F" in UTF-16 (UCS-2) big-endian | ||
| 256 | supposedEncoding = "UTF-16BE"; | ||
| 257 | else if (xmlContent[0] == (char)0x3C && xmlContent[1] == 0 && xmlContent[2] == (char)0x3F && xmlContent[3] == 0) // "<?" == "3C 00 3F 00" in UTF-16 (UCS-2) little-endian | ||
| 258 | supposedEncoding = "UTF-16LE"; | ||
| 259 | else if (xmlContent[0] == (char)0x4C && xmlContent[1] == (char)0x6F && xmlContent[2] == (char)0xA7 && xmlContent[3] == (char)0x94) // "<?xm" == "4C 6F A7 94" in most EBCDIC encodings | ||
| 260 | supposedEncoding = "EBCDIC-CP-US"; // guessed value, real value must be read from declaration | ||
| 261 | else | ||
| 262 | return false; | ||
| 263 | |||
| 264 | return true; | ||
| 265 | } | ||
| 266 | |||
| 267 | bool CCharsetDetection::ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedHtmlCharset) | ||
| 268 | { | ||
| 269 | converted.clear(); | ||
| 270 | usedHtmlCharset.clear(); | ||
| 271 | if (htmlContent.empty()) | ||
| 272 | { | ||
| 273 | usedHtmlCharset = "UTF-8"; // any charset can be used for empty content, use UTF-8 as default | ||
| 274 | return false; | ||
| 275 | } | ||
| 276 | |||
| 277 | // this is relaxed implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#determining-the-character-encoding | ||
| 278 | |||
| 279 | // try to get charset from Byte Order Mark | ||
| 280 | std::string bomCharset(GetBomEncoding(htmlContent)); | ||
| 281 | if (checkConversion(bomCharset, htmlContent, converted)) | ||
| 282 | { | ||
| 283 | usedHtmlCharset = bomCharset; | ||
| 284 | return true; | ||
| 285 | } | ||
| 286 | |||
| 287 | // try charset from HTTP header (or from other out-of-band source) | ||
| 288 | if (checkConversion(serverReportedCharset, htmlContent, converted)) | ||
| 289 | { | ||
| 290 | usedHtmlCharset = serverReportedCharset; | ||
| 291 | return true; | ||
| 292 | } | ||
| 293 | |||
| 294 | // try to find charset in HTML | ||
| 295 | std::string declaredCharset(GetHtmlEncodingFromHead(htmlContent)); | ||
| 296 | if (!declaredCharset.empty()) | ||
| 297 | { | ||
| 298 | if (declaredCharset.compare(0, 3, "UTF", 3) == 0) | ||
| 299 | declaredCharset = "UTF-8"; // charset string was found in singlebyte mode, charset can't be multibyte encoding | ||
| 300 | if (checkConversion(declaredCharset, htmlContent, converted)) | ||
| 301 | { | ||
| 302 | usedHtmlCharset = declaredCharset; | ||
| 303 | return true; | ||
| 304 | } | ||
| 305 | } | ||
| 306 | |||
| 307 | // try UTF-8 if not tried before | ||
| 308 | if (bomCharset != "UTF-8" && serverReportedCharset != "UTF-8" && declaredCharset != "UTF-8" && checkConversion("UTF-8", htmlContent, converted)) | ||
| 309 | { | ||
| 310 | usedHtmlCharset = "UTF-8"; | ||
| 311 | return false; // only guessed value | ||
| 312 | } | ||
| 313 | |||
| 314 | // try user charset | ||
| 315 | std::string userCharset(g_langInfo.GetGuiCharSet()); | ||
| 316 | if (checkConversion(userCharset, htmlContent, converted)) | ||
| 317 | { | ||
| 318 | usedHtmlCharset = userCharset; | ||
| 319 | return false; // only guessed value | ||
| 320 | } | ||
| 321 | |||
| 322 | // try WINDOWS-1252 | ||
| 323 | if (checkConversion("WINDOWS-1252", htmlContent, converted)) | ||
| 324 | { | ||
| 325 | usedHtmlCharset = "WINDOWS-1252"; | ||
| 326 | return false; // only guessed value | ||
| 327 | } | ||
| 328 | |||
| 329 | // can't find exact charset | ||
| 330 | // use one of detected as fallback | ||
| 331 | if (!bomCharset.empty()) | ||
| 332 | usedHtmlCharset = bomCharset; | ||
| 333 | else if (!serverReportedCharset.empty()) | ||
| 334 | usedHtmlCharset = serverReportedCharset; | ||
| 335 | else if (!declaredCharset.empty()) | ||
| 336 | usedHtmlCharset = declaredCharset; | ||
| 337 | else if (!userCharset.empty()) | ||
| 338 | usedHtmlCharset = userCharset; | ||
| 339 | else | ||
| 340 | usedHtmlCharset = "WINDOWS-1252"; | ||
| 341 | |||
| 342 | CLog::Log(LOGWARNING, "%s: Can't correctly convert to UTF-8 charset, converting as \"%s\"", __FUNCTION__, usedHtmlCharset.c_str()); | ||
| 343 | g_charsetConverter.ToUtf8(usedHtmlCharset, htmlContent, converted, false); | ||
| 344 | |||
| 345 | return false; | ||
| 346 | } | ||
| 347 | |||
| 348 | bool CCharsetDetection::ConvertPlainTextToUtf8(const std::string& textContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedCharset) | ||
| 349 | { | ||
| 350 | converted.clear(); | ||
| 351 | usedCharset.clear(); | ||
| 352 | if (textContent.empty()) | ||
| 353 | { | ||
| 354 | usedCharset = "UTF-8"; // any charset can be used for empty content, use UTF-8 as default | ||
| 355 | return true; | ||
| 356 | } | ||
| 357 | |||
| 358 | // try to get charset from Byte Order Mark | ||
| 359 | std::string bomCharset(GetBomEncoding(textContent)); | ||
| 360 | if (checkConversion(bomCharset, textContent, converted)) | ||
| 361 | { | ||
| 362 | usedCharset = bomCharset; | ||
| 363 | return true; | ||
| 364 | } | ||
| 365 | |||
| 366 | // try charset from HTTP header (or from other out-of-band source) | ||
| 367 | if (checkConversion(serverReportedCharset, textContent, converted)) | ||
| 368 | { | ||
| 369 | usedCharset = serverReportedCharset; | ||
| 370 | return true; | ||
| 371 | } | ||
| 372 | |||
| 373 | // try UTF-8 if not tried before | ||
| 374 | if (bomCharset != "UTF-8" && serverReportedCharset != "UTF-8" && checkConversion("UTF-8", textContent, converted)) | ||
| 375 | { | ||
| 376 | usedCharset = "UTF-8"; | ||
| 377 | return true; | ||
| 378 | } | ||
| 379 | |||
| 380 | // try user charset | ||
| 381 | std::string userCharset(g_langInfo.GetGuiCharSet()); | ||
| 382 | if (checkConversion(userCharset, textContent, converted)) | ||
| 383 | { | ||
| 384 | usedCharset = userCharset; | ||
| 385 | return true; | ||
| 386 | } | ||
| 387 | |||
| 388 | // try system default charset | ||
| 389 | if (g_charsetConverter.systemToUtf8(textContent, converted, true)) | ||
| 390 | { | ||
| 391 | usedCharset = "char"; // synonym to system charset | ||
| 392 | return true; | ||
| 393 | } | ||
| 394 | |||
| 395 | // try WINDOWS-1252 | ||
| 396 | if (checkConversion("WINDOWS-1252", textContent, converted)) | ||
| 397 | { | ||
| 398 | usedCharset = "WINDOWS-1252"; | ||
| 399 | return true; | ||
| 400 | } | ||
| 401 | |||
| 402 | // can't find correct charset | ||
| 403 | // use one of detected as fallback | ||
| 404 | if (!serverReportedCharset.empty()) | ||
| 405 | usedCharset = serverReportedCharset; | ||
| 406 | else if (!bomCharset.empty()) | ||
| 407 | usedCharset = bomCharset; | ||
| 408 | else if (!userCharset.empty()) | ||
| 409 | usedCharset = userCharset; | ||
| 410 | else | ||
| 411 | usedCharset = "WINDOWS-1252"; | ||
| 412 | |||
| 413 | CLog::Log(LOGWARNING, "%s: Can't correctly convert to UTF-8 charset, converting as \"%s\"", __FUNCTION__, usedCharset.c_str()); | ||
| 414 | g_charsetConverter.ToUtf8(usedCharset, textContent, converted, false); | ||
| 415 | |||
| 416 | return false; | ||
| 417 | } | ||
| 418 | |||
| 419 | |||
| 420 | bool CCharsetDetection::checkConversion(const std::string& srcCharset, const std::string& src, std::string& dst) | ||
| 421 | { | ||
| 422 | if (srcCharset.empty()) | ||
| 423 | return false; | ||
| 424 | |||
| 425 | if (srcCharset != "UTF-8") | ||
| 426 | { | ||
| 427 | if (g_charsetConverter.ToUtf8(srcCharset, src, dst, true)) | ||
| 428 | return true; | ||
| 429 | } | ||
| 430 | else if (CUtf8Utils::isValidUtf8(src)) | ||
| 431 | { | ||
| 432 | dst = src; | ||
| 433 | return true; | ||
| 434 | } | ||
| 435 | |||
| 436 | return false; | ||
| 437 | } | ||
| 438 | |||
| 439 | std::string CCharsetDetection::GetHtmlEncodingFromHead(const std::string& htmlContent) | ||
| 440 | { | ||
| 441 | std::string smallerHtmlContent; | ||
| 442 | if (htmlContent.length() > 2 * m_HtmlCharsetEndSearchPos) | ||
| 443 | smallerHtmlContent.assign(htmlContent, 0, 2 * m_HtmlCharsetEndSearchPos); // use twice more bytes to search for charset for safety | ||
| 444 | |||
| 445 | const std::string& html = smallerHtmlContent.empty() ? htmlContent : smallerHtmlContent; // limit search | ||
| 446 | const char* const htmlC = html.c_str(); // for null-termination | ||
| 447 | const size_t len = html.length(); | ||
| 448 | |||
| 449 | // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#prescan-a-byte-stream-to-determine-its-encoding | ||
| 450 | // labels in comments correspond to the labels in HTML5 standard | ||
| 451 | // note: opposite to standard, everything is converted to uppercase instead of lower case | ||
| 452 | size_t pos = 0; | ||
| 453 | while (pos < len) // "loop" label | ||
| 454 | { | ||
| 455 | if (html.compare(pos, 4, "<!--", 4) == 0) | ||
| 456 | { | ||
| 457 | pos = html.find("-->", pos + 2); | ||
| 458 | if (pos == std::string::npos) | ||
| 459 | return ""; | ||
| 460 | pos += 2; | ||
| 461 | } | ||
| 462 | else if (htmlC[pos] == '<' && (htmlC[pos + 1] == 'm' || htmlC[pos + 1] == 'M') && (htmlC[pos + 2] == 'e' || htmlC[pos + 2] == 'E') | ||
| 463 | && (htmlC[pos + 3] == 't' || htmlC[pos + 3] == 'T') && (htmlC[pos + 4] == 'a' || htmlC[pos + 4] == 'A') | ||
| 464 | && (htmlC[pos + 5] == 0x09 || htmlC[pos + 5] == 0x0A || htmlC[pos + 5] == 0x0C || htmlC[pos + 5] == 0x0D || htmlC[pos + 5] == 0x20 || htmlC[pos + 5] == 0x2F)) | ||
| 465 | { // this is case insensitive "<meta" and one of tab, LF, FF, CR, space or slash | ||
| 466 | pos += 5; // "pos" points to symbol after "<meta" | ||
| 467 | std::string attrName, attrValue; | ||
| 468 | bool gotPragma = false; | ||
| 469 | std::string contentCharset; | ||
| 470 | do // "attributes" label | ||
| 471 | { | ||
| 472 | pos = GetHtmlAttribute(html, pos, attrName, attrValue); | ||
| 473 | if (attrName == "HTTP-EQUIV" && attrValue == "CONTENT-TYPE") | ||
| 474 | gotPragma = true; | ||
| 475 | else if (attrName == "CONTENT") | ||
| 476 | contentCharset = ExtractEncodingFromHtmlMeta(attrValue); | ||
| 477 | else if (attrName == "CHARSET") | ||
| 478 | { | ||
| 479 | StringUtils::Trim(attrValue, m_HtmlWhitespaceChars.c_str()); // tab, LF, FF, CR, space | ||
| 480 | if (!attrValue.empty()) | ||
| 481 | return attrValue; | ||
| 482 | } | ||
| 483 | } while (!attrName.empty() && pos < len); | ||
| 484 | |||
| 485 | // "processing" label | ||
| 486 | if (gotPragma && !contentCharset.empty()) | ||
| 487 | return contentCharset; | ||
| 488 | } | ||
| 489 | else if (htmlC[pos] == '<' && ((htmlC[pos + 1] >= 'A' && htmlC[pos + 1] <= 'Z') || (htmlC[pos + 1] >= 'a' && htmlC[pos + 1] <= 'z'))) | ||
| 490 | { | ||
| 491 | pos = html.find_first_of("\x09\x0A\x0C\x0D >", pos); // tab, LF, FF, CR, space or '>' | ||
| 492 | std::string attrName, attrValue; | ||
| 493 | do | ||
| 494 | { | ||
| 495 | pos = GetHtmlAttribute(html, pos, attrName, attrValue); | ||
| 496 | } while (pos < len && !attrName.empty()); | ||
| 497 | } | ||
| 498 | else if (html.compare(pos, 2, "<!", 2) == 0 || html.compare(pos, 2, "</", 2) == 0 || html.compare(pos, 2, "<?", 2) == 0) | ||
| 499 | pos = html.find('>', pos); | ||
| 500 | |||
| 501 | if (pos == std::string::npos) | ||
| 502 | return ""; | ||
| 503 | |||
| 504 | // "next byte" label | ||
| 505 | pos++; | ||
| 506 | } | ||
| 507 | |||
| 508 | return ""; // no charset was found | ||
| 509 | } | ||
| 510 | |||
| 511 | size_t CCharsetDetection::GetHtmlAttribute(const std::string& htmlContent, size_t pos, std::string& attrName, std::string& attrValue) | ||
| 512 | { | ||
| 513 | attrName.clear(); | ||
| 514 | attrValue.clear(); | ||
| 515 | static const char* const htmlWhitespaceSlash = "\x09\x0A\x0C\x0D\x20\x2F"; // tab, LF, FF, CR, space or slash | ||
| 516 | const char* const htmlC = htmlContent.c_str(); | ||
| 517 | const size_t len = htmlContent.length(); | ||
| 518 | |||
| 519 | // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#concept-get-attributes-when-sniffing | ||
| 520 | // labels in comments correspond to the labels in HTML5 standard | ||
| 521 | // note: opposite to standard, everything is converted to uppercase instead of lower case | ||
| 522 | pos = htmlContent.find_first_not_of(htmlWhitespaceSlash, pos); | ||
| 523 | if (pos == std::string::npos || htmlC[pos] == '>') | ||
| 524 | return pos; // only white spaces or slashes up to the end of the htmlContent or no more attributes | ||
| 525 | |||
| 526 | while (pos < len && htmlC[pos] != '=') | ||
| 527 | { | ||
| 528 | const char chr = htmlC[pos]; | ||
| 529 | if (chr == '/' || chr == '>') | ||
| 530 | return pos; // no attributes or empty attribute value | ||
| 531 | else if (m_HtmlWhitespaceChars.find(chr) != std::string::npos) // chr is one of whitespaces | ||
| 532 | { | ||
| 533 | pos = htmlContent.find_first_not_of(m_HtmlWhitespaceChars, pos); // "spaces" label | ||
| 534 | if (pos == std::string::npos || htmlC[pos] != '=') | ||
| 535 | return pos; // only white spaces up to the end or no attribute value | ||
| 536 | break; | ||
| 537 | } | ||
| 538 | else | ||
| 539 | appendCharAsAsciiUpperCase(attrName, chr); | ||
| 540 | |||
| 541 | pos++; | ||
| 542 | } | ||
| 543 | |||
| 544 | if (pos >= len) | ||
| 545 | return std::string::npos; // no '=', '/' or '>' were found up to the end of htmlContent | ||
| 546 | |||
| 547 | pos++; // advance pos to character after '=' | ||
| 548 | |||
| 549 | pos = htmlContent.find_first_not_of(m_HtmlWhitespaceChars, pos); // "value" label | ||
| 550 | if (pos == std::string::npos) | ||
| 551 | return pos; // only white spaces remain in htmlContent | ||
| 552 | |||
| 553 | if (htmlC[pos] == '>') | ||
| 554 | return pos; // empty attribute value | ||
| 555 | else if (htmlC[pos] == '"' || htmlC[pos] == '\'') | ||
| 556 | { | ||
| 557 | const char qChr = htmlC[pos]; | ||
| 558 | // "quote loop" label | ||
| 559 | while (++pos < len) | ||
| 560 | { | ||
| 561 | const char chr = htmlC[pos]; | ||
| 562 | if (chr == qChr) | ||
| 563 | return pos + 1; | ||
| 564 | else | ||
| 565 | appendCharAsAsciiUpperCase(attrValue, chr); | ||
| 566 | } | ||
| 567 | return std::string::npos; // no closing quote is found | ||
| 568 | } | ||
| 569 | |||
| 570 | appendCharAsAsciiUpperCase(attrValue, htmlC[pos]); | ||
| 571 | pos++; | ||
| 572 | |||
| 573 | while (pos < len) | ||
| 574 | { | ||
| 575 | const char chr = htmlC[pos]; | ||
| 576 | if (m_HtmlWhitespaceChars.find(chr) != std::string::npos || chr == '>') | ||
| 577 | return pos; | ||
| 578 | else | ||
| 579 | appendCharAsAsciiUpperCase(attrValue, chr); | ||
| 580 | |||
| 581 | pos++; | ||
| 582 | } | ||
| 583 | |||
| 584 | return std::string::npos; // rest of htmlContent was attribute value | ||
| 585 | } | ||
| 586 | |||
| 587 | std::string CCharsetDetection::ExtractEncodingFromHtmlMeta(std::string metaContent, size_t pos /*= 0*/) | ||
| 588 | { | ||
| 589 | size_t len = metaContent.length(); | ||
| 590 | if (pos >= len) | ||
| 591 | return ""; | ||
| 592 | |||
| 593 | const char* const metaContentC = metaContent.c_str(); | ||
| 594 | |||
| 595 | // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element | ||
| 596 | // labels in comments correspond to the labels in HTML5 standard | ||
| 597 | // note: opposite to standard, case sensitive match is used as argument is always in uppercase | ||
| 598 | std::string charset; | ||
| 599 | do | ||
| 600 | { | ||
| 601 | // "loop" label | ||
| 602 | pos = metaContent.find("CHARSET", pos); | ||
| 603 | if (pos == std::string::npos) | ||
| 604 | return ""; | ||
| 605 | |||
| 606 | pos = metaContent.find_first_not_of(m_HtmlWhitespaceChars, pos + 7); // '7' is the length of 'CHARSET' | ||
| 607 | if (pos != std::string::npos && metaContentC[pos] == '=') | ||
| 608 | { | ||
| 609 | pos = metaContent.find_first_not_of(m_HtmlWhitespaceChars, pos + 1); | ||
| 610 | if (pos != std::string::npos) | ||
| 611 | { | ||
| 612 | if (metaContentC[pos] == '\'' || metaContentC[pos] == '"') | ||
| 613 | { | ||
| 614 | const char qChr = metaContentC[pos]; | ||
| 615 | pos++; | ||
| 616 | const size_t closeQpos = metaContent.find(qChr, pos); | ||
| 617 | if (closeQpos != std::string::npos) | ||
| 618 | charset.assign(metaContent, pos, closeQpos - pos); | ||
| 619 | } | ||
| 620 | else | ||
| 621 | charset.assign(metaContent, pos, metaContent.find("\x09\x0A\x0C\x0D ;", pos) - pos); // assign content up to the next tab, LF, FF, CR, space, semicolon or end of string | ||
| 622 | } | ||
| 623 | break; | ||
| 624 | } | ||
| 625 | } while (pos < len); | ||
| 626 | |||
| 627 | static const char* const htmlWhitespaceCharsC = m_HtmlWhitespaceChars.c_str(); | ||
| 628 | StringUtils::Trim(charset, htmlWhitespaceCharsC); | ||
| 629 | |||
| 630 | return charset; | ||
| 631 | } | ||
| 632 | |||
| 633 | inline void CCharsetDetection::appendCharAsAsciiUpperCase(std::string& str, const char chr) | ||
| 634 | { | ||
| 635 | if (chr >= 'a' && chr <= 'z') | ||
| 636 | str.push_back(chr - ('a' - 'A')); // convert to upper case | ||
| 637 | else | ||
| 638 | str.push_back(chr); | ||
| 639 | } | ||
