diff options
| author | manuel <manuel@mausz.at> | 2020-10-19 00:52:24 +0200 |
|---|---|---|
| committer | manuel <manuel@mausz.at> | 2020-10-19 00:52:24 +0200 |
| commit | be933ef2241d79558f91796cc5b3a161f72ebf9c (patch) | |
| tree | fe3ab2f130e20c99001f2d7a81d610c78c96a3f4 /xbmc/utils/CharsetDetection.h | |
| parent | 5f8335c1e49ce108ef3481863833c98efa00411b (diff) | |
| download | kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.gz kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.bz2 kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.zip | |
sync with upstream
Diffstat (limited to 'xbmc/utils/CharsetDetection.h')
| -rw-r--r-- | xbmc/utils/CharsetDetection.h | 94 |
1 files changed, 94 insertions, 0 deletions
diff --git a/xbmc/utils/CharsetDetection.h b/xbmc/utils/CharsetDetection.h new file mode 100644 index 0000000..1ff3905 --- /dev/null +++ b/xbmc/utils/CharsetDetection.h | |||
| @@ -0,0 +1,94 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2013-2018 Team Kodi | ||
| 3 | * This file is part of Kodi - https://kodi.tv | ||
| 4 | * | ||
| 5 | * SPDX-License-Identifier: GPL-2.0-or-later | ||
| 6 | * See LICENSES/README.md for more information. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #pragma once | ||
| 10 | |||
| 11 | #include <string> | ||
| 12 | |||
| 13 | |||
| 14 | class CCharsetDetection | ||
| 15 | { | ||
| 16 | public: | ||
| 17 | /** | ||
| 18 | * Detect text encoding by Byte Order Mark | ||
| 19 | * Multibyte encodings (UTF-16/32) always end with explicit endianness (LE/BE) | ||
| 20 | * @param content pointer to text to analyze | ||
| 21 | * @param contentLength length of text | ||
| 22 | * @return detected encoding or empty string if no BOM is detected | ||
| 23 | */ | ||
| 24 | static std::string GetBomEncoding(const char* const content, const size_t contentLength); | ||
| 25 | /** | ||
| 26 | * Detect text encoding by Byte Order Mark | ||
| 27 | * Multibyte encodings (UTF-16/32) always end with explicit endianness (LE/BE) | ||
| 28 | * @param content the text to analyze | ||
| 29 | * @return detected encoding or empty string if no BOM is detected | ||
| 30 | */ | ||
| 31 | static inline std::string GetBomEncoding(const std::string& content) | ||
| 32 | { return GetBomEncoding(content.c_str(), content.length()); } | ||
| 33 | |||
| 34 | static inline bool DetectXmlEncoding(const std::string& xmlContent, std::string& detectedEncoding) | ||
| 35 | { return DetectXmlEncoding(xmlContent.c_str(), xmlContent.length(), detectedEncoding); } | ||
| 36 | |||
| 37 | static bool DetectXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& detectedEncoding); | ||
| 38 | |||
| 39 | /** | ||
| 40 | * Detect HTML charset and convert HTML to UTF-8 | ||
| 41 | * @param htmlContent content of HTML file | ||
| 42 | * @param converted receives the result of conversion | ||
| 43 | * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset | ||
| 44 | * @return true if charset is properly detected and HTML is correctly converted, false if charset is only guessed | ||
| 45 | */ | ||
| 46 | static inline bool ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset = "") | ||
| 47 | { | ||
| 48 | std::string usedHtmlCharset; | ||
| 49 | return ConvertHtmlToUtf8(htmlContent, converted, serverReportedCharset, usedHtmlCharset); | ||
| 50 | } | ||
| 51 | /** | ||
| 52 | * Detect HTML charset and convert HTML to UTF-8 | ||
| 53 | * @param htmlContent content of HTML file | ||
| 54 | * @param converted receives the result of conversion | ||
| 55 | * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset | ||
| 56 | * @param usedHtmlCharset receives the charset used for conversion | ||
| 57 | * @return true if charset is properly detected and HTML is correctly converted, false if charset is only guessed | ||
| 58 | */ | ||
| 59 | static bool ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedHtmlCharset); | ||
| 60 | |||
| 61 | /** | ||
| 62 | * Try to convert plain text to UTF-8 using the most suitable charset | ||
| 63 | * @param textContent text to convert | ||
| 64 | * @param converted receives the result of conversion | ||
| 65 | * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset | ||
| 66 | * @param usedCharset receives the charset used for conversion | ||
| 67 | * @return true if converted without errors, false otherwise | ||
| 68 | */ | ||
| 69 | static bool ConvertPlainTextToUtf8(const std::string& textContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedCharset); | ||
| 70 | |||
| 71 | private: | ||
| 72 | static bool GetXmlEncodingFromDeclaration(const char* const xmlContent, const size_t contentLength, std::string& declaredEncoding); | ||
| 73 | /** | ||
| 74 | * Try to guess text encoding by searching for '<?xml' mark in different encodings | ||
| 75 | * Multibyte encodings (UTF/UCS) always end with explicit endianness (LE/BE) | ||
| 76 | * @param xmlContent pointer to text to analyze | ||
| 77 | * @param contentLength length of text | ||
| 78 | * @param supposedEncoding reference to variable that receives the supposed encoding | ||
| 79 | * @return true if any encoding is supposed, false otherwise | ||
| 80 | */ | ||
| 81 | static bool GuessXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& supposedEncoding); | ||
| 82 | |||
| 83 | static std::string GetHtmlEncodingFromHead(const std::string& htmlContent); | ||
| 84 | static size_t GetHtmlAttribute(const std::string& htmlContent, size_t pos, std::string& atrName, std::string& strValue); | ||
| 85 | static std::string ExtractEncodingFromHtmlMeta(std::string metaContent, size_t pos = 0); | ||
| 86 | |||
| 87 | static bool checkConversion(const std::string& srcCharset, const std::string& src, std::string& dst); | ||
| 88 | static void appendCharAsAsciiUpperCase(std::string& str, const char chr); | ||
| 89 | |||
| 90 | static const size_t m_XmlDeclarationMaxLength; | ||
| 91 | static const size_t m_HtmlCharsetEndSearchPos; | ||
| 92 | |||
| 93 | static const std::string m_HtmlWhitespaceChars; | ||
| 94 | }; | ||
