summaryrefslogtreecommitdiffstats
path: root/xbmc/utils/CharsetDetection.h
diff options
context:
space:
mode:
Diffstat (limited to 'xbmc/utils/CharsetDetection.h')
-rw-r--r--xbmc/utils/CharsetDetection.h94
1 files changed, 94 insertions, 0 deletions
diff --git a/xbmc/utils/CharsetDetection.h b/xbmc/utils/CharsetDetection.h
new file mode 100644
index 0000000..1ff3905
--- /dev/null
+++ b/xbmc/utils/CharsetDetection.h
@@ -0,0 +1,94 @@
1/*
2 * Copyright (C) 2013-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
4 *
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
7 */
8
9#pragma once
10
11#include <string>
12
13
/**
 * Collection of static helpers for detecting the character encoding of
 * text, XML and HTML content and for converting such content to UTF-8.
 * All members are static; the class only serves as a scope for them.
 */
class CCharsetDetection
{
public:
  /**
   * Detect text encoding by Byte Order Mark
   * Names of multibyte encodings (UTF-16/32) always end with explicit endianness (LE/BE)
   * @param content       pointer to text to analyze
   * @param contentLength length of text
   * @return detected encoding or empty string if BOM not detected
   */
  static std::string GetBomEncoding(const char* const content, const size_t contentLength);

  /**
   * Detect text encoding by Byte Order Mark
   * Names of multibyte encodings (UTF-16/32) always end with explicit endianness (LE/BE)
   * Convenience overload: forwards the string's buffer and length to the pointer/length overload.
   * @param content the text to analyze
   * @return detected encoding or empty string if BOM not detected
   */
  static std::string GetBomEncoding(const std::string& content)
  {
    return GetBomEncoding(content.c_str(), content.length());
  }

  /**
   * Detect encoding of XML content
   * Convenience overload: forwards the string's buffer and length to the pointer/length overload.
   * @param xmlContent       the XML text to analyze
   * @param detectedEncoding receives the detected encoding
   * @return result of the pointer/length overload
   */
  static bool DetectXmlEncoding(const std::string& xmlContent, std::string& detectedEncoding)
  {
    return DetectXmlEncoding(xmlContent.c_str(), xmlContent.length(), detectedEncoding);
  }

  /**
   * Detect encoding of XML content
   * @param xmlContent       pointer to XML text to analyze
   * @param contentLength    length of XML text
   * @param detectedEncoding receives the detected encoding
   * @return presumably true if an encoding was detected, false otherwise
   *         (NOTE(review): not documented in this header - confirm against the implementation)
   */
  static bool DetectXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& detectedEncoding);

  /**
   * Detect HTML charset and convert HTML to UTF-8
   * Convenience overload for callers that do not need to know which charset was used:
   * the used charset is collected in a local string and discarded.
   * @param htmlContent           content of HTML file
   * @param converted             receives result of conversion
   * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset
   * @return true if charset is properly detected and HTML is correctly converted, false if charset is only guessed
   */
  static bool ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset = "")
  {
    std::string discardedCharset; // required by the full overload, not reported to the caller
    return ConvertHtmlToUtf8(htmlContent, converted, serverReportedCharset, discardedCharset);
  }

  /**
   * Detect HTML charset and convert HTML to UTF-8
   * @param htmlContent           content of HTML file
   * @param converted             receives result of conversion
   * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset
   * @param usedHtmlCharset       receives charset used for conversion
   * @return true if charset is properly detected and HTML is correctly converted, false if charset is only guessed
   */
  static bool ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedHtmlCharset);

  /**
   * Try to convert plain text to UTF-8 using the most suitable charset
   * @param textContent           text to convert
   * @param converted             receives result of conversion
   * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset
   * @param usedCharset           receives charset used for conversion
   * @return true if converted without errors, false otherwise
   */
  static bool ConvertPlainTextToUtf8(const std::string& textContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedCharset);

private:
  /**
   * Helper used for XML encoding detection.
   * NOTE(review): judging by the name, reads the encoding from the XML declaration
   * ('<?xml ... encoding="..."?>') - confirm against the implementation.
   * @param xmlContent       pointer to XML text to analyze
   * @param contentLength    length of XML text
   * @param declaredEncoding receives the declared encoding
   */
  static bool GetXmlEncodingFromDeclaration(const char* const xmlContent, const size_t contentLength, std::string& declaredEncoding);

  /**
   * Try to guess text encoding by searching for '<?xml' mark in different encodings
   * Names of multibyte encodings (UTF/UCS) always end with explicit endianness (LE/BE)
   * @param xmlContent       pointer to text to analyze
   * @param contentLength    length of text
   * @param supposedEncoding reference to variable that receives the supposed encoding
   * @return true if any encoding is supposed, false otherwise
   */
  static bool GuessXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& supposedEncoding);

  // HTML parsing helpers. Their exact contracts are defined by the implementation file.
  static std::string GetHtmlEncodingFromHead(const std::string& htmlContent);
  static size_t GetHtmlAttribute(const std::string& htmlContent, size_t pos, std::string& atrName, std::string& strValue);
  // Takes metaContent by value (callee gets its own copy).
  static std::string ExtractEncodingFromHtmlMeta(std::string metaContent, size_t pos = 0);

  // Conversion / string-building helpers.
  static bool checkConversion(const std::string& srcCharset, const std::string& src, std::string& dst);
  static void appendCharAsAsciiUpperCase(std::string& str, const char chr);

  // Constants; declared here, defined out of line.
  static const size_t m_XmlDeclarationMaxLength;
  static const size_t m_HtmlCharsetEndSearchPos;

  static const std::string m_HtmlWhitespaceChars;
};