summaryrefslogtreecommitdiffstats
path: root/xbmc/utils/CharsetDetection.cpp
diff options
context:
space:
mode:
authormanuel <manuel@mausz.at>2020-10-19 00:52:24 +0200
committermanuel <manuel@mausz.at>2020-10-19 00:52:24 +0200
commitbe933ef2241d79558f91796cc5b3a161f72ebf9c (patch)
treefe3ab2f130e20c99001f2d7a81d610c78c96a3f4 /xbmc/utils/CharsetDetection.cpp
parent5f8335c1e49ce108ef3481863833c98efa00411b (diff)
downloadkodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.gz
kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.bz2
kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.zip
sync with upstream
Diffstat (limited to 'xbmc/utils/CharsetDetection.cpp')
-rw-r--r--xbmc/utils/CharsetDetection.cpp639
1 files changed, 639 insertions, 0 deletions
diff --git a/xbmc/utils/CharsetDetection.cpp b/xbmc/utils/CharsetDetection.cpp
new file mode 100644
index 0000000..06a0416
--- /dev/null
+++ b/xbmc/utils/CharsetDetection.cpp
@@ -0,0 +1,639 @@
1/*
2 * Copyright (C) 2013-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
4 *
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
7 */
8
9#include "CharsetDetection.h"
10
11#include "LangInfo.h"
12#include "utils/CharsetConverter.h"
13#include "utils/StringUtils.h"
14#include "utils/Utf8Utils.h"
15#include "utils/log.h"
16
17#include <algorithm>
18
19/* XML declaration can be virtually any size (with many-many whitespaces)
20 * but for in real world we don't need to process megabytes of data
21 * so limit search for XML declaration to reasonable value */
22const size_t CCharsetDetection::m_XmlDeclarationMaxLength = 250;
23
24/* According to http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#charset
25 * encoding must be placed in first 1024 bytes of document */
26const size_t CCharsetDetection::m_HtmlCharsetEndSearchPos = 1024;
27
28/* According to http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#space-character
29 * tab, LF, FF, CR or space can be used as whitespace */
30const std::string CCharsetDetection::m_HtmlWhitespaceChars("\x09\x0A\x0C\x0D\x20"); // tab, LF, FF, CR and space
31
32std::string CCharsetDetection::GetBomEncoding(const char* const content, const size_t contentLength)
33{
34 if (contentLength < 2)
35 return "";
36 if (content[0] == (char)0xFE && content[1] == (char)0xFF)
37 return "UTF-16BE";
38 if (contentLength >= 4 && content[0] == (char)0xFF && content[1] == (char)0xFE && content[2] == (char)0x00 && content[3] == (char)0x00)
39 return "UTF-32LE"; /* first two bytes are same for UTF-16LE and UTF-32LE, so first check for full UTF-32LE mark */
40 if (content[0] == (char)0xFF && content[1] == (char)0xFE)
41 return "UTF-16LE";
42 if (contentLength < 3)
43 return "";
44 if (content[0] == (char)0xEF && content[1] == (char)0xBB && content[2] == (char)0xBF)
45 return "UTF-8";
46 if (contentLength < 4)
47 return "";
48 if (content[0] == (char)0x00 && content[1] == (char)0x00 && content[2] == (char)0xFE && content[3] == (char)0xFF)
49 return "UTF-32BE";
50 if (contentLength >= 5 && content[0] == (char)0x2B && content[1] == (char)0x2F && content[2] == (char)0x76 &&
51 (content[4] == (char)0x32 || content[4] == (char)0x39 || content[4] == (char)0x2B || content[4] == (char)0x2F))
52 return "UTF-7";
53 if (content[0] == (char)0x84 && content[1] == (char)0x31 && content[2] == (char)0x95 && content[3] == (char)0x33)
54 return "GB18030";
55
56 return "";
57}
58
59bool CCharsetDetection::DetectXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& detectedEncoding)
60{
61 detectedEncoding.clear();
62
63 if (contentLength < 2)
64 return false; // too short for any detection
65
66 /* Byte Order Mark has priority over "encoding=" parameter */
67 detectedEncoding = GetBomEncoding(xmlContent, contentLength);
68 if (!detectedEncoding.empty())
69 return true;
70
71 /* try to read encoding from XML declaration */
72 if (GetXmlEncodingFromDeclaration(xmlContent, contentLength, detectedEncoding))
73 {
74 StringUtils::ToUpper(detectedEncoding);
75
76 /* make some safety checks */
77 if (detectedEncoding == "UTF-8")
78 return true; // fast track for most common case
79
80 if (StringUtils::StartsWith(detectedEncoding, "UCS-") || StringUtils::StartsWith(detectedEncoding, "UTF-"))
81 {
82 if (detectedEncoding == "UTF-7")
83 return true;
84
85 /* XML declaration was detected in UTF-8 mode (by 'GetXmlEncodingFromDeclaration') so we know
86 * that text in single byte encoding, but declaration itself wrongly specify multibyte encoding */
87 detectedEncoding.clear();
88 return false;
89 }
90 return true;
91 }
92
93 /* try to detect basic encoding */
94 std::string guessedEncoding;
95 if (!GuessXmlEncoding(xmlContent, contentLength, guessedEncoding))
96 return false; /* can't detect any encoding */
97
98 /* have some guessed encoding, try to use it */
99 std::string convertedXml;
100 /* use 'm_XmlDeclarationMaxLength * 4' below for UTF-32-like encodings */
101 if (!g_charsetConverter.ToUtf8(guessedEncoding, std::string(xmlContent, std::min(contentLength, m_XmlDeclarationMaxLength * 4)), convertedXml)
102 || convertedXml.empty())
103 return false; /* can't convert, guessed encoding is wrong */
104
105 /* text converted, hopefully at least XML declaration is in UTF-8 now */
106 std::string declaredEncoding;
107 /* try to read real encoding from converted XML declaration */
108 if (!GetXmlEncodingFromDeclaration(convertedXml.c_str(), convertedXml.length(), declaredEncoding))
109 { /* did not find real encoding in XML declaration, use guessed encoding */
110 detectedEncoding = guessedEncoding;
111 return true;
112 }
113
114 /* found encoding in converted XML declaration, we know correct endianness and number of bytes per char */
115 /* make some safety checks */
116 StringUtils::ToUpper(declaredEncoding);
117 if (declaredEncoding == guessedEncoding)
118 return true;
119
120 if (StringUtils::StartsWith(guessedEncoding, "UCS-4"))
121 {
122 if (declaredEncoding.length() < 5 ||
123 (!StringUtils::StartsWith(declaredEncoding, "UTF-32") && !StringUtils::StartsWith(declaredEncoding, "UCS-4")))
124 { /* Guessed encoding was correct because we can convert and read XML declaration, but declaration itself is wrong (not 4-bytes encoding) */
125 detectedEncoding = guessedEncoding;
126 return true;
127 }
128 }
129 else if (StringUtils::StartsWith(guessedEncoding, "UTF-16"))
130 {
131 if (declaredEncoding.length() < 5 ||
132 (!StringUtils::StartsWith(declaredEncoding, "UTF-16") && !StringUtils::StartsWith(declaredEncoding, "UCS-2")))
133 { /* Guessed encoding was correct because we can read XML declaration, but declaration is wrong (not 2-bytes encoding) */
134 detectedEncoding = guessedEncoding;
135 return true;
136 }
137 }
138
139 if (StringUtils::StartsWith(guessedEncoding, "UCS-4") || StringUtils::StartsWith(guessedEncoding, "UTF-16"))
140 {
141 /* Check endianness in declared encoding. We already know correct endianness as XML declaration was detected after conversion. */
142 /* Guessed UTF/UCS encoding always ends with endianness */
143 std::string guessedEndianness(guessedEncoding, guessedEncoding.length() - 2);
144
145 if (!StringUtils::EndsWith(declaredEncoding, "BE") && !StringUtils::EndsWith(declaredEncoding, "LE")) /* Declared encoding without endianness */
146 detectedEncoding = declaredEncoding + guessedEndianness; /* add guessed endianness */
147 else if (!StringUtils::EndsWith(declaredEncoding, guessedEndianness)) /* Wrong endianness in declared encoding */
148 detectedEncoding = declaredEncoding.substr(0, declaredEncoding.length() - 2) + guessedEndianness; /* replace endianness by guessed endianness */
149 else
150 detectedEncoding = declaredEncoding; /* declared encoding with correct endianness */
151
152 return true;
153 }
154 else if (StringUtils::StartsWith(guessedEncoding, "EBCDIC"))
155 {
156 if (declaredEncoding.find("EBCDIC") != std::string::npos)
157 detectedEncoding = declaredEncoding; /* Declared encoding is some specific EBCDIC encoding */
158 else
159 detectedEncoding = guessedEncoding;
160
161 return true;
162 }
163
164 /* should be unreachable */
165 return false;
166}
167
168bool CCharsetDetection::GetXmlEncodingFromDeclaration(const char* const xmlContent, const size_t contentLength, std::string& declaredEncoding)
169{
170 // following code is std::string-processing analog of regular expression-processing
171 // regular expression: "<\\?xml([ \n\r\t]+[^ \n\t\r>]+)*[ \n\r\t]+encoding[ \n\r\t]*=[ \n\r\t]*('[^ \n\t\r>']+'|\"[^ \n\t\r>\"]+\")"
172 // on win32 x86 machine regular expression is slower that std::string 20-40 times and can slowdown XML processing for several times
173 // seems that this regular expression is too slow due to many variable length parts, regexp for '&amp;'-fixing is much faster
174
175 declaredEncoding.clear();
176
177 // avoid extra large search
178 std::string strXml(xmlContent, std::min(contentLength, m_XmlDeclarationMaxLength));
179
180 size_t pos = strXml.find("<?xml");
181 if (pos == std::string::npos || pos + 6 > strXml.length() || pos > strXml.find('<'))
182 return false; // no "<?xml" declaration, "<?xml" is not first element or "<?xml" is incomplete
183
184 pos += 5; // 5 is length of "<?xml"
185
186 const size_t declLength = std::min(std::min(m_XmlDeclarationMaxLength, contentLength - pos), strXml.find('>', pos) - pos);
187 const std::string xmlDecl(xmlContent + pos, declLength);
188 const char* const xmlDeclC = xmlDecl.c_str(); // for faster processing of [] and for null-termination
189
190 static const char* const whiteSpaceChars = " \n\r\t"; // according to W3C Recommendation for XML, any of them can be used as separator
191 pos = 0;
192
193 while (pos + 12 <= declLength) // 12 is minimal length of "encoding='x'"
194 {
195 pos = xmlDecl.find_first_of(whiteSpaceChars, pos);
196 if (pos == std::string::npos)
197 return false; // no " encoding=" in declaration
198
199 pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos);
200 if (pos == std::string::npos)
201 return false; // no "encoding=" in declaration
202
203 if (xmlDecl.compare(pos, 8, "encoding", 8) != 0)
204 continue; // not "encoding" parameter
205 pos += 8; // length of "encoding"
206
207 if (xmlDeclC[pos] == ' ' || xmlDeclC[pos] == '\n' || xmlDeclC[pos] == '\r' || xmlDeclC[pos] == '\t') // no buffer overrun as string is null-terminated
208 {
209 pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos);
210 if (pos == std::string::npos)
211 return false; // this " encoding" is incomplete, only whitespace chars remains
212 }
213 if (xmlDeclC[pos] != '=')
214 { // "encoding" without "=", try to find other
215 pos--; // step back to whitespace
216 continue;
217 }
218
219 pos++; // skip '='
220 if (xmlDeclC[pos] == ' ' || xmlDeclC[pos] == '\n' || xmlDeclC[pos] == '\r' || xmlDeclC[pos] == '\t') // no buffer overrun as string is null-terminated
221 {
222 pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos);
223 if (pos == std::string::npos)
224 return false; // this " encoding" is incomplete, only whitespace chars remains
225 }
226 size_t encNameEndPos;
227 if (xmlDeclC[pos] == '"')
228 encNameEndPos = xmlDecl.find('"', ++pos);
229 else if (xmlDeclC[pos] == '\'')
230 encNameEndPos = xmlDecl.find('\'', ++pos);
231 else
232 continue; // no quote or double quote after 'encoding=', try to find other
233
234 if (encNameEndPos != std::string::npos)
235 {
236 declaredEncoding.assign(xmlDecl, pos, encNameEndPos - pos);
237 return true;
238 }
239 // no closing quote or double quote after 'encoding="x', try to find other
240 }
241
242 return false;
243}
244
245bool CCharsetDetection::GuessXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& supposedEncoding)
246{
247 supposedEncoding.clear();
248 if (contentLength < 4)
249 return false; // too little data to guess
250
251 if (xmlContent[0] == 0 && xmlContent[1] == 0 && xmlContent[2] == 0 && xmlContent[3] == (char)0x3C) // '<' == '00 00 00 3C' in UCS-4 (UTF-32) big-endian
252 supposedEncoding = "UCS-4BE"; // use UCS-4 according to W3C recommendation
253 else if (xmlContent[0] == (char)0x3C && xmlContent[1] == 0 && xmlContent[2] == 0 && xmlContent[3] == 0) // '<' == '3C 00 00 00' in UCS-4 (UTF-32) little-endian
254 supposedEncoding = "UCS-4LE"; // use UCS-4 according to W3C recommendation
255 else if (xmlContent[0] == 0 && xmlContent[1] == (char)0x3C && xmlContent[2] == 0 && xmlContent[3] == (char)0x3F) // "<?" == "00 3C 00 3F" in UTF-16 (UCS-2) big-endian
256 supposedEncoding = "UTF-16BE";
257 else if (xmlContent[0] == (char)0x3C && xmlContent[1] == 0 && xmlContent[2] == (char)0x3F && xmlContent[3] == 0) // "<?" == "3C 00 3F 00" in UTF-16 (UCS-2) little-endian
258 supposedEncoding = "UTF-16LE";
259 else if (xmlContent[0] == (char)0x4C && xmlContent[1] == (char)0x6F && xmlContent[2] == (char)0xA7 && xmlContent[3] == (char)0x94) // "<?xm" == "4C 6F A7 94" in most EBCDIC encodings
260 supposedEncoding = "EBCDIC-CP-US"; // guessed value, real value must be read from declaration
261 else
262 return false;
263
264 return true;
265}
266
267bool CCharsetDetection::ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedHtmlCharset)
268{
269 converted.clear();
270 usedHtmlCharset.clear();
271 if (htmlContent.empty())
272 {
273 usedHtmlCharset = "UTF-8"; // any charset can be used for empty content, use UTF-8 as default
274 return false;
275 }
276
277 // this is relaxed implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#determining-the-character-encoding
278
279 // try to get charset from Byte Order Mark
280 std::string bomCharset(GetBomEncoding(htmlContent));
281 if (checkConversion(bomCharset, htmlContent, converted))
282 {
283 usedHtmlCharset = bomCharset;
284 return true;
285 }
286
287 // try charset from HTTP header (or from other out-of-band source)
288 if (checkConversion(serverReportedCharset, htmlContent, converted))
289 {
290 usedHtmlCharset = serverReportedCharset;
291 return true;
292 }
293
294 // try to find charset in HTML
295 std::string declaredCharset(GetHtmlEncodingFromHead(htmlContent));
296 if (!declaredCharset.empty())
297 {
298 if (declaredCharset.compare(0, 3, "UTF", 3) == 0)
299 declaredCharset = "UTF-8"; // charset string was found in singlebyte mode, charset can't be multibyte encoding
300 if (checkConversion(declaredCharset, htmlContent, converted))
301 {
302 usedHtmlCharset = declaredCharset;
303 return true;
304 }
305 }
306
307 // try UTF-8 if not tried before
308 if (bomCharset != "UTF-8" && serverReportedCharset != "UTF-8" && declaredCharset != "UTF-8" && checkConversion("UTF-8", htmlContent, converted))
309 {
310 usedHtmlCharset = "UTF-8";
311 return false; // only guessed value
312 }
313
314 // try user charset
315 std::string userCharset(g_langInfo.GetGuiCharSet());
316 if (checkConversion(userCharset, htmlContent, converted))
317 {
318 usedHtmlCharset = userCharset;
319 return false; // only guessed value
320 }
321
322 // try WINDOWS-1252
323 if (checkConversion("WINDOWS-1252", htmlContent, converted))
324 {
325 usedHtmlCharset = "WINDOWS-1252";
326 return false; // only guessed value
327 }
328
329 // can't find exact charset
330 // use one of detected as fallback
331 if (!bomCharset.empty())
332 usedHtmlCharset = bomCharset;
333 else if (!serverReportedCharset.empty())
334 usedHtmlCharset = serverReportedCharset;
335 else if (!declaredCharset.empty())
336 usedHtmlCharset = declaredCharset;
337 else if (!userCharset.empty())
338 usedHtmlCharset = userCharset;
339 else
340 usedHtmlCharset = "WINDOWS-1252";
341
342 CLog::Log(LOGWARNING, "%s: Can't correctly convert to UTF-8 charset, converting as \"%s\"", __FUNCTION__, usedHtmlCharset.c_str());
343 g_charsetConverter.ToUtf8(usedHtmlCharset, htmlContent, converted, false);
344
345 return false;
346}
347
348bool CCharsetDetection::ConvertPlainTextToUtf8(const std::string& textContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedCharset)
349{
350 converted.clear();
351 usedCharset.clear();
352 if (textContent.empty())
353 {
354 usedCharset = "UTF-8"; // any charset can be used for empty content, use UTF-8 as default
355 return true;
356 }
357
358 // try to get charset from Byte Order Mark
359 std::string bomCharset(GetBomEncoding(textContent));
360 if (checkConversion(bomCharset, textContent, converted))
361 {
362 usedCharset = bomCharset;
363 return true;
364 }
365
366 // try charset from HTTP header (or from other out-of-band source)
367 if (checkConversion(serverReportedCharset, textContent, converted))
368 {
369 usedCharset = serverReportedCharset;
370 return true;
371 }
372
373 // try UTF-8 if not tried before
374 if (bomCharset != "UTF-8" && serverReportedCharset != "UTF-8" && checkConversion("UTF-8", textContent, converted))
375 {
376 usedCharset = "UTF-8";
377 return true;
378 }
379
380 // try user charset
381 std::string userCharset(g_langInfo.GetGuiCharSet());
382 if (checkConversion(userCharset, textContent, converted))
383 {
384 usedCharset = userCharset;
385 return true;
386 }
387
388 // try system default charset
389 if (g_charsetConverter.systemToUtf8(textContent, converted, true))
390 {
391 usedCharset = "char"; // synonym to system charset
392 return true;
393 }
394
395 // try WINDOWS-1252
396 if (checkConversion("WINDOWS-1252", textContent, converted))
397 {
398 usedCharset = "WINDOWS-1252";
399 return true;
400 }
401
402 // can't find correct charset
403 // use one of detected as fallback
404 if (!serverReportedCharset.empty())
405 usedCharset = serverReportedCharset;
406 else if (!bomCharset.empty())
407 usedCharset = bomCharset;
408 else if (!userCharset.empty())
409 usedCharset = userCharset;
410 else
411 usedCharset = "WINDOWS-1252";
412
413 CLog::Log(LOGWARNING, "%s: Can't correctly convert to UTF-8 charset, converting as \"%s\"", __FUNCTION__, usedCharset.c_str());
414 g_charsetConverter.ToUtf8(usedCharset, textContent, converted, false);
415
416 return false;
417}
418
419
420bool CCharsetDetection::checkConversion(const std::string& srcCharset, const std::string& src, std::string& dst)
421{
422 if (srcCharset.empty())
423 return false;
424
425 if (srcCharset != "UTF-8")
426 {
427 if (g_charsetConverter.ToUtf8(srcCharset, src, dst, true))
428 return true;
429 }
430 else if (CUtf8Utils::isValidUtf8(src))
431 {
432 dst = src;
433 return true;
434 }
435
436 return false;
437}
438
439std::string CCharsetDetection::GetHtmlEncodingFromHead(const std::string& htmlContent)
440{
441 std::string smallerHtmlContent;
442 if (htmlContent.length() > 2 * m_HtmlCharsetEndSearchPos)
443 smallerHtmlContent.assign(htmlContent, 0, 2 * m_HtmlCharsetEndSearchPos); // use twice more bytes to search for charset for safety
444
445 const std::string& html = smallerHtmlContent.empty() ? htmlContent : smallerHtmlContent; // limit search
446 const char* const htmlC = html.c_str(); // for null-termination
447 const size_t len = html.length();
448
449 // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#prescan-a-byte-stream-to-determine-its-encoding
450 // labels in comments correspond to the labels in HTML5 standard
451 // note: opposite to standard, everything is converted to uppercase instead of lower case
452 size_t pos = 0;
453 while (pos < len) // "loop" label
454 {
455 if (html.compare(pos, 4, "<!--", 4) == 0)
456 {
457 pos = html.find("-->", pos + 2);
458 if (pos == std::string::npos)
459 return "";
460 pos += 2;
461 }
462 else if (htmlC[pos] == '<' && (htmlC[pos + 1] == 'm' || htmlC[pos + 1] == 'M') && (htmlC[pos + 2] == 'e' || htmlC[pos + 2] == 'E')
463 && (htmlC[pos + 3] == 't' || htmlC[pos + 3] == 'T') && (htmlC[pos + 4] == 'a' || htmlC[pos + 4] == 'A')
464 && (htmlC[pos + 5] == 0x09 || htmlC[pos + 5] == 0x0A || htmlC[pos + 5] == 0x0C || htmlC[pos + 5] == 0x0D || htmlC[pos + 5] == 0x20 || htmlC[pos + 5] == 0x2F))
465 { // this is case insensitive "<meta" and one of tab, LF, FF, CR, space or slash
466 pos += 5; // "pos" points to symbol after "<meta"
467 std::string attrName, attrValue;
468 bool gotPragma = false;
469 std::string contentCharset;
470 do // "attributes" label
471 {
472 pos = GetHtmlAttribute(html, pos, attrName, attrValue);
473 if (attrName == "HTTP-EQUIV" && attrValue == "CONTENT-TYPE")
474 gotPragma = true;
475 else if (attrName == "CONTENT")
476 contentCharset = ExtractEncodingFromHtmlMeta(attrValue);
477 else if (attrName == "CHARSET")
478 {
479 StringUtils::Trim(attrValue, m_HtmlWhitespaceChars.c_str()); // tab, LF, FF, CR, space
480 if (!attrValue.empty())
481 return attrValue;
482 }
483 } while (!attrName.empty() && pos < len);
484
485 // "processing" label
486 if (gotPragma && !contentCharset.empty())
487 return contentCharset;
488 }
489 else if (htmlC[pos] == '<' && ((htmlC[pos + 1] >= 'A' && htmlC[pos + 1] <= 'Z') || (htmlC[pos + 1] >= 'a' && htmlC[pos + 1] <= 'z')))
490 {
491 pos = html.find_first_of("\x09\x0A\x0C\x0D >", pos); // tab, LF, FF, CR, space or '>'
492 std::string attrName, attrValue;
493 do
494 {
495 pos = GetHtmlAttribute(html, pos, attrName, attrValue);
496 } while (pos < len && !attrName.empty());
497 }
498 else if (html.compare(pos, 2, "<!", 2) == 0 || html.compare(pos, 2, "</", 2) == 0 || html.compare(pos, 2, "<?", 2) == 0)
499 pos = html.find('>', pos);
500
501 if (pos == std::string::npos)
502 return "";
503
504 // "next byte" label
505 pos++;
506 }
507
508 return ""; // no charset was found
509}
510
511size_t CCharsetDetection::GetHtmlAttribute(const std::string& htmlContent, size_t pos, std::string& attrName, std::string& attrValue)
512{
513 attrName.clear();
514 attrValue.clear();
515 static const char* const htmlWhitespaceSlash = "\x09\x0A\x0C\x0D\x20\x2F"; // tab, LF, FF, CR, space or slash
516 const char* const htmlC = htmlContent.c_str();
517 const size_t len = htmlContent.length();
518
519 // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#concept-get-attributes-when-sniffing
520 // labels in comments correspond to the labels in HTML5 standard
521 // note: opposite to standard, everything is converted to uppercase instead of lower case
522 pos = htmlContent.find_first_not_of(htmlWhitespaceSlash, pos);
523 if (pos == std::string::npos || htmlC[pos] == '>')
524 return pos; // only white spaces or slashes up to the end of the htmlContent or no more attributes
525
526 while (pos < len && htmlC[pos] != '=')
527 {
528 const char chr = htmlC[pos];
529 if (chr == '/' || chr == '>')
530 return pos; // no attributes or empty attribute value
531 else if (m_HtmlWhitespaceChars.find(chr) != std::string::npos) // chr is one of whitespaces
532 {
533 pos = htmlContent.find_first_not_of(m_HtmlWhitespaceChars, pos); // "spaces" label
534 if (pos == std::string::npos || htmlC[pos] != '=')
535 return pos; // only white spaces up to the end or no attribute value
536 break;
537 }
538 else
539 appendCharAsAsciiUpperCase(attrName, chr);
540
541 pos++;
542 }
543
544 if (pos >= len)
545 return std::string::npos; // no '=', '/' or '>' were found up to the end of htmlContent
546
547 pos++; // advance pos to character after '='
548
549 pos = htmlContent.find_first_not_of(m_HtmlWhitespaceChars, pos); // "value" label
550 if (pos == std::string::npos)
551 return pos; // only white spaces remain in htmlContent
552
553 if (htmlC[pos] == '>')
554 return pos; // empty attribute value
555 else if (htmlC[pos] == '"' || htmlC[pos] == '\'')
556 {
557 const char qChr = htmlC[pos];
558 // "quote loop" label
559 while (++pos < len)
560 {
561 const char chr = htmlC[pos];
562 if (chr == qChr)
563 return pos + 1;
564 else
565 appendCharAsAsciiUpperCase(attrValue, chr);
566 }
567 return std::string::npos; // no closing quote is found
568 }
569
570 appendCharAsAsciiUpperCase(attrValue, htmlC[pos]);
571 pos++;
572
573 while (pos < len)
574 {
575 const char chr = htmlC[pos];
576 if (m_HtmlWhitespaceChars.find(chr) != std::string::npos || chr == '>')
577 return pos;
578 else
579 appendCharAsAsciiUpperCase(attrValue, chr);
580
581 pos++;
582 }
583
584 return std::string::npos; // rest of htmlContent was attribute value
585}
586
587std::string CCharsetDetection::ExtractEncodingFromHtmlMeta(std::string metaContent, size_t pos /*= 0*/)
588{
589 size_t len = metaContent.length();
590 if (pos >= len)
591 return "";
592
593 const char* const metaContentC = metaContent.c_str();
594
595 // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
596 // labels in comments correspond to the labels in HTML5 standard
597 // note: opposite to standard, case sensitive match is used as argument is always in uppercase
598 std::string charset;
599 do
600 {
601 // "loop" label
602 pos = metaContent.find("CHARSET", pos);
603 if (pos == std::string::npos)
604 return "";
605
606 pos = metaContent.find_first_not_of(m_HtmlWhitespaceChars, pos + 7); // '7' is the length of 'CHARSET'
607 if (pos != std::string::npos && metaContentC[pos] == '=')
608 {
609 pos = metaContent.find_first_not_of(m_HtmlWhitespaceChars, pos + 1);
610 if (pos != std::string::npos)
611 {
612 if (metaContentC[pos] == '\'' || metaContentC[pos] == '"')
613 {
614 const char qChr = metaContentC[pos];
615 pos++;
616 const size_t closeQpos = metaContent.find(qChr, pos);
617 if (closeQpos != std::string::npos)
618 charset.assign(metaContent, pos, closeQpos - pos);
619 }
620 else
621 charset.assign(metaContent, pos, metaContent.find("\x09\x0A\x0C\x0D ;", pos) - pos); // assign content up to the next tab, LF, FF, CR, space, semicolon or end of string
622 }
623 break;
624 }
625 } while (pos < len);
626
627 static const char* const htmlWhitespaceCharsC = m_HtmlWhitespaceChars.c_str();
628 StringUtils::Trim(charset, htmlWhitespaceCharsC);
629
630 return charset;
631}
632
633inline void CCharsetDetection::appendCharAsAsciiUpperCase(std::string& str, const char chr)
634{
635 if (chr >= 'a' && chr <= 'z')
636 str.push_back(chr - ('a' - 'A')); // convert to upper case
637 else
638 str.push_back(chr);
639}