summaryrefslogtreecommitdiffstats
path: root/xbmc/utils/RegExp.cpp
diff options
context:
space:
mode:
authormanuel <manuel@mausz.at>2020-10-19 00:52:24 +0200
committermanuel <manuel@mausz.at>2020-10-19 00:52:24 +0200
commitbe933ef2241d79558f91796cc5b3a161f72ebf9c (patch)
treefe3ab2f130e20c99001f2d7a81d610c78c96a3f4 /xbmc/utils/RegExp.cpp
parent5f8335c1e49ce108ef3481863833c98efa00411b (diff)
downloadkodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.gz
kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.bz2
kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.zip
sync with upstream
Diffstat (limited to 'xbmc/utils/RegExp.cpp')
-rw-r--r--xbmc/utils/RegExp.cpp642
1 files changed, 642 insertions, 0 deletions
diff --git a/xbmc/utils/RegExp.cpp b/xbmc/utils/RegExp.cpp
new file mode 100644
index 0000000..b6fe9d5
--- /dev/null
+++ b/xbmc/utils/RegExp.cpp
@@ -0,0 +1,642 @@
1/*
2 * Copyright (C) 2005-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
4 *
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
7 */
8
9#include "RegExp.h"
10
11#include "log.h"
12#include "utils/StringUtils.h"
13#include "utils/Utf8Utils.h"
14
15#include <algorithm>
16#include <stdlib.h>
17#include <string.h>
18
19using namespace PCRE;
20
21#ifndef PCRE_UCP
22#define PCRE_UCP 0
23#endif // PCRE_UCP
24
25#ifdef PCRE_CONFIG_JIT
26#define PCRE_HAS_JIT_CODE 1
27#endif
28
29#ifndef PCRE_STUDY_JIT_COMPILE
30#define PCRE_STUDY_JIT_COMPILE 0
31#endif
32#ifndef PCRE_INFO_JIT
33// some unused number
34#define PCRE_INFO_JIT 2048
35#endif
36#ifndef PCRE_HAS_JIT_CODE
37#define pcre_free_study(x) pcre_free((x))
38#endif
39
40int CRegExp::m_Utf8Supported = -1;
41int CRegExp::m_UcpSupported = -1;
42int CRegExp::m_JitSupported = -1;
43
44
45CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
46{
47 InitValues(caseless, utf8);
48}
49
50void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
51{
52 m_utf8Mode = utf8;
53 m_re = NULL;
54 m_sd = NULL;
55 m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY;
56 if(caseless)
57 m_iOptions |= PCRE_CASELESS;
58 if (m_utf8Mode == forceUtf8)
59 {
60 if (IsUtf8Supported())
61 m_iOptions |= PCRE_UTF8;
62 if (AreUnicodePropertiesSupported())
63 m_iOptions |= PCRE_UCP;
64 }
65
66 m_offset = 0;
67 m_jitCompiled = false;
68 m_bMatched = false;
69 m_iMatchCount = 0;
70 m_jitStack = NULL;
71
72 memset(m_iOvector, 0, sizeof(m_iOvector));
73}
74
75CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/)
76{
77 if (utf8 == autoUtf8)
78 utf8 = requireUtf8(re) ? forceUtf8 : asciiOnly;
79
80 InitValues(caseless, utf8);
81 RegComp(re, study);
82}
83
84bool CRegExp::requireUtf8(const std::string& regexp)
85{
86 // enable UTF-8 mode if regexp string has UTF-8 multibyte sequences
87 if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string)
88 return true;
89
90 // check for explicit Unicode Properties (\p, \P, \X) and for Unicode character codes (greater than 0xFF) in form \x{hhh..}
91 // note: PCRE change meaning of \w, \s, \d (and \W, \S, \D) when Unicode Properties are enabled,
92 // but in auto mode we enable UNP for US-ASCII regexp only if regexp contains explicit \p, \P, \X or Unicode character code
93 const char* const regexpC = regexp.c_str();
94 const size_t len = regexp.length();
95 size_t pos = 0;
96
97 while (pos < len)
98 {
99 const char chr = regexpC[pos];
100 if (chr == '\\')
101 {
102 const char nextChr = regexpC[pos + 1];
103
104 if (nextChr == 'p' || nextChr == 'P' || nextChr == 'X')
105 return true; // found Unicode Properties
106 else if (nextChr == 'Q')
107 pos = regexp.find("\\E", pos + 2); // skip all literals in "\Q...\E"
108 else if (nextChr == 'x' && regexpC[pos + 2] == '{')
109 { // Unicode character with hex code
110 if (readCharXCode(regexp, pos) >= 0x100)
111 return true; // found Unicode character code
112 }
113 else if (nextChr == '\\' || nextChr == '(' || nextChr == ')'
114 || nextChr == '[' || nextChr == ']')
115 pos++; // exclude next character from analyze
116
117 } // chr != '\\'
118 else if (chr == '(' && regexpC[pos + 1] == '?' && regexpC[pos + 2] == '#') // comment in regexp
119 pos = regexp.find(')', pos); // skip comment
120 else if (chr == '[')
121 {
122 if (isCharClassWithUnicode(regexp, pos))
123 return true;
124 }
125
126 if (pos == std::string::npos) // check results of regexp.find() and isCharClassWithUnicode
127 return false;
128
129 pos++;
130 }
131
132 // no Unicode Properties was found
133 return false;
134}
135
136inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos)
137{
138 // read hex character code in form "\x{hh..}"
139 // 'pos' must point to '\'
140 if (pos >= regexp.length())
141 return -1;
142 const char* const regexpC = regexp.c_str();
143 if (regexpC[pos] != '\\' || regexpC[pos + 1] != 'x' || regexpC[pos + 2] != '{')
144 return -1;
145
146 pos++;
147 const size_t startPos = pos; // 'startPos' points to 'x'
148 const size_t closingBracketPos = regexp.find('}', startPos + 2);
149 if (closingBracketPos == std::string::npos)
150 return 0; // return character zero code, leave 'pos' at 'x'
151
152 pos++; // 'pos' points to '{'
153 int chCode = 0;
154 while (++pos < closingBracketPos)
155 {
156 const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]);
157 if (xdigitVal >= 0)
158 chCode = chCode * 16 + xdigitVal;
159 else
160 { // found non-hexdigit
161 pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code
162 return 0; // return character zero code
163 }
164 }
165
166 return chCode;
167}
168
169bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos)
170{
171 const char* const regexpC = regexp.c_str();
172 const size_t len = regexp.length();
173 if (pos > len || regexpC[pos] != '[')
174 return false;
175
176 // look for Unicode character code "\x{hhh..}" and Unicode properties "\P", "\p" and "\X"
177 // find end (terminating ']') of character class (like "[a-h45]")
178 // detect nested POSIX classes like "[[:lower:]]" and escaped brackets like "[\]]"
179 bool needUnicode = false;
180 while (++pos < len)
181 {
182 if (regexpC[pos] == '[' && regexpC[pos + 1] == ':')
183 { // possible POSIX character class, like "[:alpha:]"
184 const size_t nextClosingBracketPos = regexp.find(']', pos + 2); // don't care about "\]", as it produce error if used inside POSIX char class
185
186 if (nextClosingBracketPos == std::string::npos)
187 { // error in regexp: no closing ']' for character class
188 pos = std::string::npos;
189 return needUnicode;
190 }
191 else if (regexpC[nextClosingBracketPos - 1] == ':')
192 pos = nextClosingBracketPos; // skip POSIX character class
193 // if ":]" is not found, process "[:..." as part of normal character class
194 }
195 else if (regexpC[pos] == ']')
196 return needUnicode; // end of character class
197 else if (regexpC[pos] == '\\')
198 {
199 const char nextChar = regexpC[pos + 1];
200 if (nextChar == ']' || nextChar == '[')
201 pos++; // skip next character
202 else if (nextChar == 'Q')
203 {
204 pos = regexp.find("\\E", pos + 2);
205 if (pos == std::string::npos)
206 return needUnicode; // error in regexp: no closing "\E" after "\Q" in character class
207 else
208 pos++; // skip "\E"
209 }
210 else if (nextChar == 'p' || nextChar == 'P' || nextChar == 'X')
211 needUnicode = true; // don't care about property name as it can contain only ASCII chars
212 else if (nextChar == 'x')
213 {
214 if (readCharXCode(regexp, pos) >= 0x100)
215 needUnicode = true;
216 }
217 }
218 }
219 pos = std::string::npos; // closing square bracket was not found
220
221 return needUnicode;
222}
223
224
225CRegExp::CRegExp(const CRegExp& re)
226{
227 m_re = NULL;
228 m_sd = NULL;
229 m_jitStack = NULL;
230 m_utf8Mode = re.m_utf8Mode;
231 m_iOptions = re.m_iOptions;
232 *this = re;
233}
234
235CRegExp& CRegExp::operator=(const CRegExp& re)
236{
237 size_t size;
238 Cleanup();
239 m_jitCompiled = false;
240 m_pattern = re.m_pattern;
241 if (re.m_re)
242 {
243 if (pcre_fullinfo(re.m_re, NULL, PCRE_INFO_SIZE, &size) >= 0)
244 {
245 if ((m_re = (pcre*)malloc(size)))
246 {
247 memcpy(m_re, re.m_re, size);
248 memcpy(m_iOvector, re.m_iOvector, OVECCOUNT*sizeof(int));
249 m_offset = re.m_offset;
250 m_iMatchCount = re.m_iMatchCount;
251 m_bMatched = re.m_bMatched;
252 m_subject = re.m_subject;
253 m_iOptions = re.m_iOptions;
254 }
255 else
256 CLog::Log(LOGFATAL, "%s: Failed to allocate memory", __FUNCTION__);
257 }
258 }
259 return *this;
260}
261
262CRegExp::~CRegExp()
263{
264 Cleanup();
265}
266
267bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/)
268{
269 if (!re)
270 return false;
271
272 m_offset = 0;
273 m_jitCompiled = false;
274 m_bMatched = false;
275 m_iMatchCount = 0;
276 const char *errMsg = NULL;
277 int errOffset = 0;
278 int options = m_iOptions;
279 if (m_utf8Mode == autoUtf8 && requireUtf8(re))
280 options |= (IsUtf8Supported() ? PCRE_UTF8 : 0) | (AreUnicodePropertiesSupported() ? PCRE_UCP : 0);
281
282 Cleanup();
283
284 m_re = pcre_compile(re, options, &errMsg, &errOffset, NULL);
285 if (!m_re)
286 {
287 m_pattern.clear();
288 CLog::Log(LOGERROR, "PCRE: %s. Compilation failed at offset %d in expression '%s'",
289 errMsg, errOffset, re);
290 return false;
291 }
292
293 m_pattern = re;
294
295 if (study)
296 {
297 const bool jitCompile = (study == StudyWithJitComp) && IsJitSupported();
298 const int studyOptions = jitCompile ? PCRE_STUDY_JIT_COMPILE : 0;
299
300 m_sd = pcre_study(m_re, studyOptions, &errMsg);
301 if (errMsg != NULL)
302 {
303 CLog::Log(LOGWARNING, "%s: PCRE error \"%s\" while studying expression", __FUNCTION__, errMsg);
304 if (m_sd != NULL)
305 {
306 pcre_free_study(m_sd);
307 m_sd = NULL;
308 }
309 }
310 else if (jitCompile)
311 {
312 int jitPresent = 0;
313 m_jitCompiled = (pcre_fullinfo(m_re, m_sd, PCRE_INFO_JIT, &jitPresent) == 0 && jitPresent == 1);
314 }
315 }
316
317 return true;
318}
319
320int CRegExp::RegFind(const char *str, unsigned int startoffset /*= 0*/, int maxNumberOfCharsToTest /*= -1*/)
321{
322 return PrivateRegFind(strlen(str), str, startoffset, maxNumberOfCharsToTest);
323}
324
325int CRegExp::PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset /* = 0*/, int maxNumberOfCharsToTest /*= -1*/)
326{
327 m_offset = 0;
328 m_bMatched = false;
329 m_iMatchCount = 0;
330
331 if (!m_re)
332 {
333 CLog::Log(LOGERROR, "PCRE: Called before compilation");
334 return -1;
335 }
336
337 if (!str)
338 {
339 CLog::Log(LOGERROR, "PCRE: Called without a string to match");
340 return -1;
341 }
342
343 if (startoffset > bufferLen)
344 {
345 CLog::Log(LOGERROR, "%s: startoffset is beyond end of string to match", __FUNCTION__);
346 return -1;
347 }
348
349#ifdef PCRE_HAS_JIT_CODE
350 if (m_jitCompiled && !m_jitStack)
351 {
352 m_jitStack = pcre_jit_stack_alloc(32*1024, 512*1024);
353 if (m_jitStack == NULL)
354 CLog::Log(LOGWARNING, "%s: can't allocate address space for JIT stack", __FUNCTION__);
355
356 pcre_assign_jit_stack(m_sd, NULL, m_jitStack);
357 }
358#endif
359
360 if (maxNumberOfCharsToTest >= 0)
361 bufferLen = std::min<size_t>(bufferLen, startoffset + maxNumberOfCharsToTest);
362
363 m_subject.assign(str + startoffset, bufferLen - startoffset);
364 int rc = pcre_exec(m_re, NULL, m_subject.c_str(), m_subject.length(), 0, 0, m_iOvector, OVECCOUNT);
365
366 if (rc<1)
367 {
368 static const int fragmentLen = 80; // length of excerpt before erroneous char for log
369 switch(rc)
370 {
371 case PCRE_ERROR_NOMATCH:
372 return -1;
373
374 case PCRE_ERROR_MATCHLIMIT:
375 CLog::Log(LOGERROR, "PCRE: Match limit reached");
376 return -1;
377
378#ifdef PCRE_ERROR_SHORTUTF8
379 case PCRE_ERROR_SHORTUTF8:
380 {
381 const size_t startPos = (m_subject.length() > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_subject.length() - fragmentLen) : 0;
382 if (startPos != std::string::npos)
383 CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string. Text before bad character: \"%s\"", m_subject.substr(startPos).c_str());
384 else
385 CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string");
386 return -1;
387 }
388#endif
389 case PCRE_ERROR_BADUTF8:
390 {
391 const size_t startPos = (m_iOvector[0] > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_iOvector[0] - fragmentLen) : 0;
392 if (m_iOvector[0] >= 0 && startPos != std::string::npos)
393 CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: %d, position: %d. Text before bad char: \"%s\"", m_iOvector[1], m_iOvector[0], m_subject.substr(startPos, m_iOvector[0] - startPos + 1).c_str());
394 else
395 CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: %d, position: %d", m_iOvector[1], m_iOvector[0]);
396 return -1;
397 }
398 case PCRE_ERROR_BADUTF8_OFFSET:
399 CLog::Log(LOGERROR, "PCRE: Offset is pointing to the middle of UTF-8 character");
400 return -1;
401
402 default:
403 CLog::Log(LOGERROR, "PCRE: Unknown error: %d", rc);
404 return -1;
405 }
406 }
407 m_offset = startoffset;
408 m_bMatched = true;
409 m_iMatchCount = rc;
410 return m_iOvector[0] + m_offset;
411}
412
413int CRegExp::GetCaptureTotal() const
414{
415 int c = -1;
416 if (m_re)
417 pcre_fullinfo(m_re, NULL, PCRE_INFO_CAPTURECOUNT, &c);
418 return c;
419}
420
421std::string CRegExp::GetReplaceString(const std::string& sReplaceExp) const
422{
423 if (!m_bMatched || sReplaceExp.empty())
424 return "";
425
426 const char* const expr = sReplaceExp.c_str();
427
428 size_t pos = sReplaceExp.find_first_of("\\&");
429 std::string result(sReplaceExp, 0, pos);
430 result.reserve(sReplaceExp.size()); // very rough estimate
431
432 while(pos != std::string::npos)
433 {
434 if (expr[pos] == '\\')
435 {
436 // string is null-terminated and current char isn't null, so it's safe to advance to next char
437 pos++; // advance to next char
438 const char nextChar = expr[pos];
439 if (nextChar == '&' || nextChar == '\\')
440 { // this is "\&" or "\\" combination
441 result.push_back(nextChar); // add '&' or '\' to result
442 pos++;
443 }
444 else if (isdigit(nextChar))
445 { // this is "\0" - "\9" combination
446 int subNum = nextChar - '0';
447 pos++; // advance to second next char
448 const char secondNextChar = expr[pos];
449 if (isdigit(secondNextChar))
450 { // this is "\00" - "\99" combination
451 subNum = subNum * 10 + (secondNextChar - '0');
452 pos++;
453 }
454 result.append(GetMatch(subNum));
455 }
456 }
457 else
458 { // '&' char
459 result.append(GetMatch(0));
460 pos++;
461 }
462
463 const size_t nextPos = sReplaceExp.find_first_of("\\&", pos);
464 result.append(sReplaceExp, pos, nextPos - pos);
465 pos = nextPos;
466 }
467
468 return result;
469}
470
471int CRegExp::GetSubStart(int iSub) const
472{
473 if (!IsValidSubNumber(iSub))
474 return -1;
475
476 return m_iOvector[iSub*2] + m_offset;
477}
478
479int CRegExp::GetSubStart(const std::string& subName) const
480{
481 return GetSubStart(GetNamedSubPatternNumber(subName.c_str()));
482}
483
484int CRegExp::GetSubLength(int iSub) const
485{
486 if (!IsValidSubNumber(iSub))
487 return -1;
488
489 return m_iOvector[(iSub*2)+1] - m_iOvector[(iSub*2)];
490}
491
492int CRegExp::GetSubLength(const std::string& subName) const
493{
494 return GetSubLength(GetNamedSubPatternNumber(subName.c_str()));
495}
496
497std::string CRegExp::GetMatch(int iSub /* = 0 */) const
498{
499 if (!IsValidSubNumber(iSub))
500 return "";
501
502 int pos = m_iOvector[(iSub*2)];
503 int len = m_iOvector[(iSub*2)+1] - pos;
504 if (pos < 0 || len <= 0)
505 return "";
506
507 return m_subject.substr(pos, len);
508}
509
510std::string CRegExp::GetMatch(const std::string& subName) const
511{
512 return GetMatch(GetNamedSubPatternNumber(subName.c_str()));
513}
514
515bool CRegExp::GetNamedSubPattern(const char* strName, std::string& strMatch) const
516{
517 strMatch.clear();
518 int iSub = pcre_get_stringnumber(m_re, strName);
519 if (!IsValidSubNumber(iSub))
520 return false;
521 strMatch = GetMatch(iSub);
522 return true;
523}
524
525int CRegExp::GetNamedSubPatternNumber(const char* strName) const
526{
527 return pcre_get_stringnumber(m_re, strName);
528}
529
530void CRegExp::DumpOvector(int iLog /* = LOGDEBUG */)
531{
532 if (iLog < LOGDEBUG || iLog > LOGNONE)
533 return;
534
535 std::string str = "{";
536 int size = GetSubCount(); // past the subpatterns is junk
537 for (int i = 0; i <= size; i++)
538 {
539 std::string t = StringUtils::Format("[%i,%i]", m_iOvector[(i*2)], m_iOvector[(i*2)+1]);
540 if (i != size)
541 t += ",";
542 str += t;
543 }
544 str += "}";
545 CLog::Log(iLog, "regexp ovector=%s", str.c_str());
546}
547
548void CRegExp::Cleanup()
549{
550 if (m_re)
551 {
552 pcre_free(m_re);
553 m_re = NULL;
554 }
555
556 if (m_sd)
557 {
558 pcre_free_study(m_sd);
559 m_sd = NULL;
560 }
561
562#ifdef PCRE_HAS_JIT_CODE
563 if (m_jitStack)
564 {
565 pcre_jit_stack_free(m_jitStack);
566 m_jitStack = NULL;
567 }
568#endif
569}
570
571inline bool CRegExp::IsValidSubNumber(int iSub) const
572{
573 return iSub >= 0 && iSub <= m_iMatchCount && iSub <= m_MaxNumOfBackrefrences;
574}
575
576
577bool CRegExp::IsUtf8Supported(void)
578{
579 if (m_Utf8Supported == -1)
580 {
581 if (pcre_config(PCRE_CONFIG_UTF8, &m_Utf8Supported) != 0)
582 m_Utf8Supported = 0;
583 }
584
585 return m_Utf8Supported == 1;
586}
587
588bool CRegExp::AreUnicodePropertiesSupported(void)
589{
590#if defined(PCRE_CONFIG_UNICODE_PROPERTIES) && PCRE_UCP != 0
591 if (m_UcpSupported == -1)
592 {
593 if (pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &m_UcpSupported) != 0)
594 m_UcpSupported = 0;
595 }
596#endif
597
598 return m_UcpSupported == 1;
599}
600
601bool CRegExp::LogCheckUtf8Support(void)
602{
603 bool utf8FullSupport = true;
604
605 if (!CRegExp::IsUtf8Supported())
606 {
607 utf8FullSupport = false;
608 CLog::Log(LOGWARNING, "UTF-8 is not supported in PCRE lib, support for national symbols is limited!");
609 }
610
611 if (!CRegExp::AreUnicodePropertiesSupported())
612 {
613 utf8FullSupport = false;
614 CLog::Log(LOGWARNING, "Unicode properties are not enabled in PCRE lib, support for national symbols may be limited!");
615 }
616
617 if (!utf8FullSupport)
618 {
619 CLog::Log(LOGINFO,
620 "Consider installing PCRE lib version 8.10 or later with enabled Unicode properties "
621 "and UTF-8 support. Your PCRE lib version: %s",
622 PCRE::pcre_version());
623#if PCRE_UCP == 0
624 CLog::Log(LOGINFO, "You will need to rebuild XBMC after PCRE lib update.");
625#endif
626 }
627
628 return utf8FullSupport;
629}
630
631bool CRegExp::IsJitSupported(void)
632{
633 if (m_JitSupported == -1)
634 {
635#ifdef PCRE_HAS_JIT_CODE
636 if (pcre_config(PCRE_CONFIG_JIT, &m_JitSupported) != 0)
637#endif
638 m_JitSupported = 0;
639 }
640
641 return m_JitSupported == 1;
642}