diff options
Diffstat (limited to 'xbmc/utils/RegExp.cpp')
| -rw-r--r-- | xbmc/utils/RegExp.cpp | 642 |
1 files changed, 642 insertions, 0 deletions
diff --git a/xbmc/utils/RegExp.cpp b/xbmc/utils/RegExp.cpp new file mode 100644 index 0000000..b6fe9d5 --- /dev/null +++ b/xbmc/utils/RegExp.cpp | |||
| @@ -0,0 +1,642 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2005-2018 Team Kodi | ||
| 3 | * This file is part of Kodi - https://kodi.tv | ||
| 4 | * | ||
| 5 | * SPDX-License-Identifier: GPL-2.0-or-later | ||
| 6 | * See LICENSES/README.md for more information. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include "RegExp.h" | ||
| 10 | |||
| 11 | #include "log.h" | ||
| 12 | #include "utils/StringUtils.h" | ||
| 13 | #include "utils/Utf8Utils.h" | ||
| 14 | |||
| 15 | #include <algorithm> | ||
| 16 | #include <stdlib.h> | ||
| 17 | #include <string.h> | ||
| 18 | |||
| 19 | using namespace PCRE; | ||
| 20 | |||
| 21 | #ifndef PCRE_UCP | ||
| 22 | #define PCRE_UCP 0 | ||
| 23 | #endif // PCRE_UCP | ||
| 24 | |||
| 25 | #ifdef PCRE_CONFIG_JIT | ||
| 26 | #define PCRE_HAS_JIT_CODE 1 | ||
| 27 | #endif | ||
| 28 | |||
| 29 | #ifndef PCRE_STUDY_JIT_COMPILE | ||
| 30 | #define PCRE_STUDY_JIT_COMPILE 0 | ||
| 31 | #endif | ||
| 32 | #ifndef PCRE_INFO_JIT | ||
| 33 | // some unused number | ||
| 34 | #define PCRE_INFO_JIT 2048 | ||
| 35 | #endif | ||
| 36 | #ifndef PCRE_HAS_JIT_CODE | ||
| 37 | #define pcre_free_study(x) pcre_free((x)) | ||
| 38 | #endif | ||
| 39 | |||
| 40 | int CRegExp::m_Utf8Supported = -1; | ||
| 41 | int CRegExp::m_UcpSupported = -1; | ||
| 42 | int CRegExp::m_JitSupported = -1; | ||
| 43 | |||
| 44 | |||
| 45 | CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/) | ||
| 46 | { | ||
| 47 | InitValues(caseless, utf8); | ||
| 48 | } | ||
| 49 | |||
| 50 | void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/) | ||
| 51 | { | ||
| 52 | m_utf8Mode = utf8; | ||
| 53 | m_re = NULL; | ||
| 54 | m_sd = NULL; | ||
| 55 | m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY; | ||
| 56 | if(caseless) | ||
| 57 | m_iOptions |= PCRE_CASELESS; | ||
| 58 | if (m_utf8Mode == forceUtf8) | ||
| 59 | { | ||
| 60 | if (IsUtf8Supported()) | ||
| 61 | m_iOptions |= PCRE_UTF8; | ||
| 62 | if (AreUnicodePropertiesSupported()) | ||
| 63 | m_iOptions |= PCRE_UCP; | ||
| 64 | } | ||
| 65 | |||
| 66 | m_offset = 0; | ||
| 67 | m_jitCompiled = false; | ||
| 68 | m_bMatched = false; | ||
| 69 | m_iMatchCount = 0; | ||
| 70 | m_jitStack = NULL; | ||
| 71 | |||
| 72 | memset(m_iOvector, 0, sizeof(m_iOvector)); | ||
| 73 | } | ||
| 74 | |||
| 75 | CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/) | ||
| 76 | { | ||
| 77 | if (utf8 == autoUtf8) | ||
| 78 | utf8 = requireUtf8(re) ? forceUtf8 : asciiOnly; | ||
| 79 | |||
| 80 | InitValues(caseless, utf8); | ||
| 81 | RegComp(re, study); | ||
| 82 | } | ||
| 83 | |||
| 84 | bool CRegExp::requireUtf8(const std::string& regexp) | ||
| 85 | { | ||
| 86 | // enable UTF-8 mode if regexp string has UTF-8 multibyte sequences | ||
| 87 | if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string) | ||
| 88 | return true; | ||
| 89 | |||
| 90 | // check for explicit Unicode Properties (\p, \P, \X) and for Unicode character codes (greater than 0xFF) in form \x{hhh..} | ||
| 91 | // note: PCRE change meaning of \w, \s, \d (and \W, \S, \D) when Unicode Properties are enabled, | ||
| 92 | // but in auto mode we enable UNP for US-ASCII regexp only if regexp contains explicit \p, \P, \X or Unicode character code | ||
| 93 | const char* const regexpC = regexp.c_str(); | ||
| 94 | const size_t len = regexp.length(); | ||
| 95 | size_t pos = 0; | ||
| 96 | |||
| 97 | while (pos < len) | ||
| 98 | { | ||
| 99 | const char chr = regexpC[pos]; | ||
| 100 | if (chr == '\\') | ||
| 101 | { | ||
| 102 | const char nextChr = regexpC[pos + 1]; | ||
| 103 | |||
| 104 | if (nextChr == 'p' || nextChr == 'P' || nextChr == 'X') | ||
| 105 | return true; // found Unicode Properties | ||
| 106 | else if (nextChr == 'Q') | ||
| 107 | pos = regexp.find("\\E", pos + 2); // skip all literals in "\Q...\E" | ||
| 108 | else if (nextChr == 'x' && regexpC[pos + 2] == '{') | ||
| 109 | { // Unicode character with hex code | ||
| 110 | if (readCharXCode(regexp, pos) >= 0x100) | ||
| 111 | return true; // found Unicode character code | ||
| 112 | } | ||
| 113 | else if (nextChr == '\\' || nextChr == '(' || nextChr == ')' | ||
| 114 | || nextChr == '[' || nextChr == ']') | ||
| 115 | pos++; // exclude next character from analyze | ||
| 116 | |||
| 117 | } // chr != '\\' | ||
| 118 | else if (chr == '(' && regexpC[pos + 1] == '?' && regexpC[pos + 2] == '#') // comment in regexp | ||
| 119 | pos = regexp.find(')', pos); // skip comment | ||
| 120 | else if (chr == '[') | ||
| 121 | { | ||
| 122 | if (isCharClassWithUnicode(regexp, pos)) | ||
| 123 | return true; | ||
| 124 | } | ||
| 125 | |||
| 126 | if (pos == std::string::npos) // check results of regexp.find() and isCharClassWithUnicode | ||
| 127 | return false; | ||
| 128 | |||
| 129 | pos++; | ||
| 130 | } | ||
| 131 | |||
| 132 | // no Unicode Properties was found | ||
| 133 | return false; | ||
| 134 | } | ||
| 135 | |||
| 136 | inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos) | ||
| 137 | { | ||
| 138 | // read hex character code in form "\x{hh..}" | ||
| 139 | // 'pos' must point to '\' | ||
| 140 | if (pos >= regexp.length()) | ||
| 141 | return -1; | ||
| 142 | const char* const regexpC = regexp.c_str(); | ||
| 143 | if (regexpC[pos] != '\\' || regexpC[pos + 1] != 'x' || regexpC[pos + 2] != '{') | ||
| 144 | return -1; | ||
| 145 | |||
| 146 | pos++; | ||
| 147 | const size_t startPos = pos; // 'startPos' points to 'x' | ||
| 148 | const size_t closingBracketPos = regexp.find('}', startPos + 2); | ||
| 149 | if (closingBracketPos == std::string::npos) | ||
| 150 | return 0; // return character zero code, leave 'pos' at 'x' | ||
| 151 | |||
| 152 | pos++; // 'pos' points to '{' | ||
| 153 | int chCode = 0; | ||
| 154 | while (++pos < closingBracketPos) | ||
| 155 | { | ||
| 156 | const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]); | ||
| 157 | if (xdigitVal >= 0) | ||
| 158 | chCode = chCode * 16 + xdigitVal; | ||
| 159 | else | ||
| 160 | { // found non-hexdigit | ||
| 161 | pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code | ||
| 162 | return 0; // return character zero code | ||
| 163 | } | ||
| 164 | } | ||
| 165 | |||
| 166 | return chCode; | ||
| 167 | } | ||
| 168 | |||
| 169 | bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos) | ||
| 170 | { | ||
| 171 | const char* const regexpC = regexp.c_str(); | ||
| 172 | const size_t len = regexp.length(); | ||
| 173 | if (pos > len || regexpC[pos] != '[') | ||
| 174 | return false; | ||
| 175 | |||
| 176 | // look for Unicode character code "\x{hhh..}" and Unicode properties "\P", "\p" and "\X" | ||
| 177 | // find end (terminating ']') of character class (like "[a-h45]") | ||
| 178 | // detect nested POSIX classes like "[[:lower:]]" and escaped brackets like "[\]]" | ||
| 179 | bool needUnicode = false; | ||
| 180 | while (++pos < len) | ||
| 181 | { | ||
| 182 | if (regexpC[pos] == '[' && regexpC[pos + 1] == ':') | ||
| 183 | { // possible POSIX character class, like "[:alpha:]" | ||
| 184 | const size_t nextClosingBracketPos = regexp.find(']', pos + 2); // don't care about "\]", as it produce error if used inside POSIX char class | ||
| 185 | |||
| 186 | if (nextClosingBracketPos == std::string::npos) | ||
| 187 | { // error in regexp: no closing ']' for character class | ||
| 188 | pos = std::string::npos; | ||
| 189 | return needUnicode; | ||
| 190 | } | ||
| 191 | else if (regexpC[nextClosingBracketPos - 1] == ':') | ||
| 192 | pos = nextClosingBracketPos; // skip POSIX character class | ||
| 193 | // if ":]" is not found, process "[:..." as part of normal character class | ||
| 194 | } | ||
| 195 | else if (regexpC[pos] == ']') | ||
| 196 | return needUnicode; // end of character class | ||
| 197 | else if (regexpC[pos] == '\\') | ||
| 198 | { | ||
| 199 | const char nextChar = regexpC[pos + 1]; | ||
| 200 | if (nextChar == ']' || nextChar == '[') | ||
| 201 | pos++; // skip next character | ||
| 202 | else if (nextChar == 'Q') | ||
| 203 | { | ||
| 204 | pos = regexp.find("\\E", pos + 2); | ||
| 205 | if (pos == std::string::npos) | ||
| 206 | return needUnicode; // error in regexp: no closing "\E" after "\Q" in character class | ||
| 207 | else | ||
| 208 | pos++; // skip "\E" | ||
| 209 | } | ||
| 210 | else if (nextChar == 'p' || nextChar == 'P' || nextChar == 'X') | ||
| 211 | needUnicode = true; // don't care about property name as it can contain only ASCII chars | ||
| 212 | else if (nextChar == 'x') | ||
| 213 | { | ||
| 214 | if (readCharXCode(regexp, pos) >= 0x100) | ||
| 215 | needUnicode = true; | ||
| 216 | } | ||
| 217 | } | ||
| 218 | } | ||
| 219 | pos = std::string::npos; // closing square bracket was not found | ||
| 220 | |||
| 221 | return needUnicode; | ||
| 222 | } | ||
| 223 | |||
| 224 | |||
| 225 | CRegExp::CRegExp(const CRegExp& re) | ||
| 226 | { | ||
| 227 | m_re = NULL; | ||
| 228 | m_sd = NULL; | ||
| 229 | m_jitStack = NULL; | ||
| 230 | m_utf8Mode = re.m_utf8Mode; | ||
| 231 | m_iOptions = re.m_iOptions; | ||
| 232 | *this = re; | ||
| 233 | } | ||
| 234 | |||
| 235 | CRegExp& CRegExp::operator=(const CRegExp& re) | ||
| 236 | { | ||
| 237 | size_t size; | ||
| 238 | Cleanup(); | ||
| 239 | m_jitCompiled = false; | ||
| 240 | m_pattern = re.m_pattern; | ||
| 241 | if (re.m_re) | ||
| 242 | { | ||
| 243 | if (pcre_fullinfo(re.m_re, NULL, PCRE_INFO_SIZE, &size) >= 0) | ||
| 244 | { | ||
| 245 | if ((m_re = (pcre*)malloc(size))) | ||
| 246 | { | ||
| 247 | memcpy(m_re, re.m_re, size); | ||
| 248 | memcpy(m_iOvector, re.m_iOvector, OVECCOUNT*sizeof(int)); | ||
| 249 | m_offset = re.m_offset; | ||
| 250 | m_iMatchCount = re.m_iMatchCount; | ||
| 251 | m_bMatched = re.m_bMatched; | ||
| 252 | m_subject = re.m_subject; | ||
| 253 | m_iOptions = re.m_iOptions; | ||
| 254 | } | ||
| 255 | else | ||
| 256 | CLog::Log(LOGFATAL, "%s: Failed to allocate memory", __FUNCTION__); | ||
| 257 | } | ||
| 258 | } | ||
| 259 | return *this; | ||
| 260 | } | ||
| 261 | |||
| 262 | CRegExp::~CRegExp() | ||
| 263 | { | ||
| 264 | Cleanup(); | ||
| 265 | } | ||
| 266 | |||
| 267 | bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/) | ||
| 268 | { | ||
| 269 | if (!re) | ||
| 270 | return false; | ||
| 271 | |||
| 272 | m_offset = 0; | ||
| 273 | m_jitCompiled = false; | ||
| 274 | m_bMatched = false; | ||
| 275 | m_iMatchCount = 0; | ||
| 276 | const char *errMsg = NULL; | ||
| 277 | int errOffset = 0; | ||
| 278 | int options = m_iOptions; | ||
| 279 | if (m_utf8Mode == autoUtf8 && requireUtf8(re)) | ||
| 280 | options |= (IsUtf8Supported() ? PCRE_UTF8 : 0) | (AreUnicodePropertiesSupported() ? PCRE_UCP : 0); | ||
| 281 | |||
| 282 | Cleanup(); | ||
| 283 | |||
| 284 | m_re = pcre_compile(re, options, &errMsg, &errOffset, NULL); | ||
| 285 | if (!m_re) | ||
| 286 | { | ||
| 287 | m_pattern.clear(); | ||
| 288 | CLog::Log(LOGERROR, "PCRE: %s. Compilation failed at offset %d in expression '%s'", | ||
| 289 | errMsg, errOffset, re); | ||
| 290 | return false; | ||
| 291 | } | ||
| 292 | |||
| 293 | m_pattern = re; | ||
| 294 | |||
| 295 | if (study) | ||
| 296 | { | ||
| 297 | const bool jitCompile = (study == StudyWithJitComp) && IsJitSupported(); | ||
| 298 | const int studyOptions = jitCompile ? PCRE_STUDY_JIT_COMPILE : 0; | ||
| 299 | |||
| 300 | m_sd = pcre_study(m_re, studyOptions, &errMsg); | ||
| 301 | if (errMsg != NULL) | ||
| 302 | { | ||
| 303 | CLog::Log(LOGWARNING, "%s: PCRE error \"%s\" while studying expression", __FUNCTION__, errMsg); | ||
| 304 | if (m_sd != NULL) | ||
| 305 | { | ||
| 306 | pcre_free_study(m_sd); | ||
| 307 | m_sd = NULL; | ||
| 308 | } | ||
| 309 | } | ||
| 310 | else if (jitCompile) | ||
| 311 | { | ||
| 312 | int jitPresent = 0; | ||
| 313 | m_jitCompiled = (pcre_fullinfo(m_re, m_sd, PCRE_INFO_JIT, &jitPresent) == 0 && jitPresent == 1); | ||
| 314 | } | ||
| 315 | } | ||
| 316 | |||
| 317 | return true; | ||
| 318 | } | ||
| 319 | |||
| 320 | int CRegExp::RegFind(const char *str, unsigned int startoffset /*= 0*/, int maxNumberOfCharsToTest /*= -1*/) | ||
| 321 | { | ||
| 322 | return PrivateRegFind(strlen(str), str, startoffset, maxNumberOfCharsToTest); | ||
| 323 | } | ||
| 324 | |||
| 325 | int CRegExp::PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset /* = 0*/, int maxNumberOfCharsToTest /*= -1*/) | ||
| 326 | { | ||
| 327 | m_offset = 0; | ||
| 328 | m_bMatched = false; | ||
| 329 | m_iMatchCount = 0; | ||
| 330 | |||
| 331 | if (!m_re) | ||
| 332 | { | ||
| 333 | CLog::Log(LOGERROR, "PCRE: Called before compilation"); | ||
| 334 | return -1; | ||
| 335 | } | ||
| 336 | |||
| 337 | if (!str) | ||
| 338 | { | ||
| 339 | CLog::Log(LOGERROR, "PCRE: Called without a string to match"); | ||
| 340 | return -1; | ||
| 341 | } | ||
| 342 | |||
| 343 | if (startoffset > bufferLen) | ||
| 344 | { | ||
| 345 | CLog::Log(LOGERROR, "%s: startoffset is beyond end of string to match", __FUNCTION__); | ||
| 346 | return -1; | ||
| 347 | } | ||
| 348 | |||
| 349 | #ifdef PCRE_HAS_JIT_CODE | ||
| 350 | if (m_jitCompiled && !m_jitStack) | ||
| 351 | { | ||
| 352 | m_jitStack = pcre_jit_stack_alloc(32*1024, 512*1024); | ||
| 353 | if (m_jitStack == NULL) | ||
| 354 | CLog::Log(LOGWARNING, "%s: can't allocate address space for JIT stack", __FUNCTION__); | ||
| 355 | |||
| 356 | pcre_assign_jit_stack(m_sd, NULL, m_jitStack); | ||
| 357 | } | ||
| 358 | #endif | ||
| 359 | |||
| 360 | if (maxNumberOfCharsToTest >= 0) | ||
| 361 | bufferLen = std::min<size_t>(bufferLen, startoffset + maxNumberOfCharsToTest); | ||
| 362 | |||
| 363 | m_subject.assign(str + startoffset, bufferLen - startoffset); | ||
| 364 | int rc = pcre_exec(m_re, NULL, m_subject.c_str(), m_subject.length(), 0, 0, m_iOvector, OVECCOUNT); | ||
| 365 | |||
| 366 | if (rc<1) | ||
| 367 | { | ||
| 368 | static const int fragmentLen = 80; // length of excerpt before erroneous char for log | ||
| 369 | switch(rc) | ||
| 370 | { | ||
| 371 | case PCRE_ERROR_NOMATCH: | ||
| 372 | return -1; | ||
| 373 | |||
| 374 | case PCRE_ERROR_MATCHLIMIT: | ||
| 375 | CLog::Log(LOGERROR, "PCRE: Match limit reached"); | ||
| 376 | return -1; | ||
| 377 | |||
| 378 | #ifdef PCRE_ERROR_SHORTUTF8 | ||
| 379 | case PCRE_ERROR_SHORTUTF8: | ||
| 380 | { | ||
| 381 | const size_t startPos = (m_subject.length() > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_subject.length() - fragmentLen) : 0; | ||
| 382 | if (startPos != std::string::npos) | ||
| 383 | CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string. Text before bad character: \"%s\"", m_subject.substr(startPos).c_str()); | ||
| 384 | else | ||
| 385 | CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string"); | ||
| 386 | return -1; | ||
| 387 | } | ||
| 388 | #endif | ||
| 389 | case PCRE_ERROR_BADUTF8: | ||
| 390 | { | ||
| 391 | const size_t startPos = (m_iOvector[0] > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_iOvector[0] - fragmentLen) : 0; | ||
| 392 | if (m_iOvector[0] >= 0 && startPos != std::string::npos) | ||
| 393 | CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: %d, position: %d. Text before bad char: \"%s\"", m_iOvector[1], m_iOvector[0], m_subject.substr(startPos, m_iOvector[0] - startPos + 1).c_str()); | ||
| 394 | else | ||
| 395 | CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: %d, position: %d", m_iOvector[1], m_iOvector[0]); | ||
| 396 | return -1; | ||
| 397 | } | ||
| 398 | case PCRE_ERROR_BADUTF8_OFFSET: | ||
| 399 | CLog::Log(LOGERROR, "PCRE: Offset is pointing to the middle of UTF-8 character"); | ||
| 400 | return -1; | ||
| 401 | |||
| 402 | default: | ||
| 403 | CLog::Log(LOGERROR, "PCRE: Unknown error: %d", rc); | ||
| 404 | return -1; | ||
| 405 | } | ||
| 406 | } | ||
| 407 | m_offset = startoffset; | ||
| 408 | m_bMatched = true; | ||
| 409 | m_iMatchCount = rc; | ||
| 410 | return m_iOvector[0] + m_offset; | ||
| 411 | } | ||
| 412 | |||
| 413 | int CRegExp::GetCaptureTotal() const | ||
| 414 | { | ||
| 415 | int c = -1; | ||
| 416 | if (m_re) | ||
| 417 | pcre_fullinfo(m_re, NULL, PCRE_INFO_CAPTURECOUNT, &c); | ||
| 418 | return c; | ||
| 419 | } | ||
| 420 | |||
| 421 | std::string CRegExp::GetReplaceString(const std::string& sReplaceExp) const | ||
| 422 | { | ||
| 423 | if (!m_bMatched || sReplaceExp.empty()) | ||
| 424 | return ""; | ||
| 425 | |||
| 426 | const char* const expr = sReplaceExp.c_str(); | ||
| 427 | |||
| 428 | size_t pos = sReplaceExp.find_first_of("\\&"); | ||
| 429 | std::string result(sReplaceExp, 0, pos); | ||
| 430 | result.reserve(sReplaceExp.size()); // very rough estimate | ||
| 431 | |||
| 432 | while(pos != std::string::npos) | ||
| 433 | { | ||
| 434 | if (expr[pos] == '\\') | ||
| 435 | { | ||
| 436 | // string is null-terminated and current char isn't null, so it's safe to advance to next char | ||
| 437 | pos++; // advance to next char | ||
| 438 | const char nextChar = expr[pos]; | ||
| 439 | if (nextChar == '&' || nextChar == '\\') | ||
| 440 | { // this is "\&" or "\\" combination | ||
| 441 | result.push_back(nextChar); // add '&' or '\' to result | ||
| 442 | pos++; | ||
| 443 | } | ||
| 444 | else if (isdigit(nextChar)) | ||
| 445 | { // this is "\0" - "\9" combination | ||
| 446 | int subNum = nextChar - '0'; | ||
| 447 | pos++; // advance to second next char | ||
| 448 | const char secondNextChar = expr[pos]; | ||
| 449 | if (isdigit(secondNextChar)) | ||
| 450 | { // this is "\00" - "\99" combination | ||
| 451 | subNum = subNum * 10 + (secondNextChar - '0'); | ||
| 452 | pos++; | ||
| 453 | } | ||
| 454 | result.append(GetMatch(subNum)); | ||
| 455 | } | ||
| 456 | } | ||
| 457 | else | ||
| 458 | { // '&' char | ||
| 459 | result.append(GetMatch(0)); | ||
| 460 | pos++; | ||
| 461 | } | ||
| 462 | |||
| 463 | const size_t nextPos = sReplaceExp.find_first_of("\\&", pos); | ||
| 464 | result.append(sReplaceExp, pos, nextPos - pos); | ||
| 465 | pos = nextPos; | ||
| 466 | } | ||
| 467 | |||
| 468 | return result; | ||
| 469 | } | ||
| 470 | |||
| 471 | int CRegExp::GetSubStart(int iSub) const | ||
| 472 | { | ||
| 473 | if (!IsValidSubNumber(iSub)) | ||
| 474 | return -1; | ||
| 475 | |||
| 476 | return m_iOvector[iSub*2] + m_offset; | ||
| 477 | } | ||
| 478 | |||
| 479 | int CRegExp::GetSubStart(const std::string& subName) const | ||
| 480 | { | ||
| 481 | return GetSubStart(GetNamedSubPatternNumber(subName.c_str())); | ||
| 482 | } | ||
| 483 | |||
| 484 | int CRegExp::GetSubLength(int iSub) const | ||
| 485 | { | ||
| 486 | if (!IsValidSubNumber(iSub)) | ||
| 487 | return -1; | ||
| 488 | |||
| 489 | return m_iOvector[(iSub*2)+1] - m_iOvector[(iSub*2)]; | ||
| 490 | } | ||
| 491 | |||
| 492 | int CRegExp::GetSubLength(const std::string& subName) const | ||
| 493 | { | ||
| 494 | return GetSubLength(GetNamedSubPatternNumber(subName.c_str())); | ||
| 495 | } | ||
| 496 | |||
| 497 | std::string CRegExp::GetMatch(int iSub /* = 0 */) const | ||
| 498 | { | ||
| 499 | if (!IsValidSubNumber(iSub)) | ||
| 500 | return ""; | ||
| 501 | |||
| 502 | int pos = m_iOvector[(iSub*2)]; | ||
| 503 | int len = m_iOvector[(iSub*2)+1] - pos; | ||
| 504 | if (pos < 0 || len <= 0) | ||
| 505 | return ""; | ||
| 506 | |||
| 507 | return m_subject.substr(pos, len); | ||
| 508 | } | ||
| 509 | |||
| 510 | std::string CRegExp::GetMatch(const std::string& subName) const | ||
| 511 | { | ||
| 512 | return GetMatch(GetNamedSubPatternNumber(subName.c_str())); | ||
| 513 | } | ||
| 514 | |||
| 515 | bool CRegExp::GetNamedSubPattern(const char* strName, std::string& strMatch) const | ||
| 516 | { | ||
| 517 | strMatch.clear(); | ||
| 518 | int iSub = pcre_get_stringnumber(m_re, strName); | ||
| 519 | if (!IsValidSubNumber(iSub)) | ||
| 520 | return false; | ||
| 521 | strMatch = GetMatch(iSub); | ||
| 522 | return true; | ||
| 523 | } | ||
| 524 | |||
| 525 | int CRegExp::GetNamedSubPatternNumber(const char* strName) const | ||
| 526 | { | ||
| 527 | return pcre_get_stringnumber(m_re, strName); | ||
| 528 | } | ||
| 529 | |||
| 530 | void CRegExp::DumpOvector(int iLog /* = LOGDEBUG */) | ||
| 531 | { | ||
| 532 | if (iLog < LOGDEBUG || iLog > LOGNONE) | ||
| 533 | return; | ||
| 534 | |||
| 535 | std::string str = "{"; | ||
| 536 | int size = GetSubCount(); // past the subpatterns is junk | ||
| 537 | for (int i = 0; i <= size; i++) | ||
| 538 | { | ||
| 539 | std::string t = StringUtils::Format("[%i,%i]", m_iOvector[(i*2)], m_iOvector[(i*2)+1]); | ||
| 540 | if (i != size) | ||
| 541 | t += ","; | ||
| 542 | str += t; | ||
| 543 | } | ||
| 544 | str += "}"; | ||
| 545 | CLog::Log(iLog, "regexp ovector=%s", str.c_str()); | ||
| 546 | } | ||
| 547 | |||
| 548 | void CRegExp::Cleanup() | ||
| 549 | { | ||
| 550 | if (m_re) | ||
| 551 | { | ||
| 552 | pcre_free(m_re); | ||
| 553 | m_re = NULL; | ||
| 554 | } | ||
| 555 | |||
| 556 | if (m_sd) | ||
| 557 | { | ||
| 558 | pcre_free_study(m_sd); | ||
| 559 | m_sd = NULL; | ||
| 560 | } | ||
| 561 | |||
| 562 | #ifdef PCRE_HAS_JIT_CODE | ||
| 563 | if (m_jitStack) | ||
| 564 | { | ||
| 565 | pcre_jit_stack_free(m_jitStack); | ||
| 566 | m_jitStack = NULL; | ||
| 567 | } | ||
| 568 | #endif | ||
| 569 | } | ||
| 570 | |||
| 571 | inline bool CRegExp::IsValidSubNumber(int iSub) const | ||
| 572 | { | ||
| 573 | return iSub >= 0 && iSub <= m_iMatchCount && iSub <= m_MaxNumOfBackrefrences; | ||
| 574 | } | ||
| 575 | |||
| 576 | |||
| 577 | bool CRegExp::IsUtf8Supported(void) | ||
| 578 | { | ||
| 579 | if (m_Utf8Supported == -1) | ||
| 580 | { | ||
| 581 | if (pcre_config(PCRE_CONFIG_UTF8, &m_Utf8Supported) != 0) | ||
| 582 | m_Utf8Supported = 0; | ||
| 583 | } | ||
| 584 | |||
| 585 | return m_Utf8Supported == 1; | ||
| 586 | } | ||
| 587 | |||
| 588 | bool CRegExp::AreUnicodePropertiesSupported(void) | ||
| 589 | { | ||
| 590 | #if defined(PCRE_CONFIG_UNICODE_PROPERTIES) && PCRE_UCP != 0 | ||
| 591 | if (m_UcpSupported == -1) | ||
| 592 | { | ||
| 593 | if (pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &m_UcpSupported) != 0) | ||
| 594 | m_UcpSupported = 0; | ||
| 595 | } | ||
| 596 | #endif | ||
| 597 | |||
| 598 | return m_UcpSupported == 1; | ||
| 599 | } | ||
| 600 | |||
| 601 | bool CRegExp::LogCheckUtf8Support(void) | ||
| 602 | { | ||
| 603 | bool utf8FullSupport = true; | ||
| 604 | |||
| 605 | if (!CRegExp::IsUtf8Supported()) | ||
| 606 | { | ||
| 607 | utf8FullSupport = false; | ||
| 608 | CLog::Log(LOGWARNING, "UTF-8 is not supported in PCRE lib, support for national symbols is limited!"); | ||
| 609 | } | ||
| 610 | |||
| 611 | if (!CRegExp::AreUnicodePropertiesSupported()) | ||
| 612 | { | ||
| 613 | utf8FullSupport = false; | ||
| 614 | CLog::Log(LOGWARNING, "Unicode properties are not enabled in PCRE lib, support for national symbols may be limited!"); | ||
| 615 | } | ||
| 616 | |||
| 617 | if (!utf8FullSupport) | ||
| 618 | { | ||
| 619 | CLog::Log(LOGINFO, | ||
| 620 | "Consider installing PCRE lib version 8.10 or later with enabled Unicode properties " | ||
| 621 | "and UTF-8 support. Your PCRE lib version: %s", | ||
| 622 | PCRE::pcre_version()); | ||
| 623 | #if PCRE_UCP == 0 | ||
| 624 | CLog::Log(LOGINFO, "You will need to rebuild XBMC after PCRE lib update."); | ||
| 625 | #endif | ||
| 626 | } | ||
| 627 | |||
| 628 | return utf8FullSupport; | ||
| 629 | } | ||
| 630 | |||
| 631 | bool CRegExp::IsJitSupported(void) | ||
| 632 | { | ||
| 633 | if (m_JitSupported == -1) | ||
| 634 | { | ||
| 635 | #ifdef PCRE_HAS_JIT_CODE | ||
| 636 | if (pcre_config(PCRE_CONFIG_JIT, &m_JitSupported) != 0) | ||
| 637 | #endif | ||
| 638 | m_JitSupported = 0; | ||
| 639 | } | ||
| 640 | |||
| 641 | return m_JitSupported == 1; | ||
| 642 | } | ||
