sync with upstream

author: manuel <manuel@mausz.at> 2020-10-19 00:52:24 +0200
committer: manuel <manuel@mausz.at> 2020-10-19 00:52:24 +0200
commit: be933ef2241d79558f91796cc5b3a161f72ebf9c (patch)
tree: fe3ab2f130e20c99001f2d7a81d610c78c96a3f4 /xbmc/utils/RegExp.cpp
parent: 5f8335c1e49ce108ef3481863833c98efa00411b (diff)
download: kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.gz
kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.bz2
kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.zip
1 files changed, 642 insertions, 0 deletions
diff --git a/xbmc/utils/RegExp.cpp b/xbmc/utils/RegExp.cpp
new file mode 100644
index 0000000..b6fe9d5
--- /dev/null
+++ b/xbmc/utils/RegExp.cpp
@@ -0,0 +1,642 @@
+/*
+ *  Copyright (C) 2005-2018 Team Kodi
+ *  This file is part of Kodi - https://kodi.tv
+ *
+ *  SPDX-License-Identifier: GPL-2.0-or-later
+ *  See LICENSES/README.md for more information.
+ */
+#include "RegExp.h"
+#include "log.h"
+#include "utils/StringUtils.h"
+#include "utils/Utf8Utils.h"
+#include <algorithm>
+#include <stdlib.h>
+#include <string.h>
+using namespace PCRE;
+#ifndef PCRE_UCP // fallback for PCRE headers that do not define PCRE_UCP
+#define PCRE_UCP 0 // zero turns "options |= PCRE_UCP" into a no-op
+#endif // PCRE_UCP
+#ifdef PCRE_CONFIG_JIT // headers know about JIT: enable the JIT-only code paths in this file
+#define PCRE_HAS_JIT_CODE 1
+#endif
+#ifndef PCRE_STUDY_JIT_COMPILE // fallback: zero adds no option to pcre_study()
+#define PCRE_STUDY_JIT_COMPILE 0
+#endif
+#ifndef PCRE_INFO_JIT
+// some unused number
+#define PCRE_INFO_JIT 2048
+#endif
+#ifndef PCRE_HAS_JIT_CODE // without JIT code, study data is released with plain pcre_free()
+#define pcre_free_study(x) pcre_free((x))
+#endif
+int CRegExp::m_Utf8Supported = -1; // -1: not queried yet, see IsUtf8Supported()
+int CRegExp::m_UcpSupported  = -1; // -1: not queried yet, see AreUnicodePropertiesSupported()
+int CRegExp::m_JitSupported  = -1; // -1: not queried yet, see IsJitSupported()
+// Creates a regular expression object without a compiled pattern.
+// 'caseless' and 'utf8' are only recorded here (via InitValues); a pattern
+// has to be supplied later through RegComp().
+CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
+{
+  // every member is set up by the shared initializer
+  this->InitValues(caseless, utf8);
+}
+// Puts every member into its "nothing compiled, nothing matched" state and
+// derives the PCRE compile options from 'caseless' and 'utf8'.
+void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
+{
+  // no compiled pattern, no study data, no JIT stack
+  m_re          = NULL;
+  m_sd          = NULL;
+  m_jitStack    = NULL;
+  m_jitCompiled = false;
+  // no match results
+  m_bMatched    = false;
+  m_iMatchCount = 0;
+  m_offset      = 0;
+  memset(m_iOvector, 0, sizeof(m_iOvector));
+  // compile options: DOTALL and NEWLINE_ANY are always on
+  m_utf8Mode = utf8;
+  m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY;
+  if (caseless)
+  {
+    m_iOptions |= PCRE_CASELESS;
+  }
+  if (utf8 != forceUtf8)
+    return;
+  // UTF-8 and Unicode property flags are requested only when the library supports them
+  if (IsUtf8Supported())
+  {
+    m_iOptions |= PCRE_UTF8;
+  }
+  if (AreUnicodePropertiesSupported())
+  {
+    m_iOptions |= PCRE_UCP;
+  }
+}
+// Creates a regular expression object and immediately compiles 're'.
+// With 'autoUtf8' the pattern is inspected by requireUtf8() and the mode is
+// resolved to forceUtf8 or asciiOnly before the options are initialized.
+// A NULL 're' is tolerated: the mode resolves to asciiOnly and RegComp()
+// rejects the pattern, leaving the object uncompiled.
+CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/)
+{
+  if (utf8 == autoUtf8)
+  {
+    // requireUtf8() takes a std::string, which must never be built from NULL
+    utf8 = (re != NULL && requireUtf8(re)) ? forceUtf8 : asciiOnly;
+  }
+  InitValues(caseless, utf8);
+  RegComp(re, study);
+}
+// Decides whether 'regexp' needs UTF-8 / Unicode-property mode.
+// Returns true when the string contains UTF-8 multibyte sequences, an explicit
+// Unicode property escape (\p, \P, \X) or a character code above 0xFF written
+// as \x{hhh..}. \w, \s, \d and friends alone never enable Unicode mode here.
+// Returns false otherwise, including when scanning stops on an unterminated
+// "\Q", comment or character class.
+bool CRegExp::requireUtf8(const std::string& regexp)
+{
+  if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string)
+    return true; // pattern itself is written with multibyte UTF-8
+  const char* const pattern = regexp.c_str();
+  const size_t patternLen = regexp.length();
+  for (size_t idx = 0; idx < patternLen; ++idx)
+  {
+    switch (pattern[idx])
+    {
+    case '\\':
+      switch (pattern[idx + 1])
+      {
+      case 'p':
+      case 'P':
+      case 'X':
+        return true; // explicit Unicode property
+      case 'Q':
+        idx = regexp.find("\\E", idx + 2); // jump over the literal run "\Q...\E"
+        break;
+      case 'x':
+        // only the braced form "\x{hh..}" can carry a code above 0xFF
+        if (pattern[idx + 2] == '{' && readCharXCode(regexp, idx) >= 0x100)
+          return true;
+        break;
+      case '\\':
+      case '(':
+      case ')':
+      case '[':
+      case ']':
+        ++idx; // escaped metacharacter: do not analyze it
+        break;
+      default:
+        break;
+      }
+      break;
+    case '(':
+      if (pattern[idx + 1] == '?' && pattern[idx + 2] == '#')
+        idx = regexp.find(')', idx); // skip the "(?#...)" comment
+      break;
+    case '[':
+      if (isCharClassWithUnicode(regexp, idx))
+        return true;
+      break;
+    default:
+      break;
+    }
+    // find() and isCharClassWithUnicode() report "no terminator" through npos
+    if (idx == std::string::npos)
+      return false;
+  }
+  return false; // nothing in the pattern asks for Unicode
+}
+// Reads a hexadecimal character code written as "\x{hh..}".
+// 'pos' must point to the backslash.
+// Returns:
+//   -1  when 'pos' is out of range or does not point to "\x{" ('pos' unchanged);
+//    0  when there is no closing '}' or a non-hex digit is found between the
+//       braces ('pos' is left on the 'x');
+//   the character code otherwise ('pos' is left on the closing '}').
+// Codes beyond the Unicode range stop accumulating, so very long digit runs
+// cannot overflow 'int'; the returned value is then some number above 0x10FFFF.
+inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos)
+{
+  if (pos >= regexp.length())
+    return -1;
+  const char* const regexpC = regexp.c_str();
+  if (regexpC[pos] != '\\' || regexpC[pos + 1] != 'x' || regexpC[pos + 2] != '{')
+    return -1;
+  pos++;
+  const size_t startPos = pos; // 'startPos' points to 'x'
+  const size_t closingBracketPos = regexp.find('}', startPos + 2);
+  if (closingBracketPos == std::string::npos)
+    return 0; // return character zero code, leave 'pos' at 'x'
+  pos++; // 'pos' points to '{'
+  static const int maxUnicodeCode = 0x10FFFF;
+  int chCode = 0;
+  while (++pos < closingBracketPos)
+  {
+    const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]);
+    if (xdigitVal < 0)
+    { // found non-hexdigit
+      pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code
+      return 0; // return character zero code
+    }
+    // once above the Unicode range further digits are only validated, not
+    // accumulated: this keeps 'chCode' far away from signed overflow
+    if (chCode <= maxUnicodeCode)
+      chCode = chCode * 16 + xdigitVal;
+  }
+  return chCode;
+}
+// Scans one character class (like "[a-h45]") starting at 'pos', which must
+// point to the opening '['.
+// Returns true when the class contains a Unicode property escape (\p, \P, \X)
+// or a character code above 0xFF written as "\x{hhh..}".
+// On return 'pos' points to the terminating ']' of the class, or is set to
+// std::string::npos when the class (or a "\Q" run inside it) is not terminated.
+// Nested POSIX classes like "[[:lower:]]" and escaped characters like "\]",
+// "\[" and "\\" are skipped, so that "[\\]" ends at its ']' instead of
+// treating the second backslash as an escape for the bracket.
+bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos)
+{
+  const char* const regexpC = regexp.c_str();
+  const size_t len = regexp.length();
+  if (pos >= len || regexpC[pos] != '[')
+    return false;
+  bool needUnicode = false;
+  while (++pos < len)
+  {
+    if (regexpC[pos] == '[' && regexpC[pos + 1] == ':')
+    { // possible POSIX character class, like "[:alpha:]"
+      const size_t nextClosingBracketPos = regexp.find(']', pos + 2); // don't care about "\]", as it produces an error if used inside a POSIX char class
+      if (nextClosingBracketPos == std::string::npos)
+      { // error in regexp: no closing ']' for character class
+        pos = std::string::npos;
+        return needUnicode;
+      }
+      else if (regexpC[nextClosingBracketPos - 1] == ':')
+        pos = nextClosingBracketPos; // skip POSIX character class
+      // if ":]" is not found, process "[:..." as part of normal character class
+    }
+    else if (regexpC[pos] == ']')
+      return needUnicode; // end of character class
+    else if (regexpC[pos] == '\\')
+    {
+      const char nextChar = regexpC[pos + 1];
+      if (nextChar == ']' || nextChar == '[' || nextChar == '\\')
+        pos++; // skip the escaped character, it can neither open nor close anything
+      else if (nextChar == 'Q')
+      {
+        pos = regexp.find("\\E", pos + 2);
+        if (pos == std::string::npos)
+          return needUnicode; // error in regexp: no closing "\E" after "\Q" in character class
+        else
+          pos++; // skip "\E"
+      }
+      else if (nextChar == 'p' || nextChar == 'P' || nextChar == 'X')
+        needUnicode = true; // don't care about property name as it can contain only ASCII chars
+      else if (nextChar == 'x')
+      {
+        // readCharXCode() itself verifies the "\x{" form and moves 'pos'
+        if (readCharXCode(regexp, pos) >= 0x100)
+          needUnicode = true;
+      }
+    }
+  }
+  pos = std::string::npos; // closing square bracket was not found
+  return needUnicode;
+}
+// Copy constructor.
+// Brings every member into a defined state first and then delegates to the
+// assignment operator. The explicit initialization matters because the
+// assignment operator calls Cleanup() on the pointers and, when 're' holds no
+// compiled pattern, does not touch the match state at all.
+CRegExp::CRegExp(const CRegExp& re)
+{
+  m_re = NULL;
+  m_sd = NULL;
+  m_jitStack = NULL;
+  m_utf8Mode = re.m_utf8Mode;
+  m_iOptions = re.m_iOptions;
+  // match state defaults, overwritten by the assignment when 're' is compiled
+  m_offset = 0;
+  m_jitCompiled = false;
+  m_bMatched = false;
+  m_iMatchCount = 0;
+  memset(m_iOvector, 0, sizeof(m_iOvector));
+  *this = re;
+}
+// Copy assignment.
+// Releases the current pattern, then copies the configuration (pattern text,
+// UTF-8 mode, compile options) and, when 're' holds a compiled pattern, a
+// byte copy of that pattern together with the results of its last match.
+// Study data and JIT code are not copied; the copy is never JIT compiled.
+// When 're' has no compiled pattern (or it cannot be duplicated) this object
+// is left uncompiled with an empty match state.
+// Self-assignment is a no-op.
+CRegExp& CRegExp::operator=(const CRegExp& re)
+{
+  // Cleanup() below would free the very pattern we are about to copy
+  if (this == &re)
+    return *this;
+  Cleanup();
+  m_jitCompiled = false;
+  m_pattern = re.m_pattern;
+  m_utf8Mode = re.m_utf8Mode;
+  m_iOptions = re.m_iOptions;
+  // results of an earlier match belong to the pattern that was just released
+  m_offset = 0;
+  m_bMatched = false;
+  m_iMatchCount = 0;
+  m_subject.clear();
+  memset(m_iOvector, 0, sizeof(m_iOvector));
+  if (!re.m_re)
+    return *this;
+  size_t size = 0;
+  if (pcre_fullinfo(re.m_re, NULL, PCRE_INFO_SIZE, &size) < 0)
+    return *this;
+  m_re = (pcre*)malloc(size);
+  if (!m_re)
+  {
+    CLog::Log(LOGFATAL, "%s: Failed to allocate memory", __FUNCTION__);
+    return *this;
+  }
+  memcpy(m_re, re.m_re, size);
+  memcpy(m_iOvector, re.m_iOvector, OVECCOUNT*sizeof(int));
+  m_offset = re.m_offset;
+  m_iMatchCount = re.m_iMatchCount;
+  m_bMatched = re.m_bMatched;
+  m_subject = re.m_subject;
+  return *this;
+}
+// Destructor: hands all owned PCRE resources back through Cleanup().
+CRegExp::~CRegExp()
+{
+  this->Cleanup();
+}
+// Compiles the pattern 're', replacing any previously compiled one.
+// With 'study' set, the compiled pattern is additionally studied, and JIT
+// compiled when StudyWithJitComp is requested and JIT is supported.
+// Returns false when 're' is NULL or does not compile; a failed study is only
+// logged and still returns true.
+bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/)
+{
+  if (re == NULL)
+    return false;
+  // results of earlier matches are meaningless for the new pattern
+  m_iMatchCount = 0;
+  m_bMatched    = false;
+  m_jitCompiled = false;
+  m_offset      = 0;
+  int compileFlags = m_iOptions;
+  if (m_utf8Mode == autoUtf8 && requireUtf8(re))
+  {
+    // auto mode: Unicode features are switched on per pattern
+    const bool utf8Available = IsUtf8Supported();
+    const bool ucpAvailable = AreUnicodePropertiesSupported();
+    if (utf8Available)
+      compileFlags |= PCRE_UTF8;
+    if (ucpAvailable)
+      compileFlags |= PCRE_UCP;
+  }
+  Cleanup();
+  const char *pcreMessage = NULL;
+  int failOffset = 0;
+  m_re = pcre_compile(re, compileFlags, &pcreMessage, &failOffset, NULL);
+  if (m_re == NULL)
+  {
+    m_pattern.clear();
+    CLog::Log(LOGERROR, "PCRE: %s. Compilation failed at offset %d in expression '%s'",
+              pcreMessage, failOffset, re);
+    return false;
+  }
+  m_pattern = re;
+  if (!study)
+    return true; // caller did not ask for a study pass
+  const bool wantJit = (study == StudyWithJitComp) && IsJitSupported();
+  m_sd = pcre_study(m_re, wantJit ? PCRE_STUDY_JIT_COMPILE : 0, &pcreMessage);
+  if (pcreMessage != NULL)
+  {
+    CLog::Log(LOGWARNING, "%s: PCRE error \"%s\" while studying expression", __FUNCTION__, pcreMessage);
+    if (m_sd != NULL)
+    {
+      pcre_free_study(m_sd);
+      m_sd = NULL;
+    }
+    return true;
+  }
+  if (wantJit)
+  {
+    // ask the library whether JIT code was really produced
+    int jitPresent = 0;
+    const int infoResult = pcre_fullinfo(m_re, m_sd, PCRE_INFO_JIT, &jitPresent);
+    m_jitCompiled = (infoResult == 0 && jitPresent == 1);
+  }
+  return true;
+}
+// Searches the null-terminated string 'str' for the compiled pattern.
+// Thin wrapper that measures 'str' and forwards to PrivateRegFind(), whose
+// result is returned unchanged.
+// A NULL 'str' is passed on with length zero so that PrivateRegFind() can
+// report it, instead of crashing in strlen().
+int CRegExp::RegFind(const char *str, unsigned int startoffset /*= 0*/, int maxNumberOfCharsToTest /*= -1*/)
+{
+  const size_t length = (str != NULL) ? strlen(str) : 0;
+  return PrivateRegFind(length, str, startoffset, maxNumberOfCharsToTest);
+}
+// Runs the compiled pattern against 'str'.
+//   bufferLen              number of bytes available in 'str'
+//   startoffset            byte offset in 'str' where the search starts
+//   maxNumberOfCharsToTest when >= 0, limits the number of bytes examined
+//                          after 'startoffset'
+// The examined part of 'str' is copied into m_subject, so capture offsets in
+// m_iOvector are relative to 'startoffset' (kept in m_offset).
+// Returns the position of the match in 'str', or -1 when nothing matched or
+// an error occurred (errors other than "no match" are logged).
+int CRegExp::PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset /* = 0*/, int maxNumberOfCharsToTest /*= -1*/)
+{
+  m_offset      = 0;
+  m_bMatched    = false;
+  m_iMatchCount = 0;
+  if (!m_re)
+  {
+    CLog::Log(LOGERROR, "PCRE: Called before compilation");
+    return -1;
+  }
+  if (!str)
+  {
+    CLog::Log(LOGERROR, "PCRE: Called without a string to match");
+    return -1;
+  }
+  if (startoffset > bufferLen)
+  {
+    CLog::Log(LOGERROR, "%s: startoffset is beyond end of string to match", __FUNCTION__);
+    return -1;
+  }
+#ifdef PCRE_HAS_JIT_CODE
+  // the JIT stack is created lazily on the first search with a JIT compiled pattern
+  if (m_jitCompiled && !m_jitStack)
+  {
+    m_jitStack = pcre_jit_stack_alloc(32*1024, 512*1024);
+    if (m_jitStack == NULL)
+      CLog::Log(LOGWARNING, "%s: can't allocate address space for JIT stack", __FUNCTION__);
+    pcre_assign_jit_stack(m_sd, NULL, m_jitStack);
+  }
+#endif
+  // work with the remaining length rather than an end position, so that
+  // "startoffset + maxNumberOfCharsToTest" can never wrap around
+  size_t subjectLen = bufferLen - startoffset;
+  if (maxNumberOfCharsToTest >= 0)
+    subjectLen = std::min<size_t>(subjectLen, static_cast<size_t>(maxNumberOfCharsToTest));
+  m_subject.assign(str + startoffset, subjectLen);
+  // hand the study data (NULL when the pattern was not studied) to the matcher;
+  // without it neither the study results nor the JIT code would ever be used
+  int rc = pcre_exec(m_re, m_sd, m_subject.c_str(), m_subject.length(), 0, 0, m_iOvector, OVECCOUNT);
+  if (rc<1)
+  {
+    static const int fragmentLen = 80; // length of excerpt before erroneous char for log
+    switch(rc)
+    {
+    case PCRE_ERROR_NOMATCH:
+      return -1;
+    case PCRE_ERROR_MATCHLIMIT:
+      CLog::Log(LOGERROR, "PCRE: Match limit reached");
+      return -1;
+#ifdef PCRE_ERROR_SHORTUTF8
+    case PCRE_ERROR_SHORTUTF8:
+      {
+        const size_t startPos = (m_subject.length() > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_subject.length() - fragmentLen) : 0;
+        if (startPos != std::string::npos)
+          CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string. Text before bad character: \"%s\"", m_subject.substr(startPos).c_str());
+        else
+          CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string");
+        return -1;
+      }
+#endif
+    case PCRE_ERROR_BADUTF8:
+      {
+        // the log messages below use m_iOvector[0] as the position of the bad character and m_iOvector[1] as the error code
+        const size_t startPos = (m_iOvector[0] > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_iOvector[0] - fragmentLen) : 0;
+        if (m_iOvector[0] >= 0 && startPos != std::string::npos)
+          CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: %d, position: %d. Text before bad char: \"%s\"", m_iOvector[1], m_iOvector[0], m_subject.substr(startPos, m_iOvector[0] - startPos + 1).c_str());
+        else
+          CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: %d, position: %d", m_iOvector[1], m_iOvector[0]);
+        return -1;
+      }
+    case PCRE_ERROR_BADUTF8_OFFSET:
+      CLog::Log(LOGERROR, "PCRE: Offset is pointing to the middle of UTF-8 character");
+      return -1;
+    default:
+      CLog::Log(LOGERROR, "PCRE: Unknown error: %d", rc);
+      return -1;
+    }
+  }
+  m_offset = startoffset;
+  m_bMatched = true;
+  m_iMatchCount = rc;
+  return m_iOvector[0] + m_offset;
+}
+// Asks PCRE for the number of capturing subpatterns of the compiled pattern.
+// Returns -1 when no pattern is compiled.
+int CRegExp::GetCaptureTotal() const
+{
+  int captureCount = -1;
+  if (m_re == NULL)
+    return captureCount;
+  pcre_fullinfo(m_re, NULL, PCRE_INFO_CAPTURECOUNT, &captureCount);
+  return captureCount;
+}
+// Expands the replacement expression 'sReplaceExp' using the last match.
+//   "&"            is replaced by the whole match
+//   "\0" - "\99"   are replaced by the corresponding captured subpattern
+//   "\&" and "\\"  produce a literal '&' and '\'
+//   a backslash before any other character is dropped, the character is kept
+// Returns an empty string when there was no successful match or the
+// expression is empty.
+std::string CRegExp::GetReplaceString(const std::string& sReplaceExp) const
+{
+  if (!m_bMatched || sReplaceExp.empty())
+    return "";
+  const char* const expr = sReplaceExp.c_str();
+  size_t pos = sReplaceExp.find_first_of("\\&");
+  std::string result(sReplaceExp, 0, pos);
+  result.reserve(sReplaceExp.size()); // very rough estimate
+  while(pos != std::string::npos)
+  {
+    if (expr[pos] == '\\')
+    {
+      // string is null-terminated and current char isn't null, so it's safe to advance to next char
+      pos++; // advance to next char
+      const char nextChar = expr[pos];
+      if (nextChar == '&' || nextChar == '\\')
+      { // this is "\&" or "\\" combination
+        result.push_back(nextChar); // add '&' or '\' to result
+        pos++;
+      }
+      // plain range checks instead of isdigit(): passing a negative 'char'
+      // (any byte of a UTF-8 multibyte sequence) to isdigit() is undefined
+      else if (nextChar >= '0' && nextChar <= '9')
+      { // this is "\0" - "\9" combination
+        int subNum = nextChar - '0';
+        pos++; // advance to second next char
+        const char secondNextChar = expr[pos];
+        if (secondNextChar >= '0' && secondNextChar <= '9')
+        { // this is "\00" - "\99" combination
+          subNum = subNum * 10 + (secondNextChar - '0');
+          pos++;
+        }
+        result.append(GetMatch(subNum));
+      }
+    }
+    else
+    { // '&' char
+      result.append(GetMatch(0));
+      pos++;
+    }
+    // copy the literal text up to the next special character (or to the end)
+    const size_t nextPos = sReplaceExp.find_first_of("\\&", pos);
+    result.append(sReplaceExp, pos, nextPos - pos);
+    pos = nextPos;
+  }
+  return result;
+}
+// Start position of subpattern 'iSub' (0 is the whole match), shifted by the
+// start offset of the last search. Returns -1 for an invalid subpattern number.
+int CRegExp::GetSubStart(int iSub) const
+{
+  if (IsValidSubNumber(iSub))
+  {
+    const int startSlot = iSub * 2; // each subpattern owns a (start, end) pair
+    return m_iOvector[startSlot] + m_offset;
+  }
+  return -1;
+}
+// Same as GetSubStart(int), with the subpattern addressed by its name.
+int CRegExp::GetSubStart(const std::string& subName) const
+{
+  const int subNumber = GetNamedSubPatternNumber(subName.c_str());
+  return GetSubStart(subNumber);
+}
+// Length of subpattern 'iSub' (0 is the whole match) in the last match.
+// Returns -1 for an invalid subpattern number.
+int CRegExp::GetSubLength(int iSub) const
+{
+  if (IsValidSubNumber(iSub))
+  {
+    const int startSlot = iSub * 2; // each subpattern owns a (start, end) pair
+    return m_iOvector[startSlot + 1] - m_iOvector[startSlot];
+  }
+  return -1;
+}
+// Same as GetSubLength(int), with the subpattern addressed by its name.
+int CRegExp::GetSubLength(const std::string& subName) const
+{
+  const int subNumber = GetNamedSubPatternNumber(subName.c_str());
+  return GetSubLength(subNumber);
+}
+// Text captured by subpattern 'iSub' (0 is the whole match) in the last match.
+// Returns an empty string for an invalid subpattern number, for a subpattern
+// with a negative start position and for an empty capture.
+std::string CRegExp::GetMatch(int iSub /* = 0 */) const
+{
+  if (IsValidSubNumber(iSub))
+  {
+    const int startSlot = iSub * 2; // each subpattern owns a (start, end) pair
+    const int begin = m_iOvector[startSlot];
+    const int length = m_iOvector[startSlot + 1] - begin;
+    if (begin >= 0 && length > 0)
+      return m_subject.substr(begin, length);
+  }
+  return "";
+}
+// Same as GetMatch(int), with the subpattern addressed by its name.
+std::string CRegExp::GetMatch(const std::string& subName) const
+{
+  const int subNumber = GetNamedSubPatternNumber(subName.c_str());
+  return GetMatch(subNumber);
+}
+// Fetches the text captured by the named subpattern 'strName' into 'strMatch'.
+// 'strMatch' is always cleared first. Returns false (leaving it empty) when
+// the name does not resolve to a valid subpattern number.
+bool CRegExp::GetNamedSubPattern(const char* strName, std::string& strMatch) const
+{
+  strMatch.clear();
+  const int subNumber = pcre_get_stringnumber(m_re, strName);
+  const bool known = IsValidSubNumber(subNumber);
+  if (known)
+    strMatch = GetMatch(subNumber);
+  return known;
+}
+// Translates a subpattern name into its number by asking PCRE; the library's
+// result is returned as is.
+int CRegExp::GetNamedSubPatternNumber(const char* strName) const
+{
+  const int subNumber = pcre_get_stringnumber(m_re, strName);
+  return subNumber;
+}
+// Writes the offset pairs of the last match to the log at level 'iLog',
+// formatted as "{[start,end],[start,end],...}".
+// Does nothing when 'iLog' lies outside LOGDEBUG..LOGNONE.
+void CRegExp::DumpOvector(int iLog /* = LOGDEBUG */)
+{
+  const bool levelInRange = (iLog >= LOGDEBUG && iLog <= LOGNONE);
+  if (!levelInRange)
+    return;
+  const int lastSub = GetSubCount(); // past the subpatterns is junk
+  std::string dump("{");
+  for (int sub = 0; sub <= lastSub; ++sub)
+  {
+    if (sub > 0)
+      dump += ","; // separator between pairs, none after the last one
+    dump += StringUtils::Format("[%i,%i]", m_iOvector[sub * 2], m_iOvector[sub * 2 + 1]);
+  }
+  dump += "}";
+  CLog::Log(iLog, "regexp ovector=%s", dump.c_str());
+}
+// Releases the compiled pattern, its study data and (when built with JIT
+// support) the JIT stack, resetting each pointer to NULL afterwards.
+// Safe to call repeatedly.
+void CRegExp::Cleanup()
+{
+  if (m_re != NULL)
+    pcre_free(m_re);
+  m_re = NULL;
+  if (m_sd != NULL)
+    pcre_free_study(m_sd);
+  m_sd = NULL;
+#ifdef PCRE_HAS_JIT_CODE
+  if (m_jitStack != NULL)
+    pcre_jit_stack_free(m_jitStack);
+  m_jitStack = NULL;
+#endif
+}
+// True when 'iSub' is non-negative and exceeds neither the match count of the
+// last search nor the maximum number of back references.
+inline bool CRegExp::IsValidSubNumber(int iSub) const
+{
+  if (iSub < 0)
+    return false;
+  if (iSub > m_iMatchCount)
+    return false;
+  return iSub <= m_MaxNumOfBackrefrences;
+}
+// Tells whether the PCRE library reports UTF-8 support.
+// The library is queried on the first call only; the answer is kept in
+// m_Utf8Supported. A failing query counts as "not supported".
+bool CRegExp::IsUtf8Supported(void)
+{
+  if (m_Utf8Supported != -1)
+    return m_Utf8Supported == 1; // already known
+  const int queryResult = pcre_config(PCRE_CONFIG_UTF8, &m_Utf8Supported);
+  if (queryResult != 0)
+    m_Utf8Supported = 0;
+  return m_Utf8Supported == 1;
+}
+// Tells whether the PCRE library reports Unicode properties support.
+// The library is queried on the first call only; the answer is kept in
+// m_UcpSupported. When the build lacks PCRE_CONFIG_UNICODE_PROPERTIES or
+// PCRE_UCP, no query is made and the result is always false.
+bool CRegExp::AreUnicodePropertiesSupported(void)
+{
+#if defined(PCRE_CONFIG_UNICODE_PROPERTIES) && PCRE_UCP != 0
+  const bool notQueriedYet = (m_UcpSupported == -1);
+  if (notQueriedYet)
+  {
+    const int queryResult = pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &m_UcpSupported);
+    if (queryResult != 0)
+      m_UcpSupported = 0; // a failing query counts as "not supported"
+  }
+#endif
+  return m_UcpSupported == 1;
+}
+// Checks UTF-8 and Unicode properties support of the PCRE library and logs a
+// warning for each missing feature, followed by an upgrade hint.
+// Returns true only when both features are available.
+bool CRegExp::LogCheckUtf8Support(void)
+{
+  const bool hasUtf8 = CRegExp::IsUtf8Supported();
+  if (!hasUtf8)
+    CLog::Log(LOGWARNING, "UTF-8 is not supported in PCRE lib, support for national symbols is limited!");
+  const bool hasUcp = CRegExp::AreUnicodePropertiesSupported();
+  if (!hasUcp)
+    CLog::Log(LOGWARNING, "Unicode properties are not enabled in PCRE lib, support for national symbols may be limited!");
+  if (hasUtf8 && hasUcp)
+    return true;
+  // at least one feature is missing: tell the user how to get it
+  CLog::Log(LOGINFO,
+            "Consider installing PCRE lib version 8.10 or later with enabled Unicode properties "
+            "and UTF-8 support. Your PCRE lib version: %s",
+            PCRE::pcre_version());
+#if PCRE_UCP == 0
+  CLog::Log(LOGINFO, "You will need to rebuild XBMC after PCRE lib update.");
+#endif
+  return false;
+}
+// Tells whether JIT compilation is available.
+// The answer is determined on the first call only and kept in m_JitSupported.
+// Builds without JIT code, and failing library queries, count as "not supported".
+bool CRegExp::IsJitSupported(void)
+{
+  if (m_JitSupported == -1)
+  {
+#ifdef PCRE_HAS_JIT_CODE
+    const bool queryFailed = (pcre_config(PCRE_CONFIG_JIT, &m_JitSupported) != 0);
+#else
+    const bool queryFailed = true; // nothing to query in this build
+#endif
+    if (queryFailed)
+      m_JitSupported = 0;
+  }
+  return m_JitSupported == 1;
+}
author	manuel <manuel@mausz.at>	2020-10-19 00:52:24 +0200
committer	manuel <manuel@mausz.at>	2020-10-19 00:52:24 +0200
commit	be933ef2241d79558f91796cc5b3a161f72ebf9c (patch)
tree	fe3ab2f130e20c99001f2d7a81d610c78c96a3f4 /xbmc/utils/RegExp.cpp
parent	5f8335c1e49ce108ef3481863833c98efa00411b (diff)
download	kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.gz kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.bz2 kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.zip

diff --git a/xbmc/utils/RegExp.cpp b/xbmc/utils/RegExp.cpp new file mode 100644 index 0000000..b6fe9d5 --- /dev/null +++ b/xbmc/utils/RegExp.cpp
@@ -0,0 +1,642 @@
	1	/*
	2	* Copyright (C) 2005-2018 Team Kodi
	3	* This file is part of Kodi - https://kodi.tv
	4	*
	5	* SPDX-License-Identifier: GPL-2.0-or-later
	6	* See LICENSES/README.md for more information.
	7	*/
	8
	9	#include "RegExp.h"
	10
	11	#include "log.h"
	12	#include "utils/StringUtils.h"
	13	#include "utils/Utf8Utils.h"
	14
	15	#include <algorithm>
	16	#include <stdlib.h>
	17	#include <string.h>
	18
	19	using namespace PCRE;
	20
	21	#ifndef PCRE_UCP
	22	#define PCRE_UCP 0
	23	#endif // PCRE_UCP
	24
	25	#ifdef PCRE_CONFIG_JIT
	26	#define PCRE_HAS_JIT_CODE 1
	27	#endif
	28
	29	#ifndef PCRE_STUDY_JIT_COMPILE
	30	#define PCRE_STUDY_JIT_COMPILE 0
	31	#endif
	32	#ifndef PCRE_INFO_JIT
	33	// some unused number
	34	#define PCRE_INFO_JIT 2048
	35	#endif
	36	#ifndef PCRE_HAS_JIT_CODE
	37	#define pcre_free_study(x) pcre_free((x))
	38	#endif
	39
	40	int CRegExp::m_Utf8Supported = -1;
	41	int CRegExp::m_UcpSupported = -1;
	42	int CRegExp::m_JitSupported = -1;
	43
	44
	45	CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
	46	{
	47	InitValues(caseless, utf8);
	48	}
	49
	50	void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
	51	{
	52	m_utf8Mode = utf8;
	53	m_re = NULL;
	54	m_sd = NULL;
	55	m_iOptions = PCRE_DOTALL \| PCRE_NEWLINE_ANY;
	56	if(caseless)
	57	m_iOptions \|= PCRE_CASELESS;
	58	if (m_utf8Mode == forceUtf8)
	59	{
	60	if (IsUtf8Supported())
	61	m_iOptions \|= PCRE_UTF8;
	62	if (AreUnicodePropertiesSupported())
	63	m_iOptions \|= PCRE_UCP;
	64	}
	65
	66	m_offset = 0;
	67	m_jitCompiled = false;
	68	m_bMatched = false;
	69	m_iMatchCount = 0;
	70	m_jitStack = NULL;
	71
	72	memset(m_iOvector, 0, sizeof(m_iOvector));
	73	}
	74
	75	CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/)
	76	{
	77	if (utf8 == autoUtf8)
	78	utf8 = requireUtf8(re) ? forceUtf8 : asciiOnly;
	79
	80	InitValues(caseless, utf8);
	81	RegComp(re, study);
	82	}
	83
	84	bool CRegExp::requireUtf8(const std::string& regexp)
	85	{
	86	// enable UTF-8 mode if regexp string has UTF-8 multibyte sequences
	87	if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string)
	88	return true;
	89
	90	// check for explicit Unicode Properties (\p, \P, \X) and for Unicode character codes (greater than 0xFF) in form \x{hhh..}
	91	// note: PCRE change meaning of \w, \s, \d (and \W, \S, \D) when Unicode Properties are enabled,
	92	// but in auto mode we enable UNP for US-ASCII regexp only if regexp contains explicit \p, \P, \X or Unicode character code
	93	const char* const regexpC = regexp.c_str();
	94	const size_t len = regexp.length();
	95	size_t pos = 0;
	96
	97	while (pos < len)
	98	{
	99	const char chr = regexpC[pos];
	100	if (chr == '\\')
	101	{
	102	const char nextChr = regexpC[pos + 1];
	103
	104	if (nextChr == 'p' \|\| nextChr == 'P' \|\| nextChr == 'X')
	105	return true; // found Unicode Properties
	106	else if (nextChr == 'Q')
	107	pos = regexp.find("\\E", pos + 2); // skip all literals in "\Q...\E"
	108	else if (nextChr == 'x' && regexpC[pos + 2] == '{')
	109	{ // Unicode character with hex code
	110	if (readCharXCode(regexp, pos) >= 0x100)
	111	return true; // found Unicode character code
	112	}
	113	else if (nextChr == '\\' \|\| nextChr == '(' \|\| nextChr == ')'
	114	\|\| nextChr == '[' \|\| nextChr == ']')
	115	pos++; // exclude next character from analyze
	116
	117	} // chr != '\\'
	118	else if (chr == '(' && regexpC[pos + 1] == '?' && regexpC[pos + 2] == '#') // comment in regexp
	119	pos = regexp.find(')', pos); // skip comment
	120	else if (chr == '[')
	121	{
	122	if (isCharClassWithUnicode(regexp, pos))
	123	return true;
	124	}
	125
	126	if (pos == std::string::npos) // check results of regexp.find() and isCharClassWithUnicode
	127	return false;
	128
	129	pos++;
	130	}
	131
	132	// no Unicode Properties was found
	133	return false;
	134	}
	135
	136	inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos)
	137	{
	138	// read hex character code in form "\x{hh..}"
	139	// 'pos' must point to '\'
	140	if (pos >= regexp.length())
	141	return -1;
	142	const char* const regexpC = regexp.c_str();
	143	if (regexpC[pos] != '\\' \|\| regexpC[pos + 1] != 'x' \|\| regexpC[pos + 2] != '{')
	144	return -1;
	145
	146	pos++;
	147	const size_t startPos = pos; // 'startPos' points to 'x'
	148	const size_t closingBracketPos = regexp.find('}', startPos + 2);
	149	if (closingBracketPos == std::string::npos)
	150	return 0; // return character zero code, leave 'pos' at 'x'
	151
	152	pos++; // 'pos' points to '{'
	153	int chCode = 0;
	154	while (++pos < closingBracketPos)
	155	{
	156	const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]);
	157	if (xdigitVal >= 0)
	158	chCode = chCode * 16 + xdigitVal;
	159	else
	160	{ // found non-hexdigit
	161	pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code
	162	return 0; // return character zero code
	163	}
	164	}
	165
	166	return chCode;
	167	}
	168
	169	bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos)
	170	{
	171	const char* const regexpC = regexp.c_str();
	172	const size_t len = regexp.length();
	173	if (pos > len \|\| regexpC[pos] != '[')
	174	return false;
	175
	176	// look for Unicode character code "\x{hhh..}" and Unicode properties "\P", "\p" and "\X"
	177	// find end (terminating ']') of character class (like "[a-h45]")
	178	// detect nested POSIX classes like "[[:lower:]]" and escaped brackets like "[\]]"
	179	bool needUnicode = false;
	180	while (++pos < len)
	181	{
	182	if (regexpC[pos] == '[' && regexpC[pos + 1] == ':')
	183	{ // possible POSIX character class, like "[:alpha:]"
	184	const size_t nextClosingBracketPos = regexp.find(']', pos + 2); // don't care about "\]", as it produce error if used inside POSIX char class
	185
	186	if (nextClosingBracketPos == std::string::npos)
	187	{ // error in regexp: no closing ']' for character class
	188	pos = std::string::npos;
	189	return needUnicode;
	190	}
	191	else if (regexpC[nextClosingBracketPos - 1] == ':')
	192	pos = nextClosingBracketPos; // skip POSIX character class
	193	// if ":]" is not found, process "[:..." as part of normal character class
	194	}
	195	else if (regexpC[pos] == ']')
	196	return needUnicode; // end of character class
	197	else if (regexpC[pos] == '\\')
	198	{
	199	const char nextChar = regexpC[pos + 1];
	200	if (nextChar == ']' \|\| nextChar == '[')
	201	pos++; // skip next character
	202	else if (nextChar == 'Q')
	203	{
	204	pos = regexp.find("\\E", pos + 2);
	205	if (pos == std::string::npos)
	206	return needUnicode; // error in regexp: no closing "\E" after "\Q" in character class
	207	else
	208	pos++; // skip "\E"
	209	}
	210	else if (nextChar == 'p' \|\| nextChar == 'P' \|\| nextChar == 'X')
	211	needUnicode = true; // don't care about property name as it can contain only ASCII chars
	212	else if (nextChar == 'x')
	213	{
	214	if (readCharXCode(regexp, pos) >= 0x100)
	215	needUnicode = true;
	216	}
	217	}
	218	}
	219	pos = std::string::npos; // closing square bracket was not found
	220
	221	return needUnicode;
	222	}
	223
	224
	225	CRegExp::CRegExp(const CRegExp& re)
	226	{
	227	m_re = NULL;
	228	m_sd = NULL;
	229	m_jitStack = NULL;
	230	m_utf8Mode = re.m_utf8Mode;
	231	m_iOptions = re.m_iOptions;
	232	*this = re;
	233	}
	234
	235	CRegExp& CRegExp::operator=(const CRegExp& re)
	236	{
	237	size_t size;
	238	Cleanup();
	239	m_jitCompiled = false;
	240	m_pattern = re.m_pattern;
	241	if (re.m_re)
	242	{
	243	if (pcre_fullinfo(re.m_re, NULL, PCRE_INFO_SIZE, &size) >= 0)
	244	{
	245	if ((m_re = (pcre*)malloc(size)))
	246	{
	247	memcpy(m_re, re.m_re, size);
	248	memcpy(m_iOvector, re.m_iOvector, OVECCOUNT*sizeof(int));
	249	m_offset = re.m_offset;
	250	m_iMatchCount = re.m_iMatchCount;
	251	m_bMatched = re.m_bMatched;
	252	m_subject = re.m_subject;
	253	m_iOptions = re.m_iOptions;
	254	}
	255	else
	256	CLog::Log(LOGFATAL, "%s: Failed to allocate memory", __FUNCTION__);
	257	}
	258	}
	259	return *this;
	260	}
	261
	262	CRegExp::~CRegExp()
	263	{
	264	Cleanup();
	265	}
	266
	267	bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/)
	268	{
	269	if (!re)
	270	return false;
	271
	272	m_offset = 0;
	273	m_jitCompiled = false;
	274	m_bMatched = false;
	275	m_iMatchCount = 0;
	276	const char *errMsg = NULL;
	277	int errOffset = 0;
	278	int options = m_iOptions;
	279	if (m_utf8Mode == autoUtf8 && requireUtf8(re))
	280	options \|= (IsUtf8Supported() ? PCRE_UTF8 : 0) \| (AreUnicodePropertiesSupported() ? PCRE_UCP : 0);
	281
	282	Cleanup();
	283
	284	m_re = pcre_compile(re, options, &errMsg, &errOffset, NULL);
	285	if (!m_re)
	286	{
	287	m_pattern.clear();
	288	CLog::Log(LOGERROR, "PCRE: %s. Compilation failed at offset %d in expression '%s'",
	289	errMsg, errOffset, re);
	290	return false;
	291	}
	292
	293	m_pattern = re;
	294
	295	if (study)
	296	{
	297	const bool jitCompile = (study == StudyWithJitComp) && IsJitSupported();
	298	const int studyOptions = jitCompile ? PCRE_STUDY_JIT_COMPILE : 0;
	299
	300	m_sd = pcre_study(m_re, studyOptions, &errMsg);
	301	if (errMsg != NULL)
	302	{
	303	CLog::Log(LOGWARNING, "%s: PCRE error \"%s\" while studying expression", __FUNCTION__, errMsg);
	304	if (m_sd != NULL)
	305	{
	306	pcre_free_study(m_sd);
	307	m_sd = NULL;
	308	}
	309	}
	310	else if (jitCompile)
	311	{
	312	int jitPresent = 0;
	313	m_jitCompiled = (pcre_fullinfo(m_re, m_sd, PCRE_INFO_JIT, &jitPresent) == 0 && jitPresent == 1);
	314	}
	315	}
	316
	317	return true;
	318	}
	319
	320	int CRegExp::RegFind(const char *str, unsigned int startoffset /*= 0*/, int maxNumberOfCharsToTest /*= -1*/)
	321	{
	322	return PrivateRegFind(strlen(str), str, startoffset, maxNumberOfCharsToTest);
	323	}
	324
	325	int CRegExp::PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset /* = 0*/, int maxNumberOfCharsToTest /*= -1*/)
	326	{
	327	m_offset = 0;
	328	m_bMatched = false;
	329	m_iMatchCount = 0;
	330
	331	if (!m_re)
	332	{
	333	CLog::Log(LOGERROR, "PCRE: Called before compilation");
	334	return -1;
	335	}
	336
	337	if (!str)
	338	{
	339	CLog::Log(LOGERROR, "PCRE: Called without a string to match");
	340	return -1;
	341	}
	342
	343	if (startoffset > bufferLen)
	344	{
	345	CLog::Log(LOGERROR, "%s: startoffset is beyond end of string to match", __FUNCTION__);
	346	return -1;
	347	}
	348
	349	#ifdef PCRE_HAS_JIT_CODE
	350	if (m_jitCompiled && !m_jitStack)
	351	{
	352	m_jitStack = pcre_jit_stack_alloc(32*1024, 512*1024);
	353	if (m_jitStack == NULL)
	354	CLog::Log(LOGWARNING, "%s: can't allocate address space for JIT stack", __FUNCTION__);
	355
	356	pcre_assign_jit_stack(m_sd, NULL, m_jitStack);
	357	}
	358	#endif
	359
	360	if (maxNumberOfCharsToTest >= 0)
	361	bufferLen = std::min<size_t>(bufferLen, startoffset + maxNumberOfCharsToTest);
	362
	363	m_subject.assign(str + startoffset, bufferLen - startoffset);
	364	int rc = pcre_exec(m_re, NULL, m_subject.c_str(), m_subject.length(), 0, 0, m_iOvector, OVECCOUNT);
	365
	366	if (rc<1)
	367	{
	368	static const int fragmentLen = 80; // length of excerpt before erroneous char for log
	369	switch(rc)
	370	{
	371	case PCRE_ERROR_NOMATCH:
	372	return -1;
	373
	374	case PCRE_ERROR_MATCHLIMIT:
	375	CLog::Log(LOGERROR, "PCRE: Match limit reached");
	376	return -1;
	377
	378	#ifdef PCRE_ERROR_SHORTUTF8
	379	case PCRE_ERROR_SHORTUTF8:
	380	{
	381	const size_t startPos = (m_subject.length() > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_subject.length() - fragmentLen) : 0;
	382	if (startPos != std::string::npos)
	383	CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string. Text before bad character: \"%s\"", m_subject.substr(startPos).c_str());
	384	else
	385	CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string");
	386	return -1;
	387	}
	388	#endif
	389	case PCRE_ERROR_BADUTF8:
	390	{
	391	const size_t startPos = (m_iOvector[0] > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_iOvector[0] - fragmentLen) : 0;
	392	if (m_iOvector[0] >= 0 && startPos != std::string::npos)
	393	CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: %d, position: %d. Text before bad char: \"%s\"", m_iOvector[1], m_iOvector[0], m_subject.substr(startPos, m_iOvector[0] - startPos + 1).c_str());
	394	else
	395	CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: %d, position: %d", m_iOvector[1], m_iOvector[0]);
	396	return -1;
	397	}
	398	case PCRE_ERROR_BADUTF8_OFFSET:
	399	CLog::Log(LOGERROR, "PCRE: Offset is pointing to the middle of UTF-8 character");
	400	return -1;
	401
	402	default:
	403	CLog::Log(LOGERROR, "PCRE: Unknown error: %d", rc);
	404	return -1;
	405	}
	406	}
	407	m_offset = startoffset;
	408	m_bMatched = true;
	409	m_iMatchCount = rc;
	410	return m_iOvector[0] + m_offset;
	411	}
	412
	413	int CRegExp::GetCaptureTotal() const
	414	{
	415	int c = -1;
	416	if (m_re)
	417	pcre_fullinfo(m_re, NULL, PCRE_INFO_CAPTURECOUNT, &c);
	418	return c;
	419	}
	420
	421	std::string CRegExp::GetReplaceString(const std::string& sReplaceExp) const
	422	{
	423	if (!m_bMatched \|\| sReplaceExp.empty())
	424	return "";
	425
	426	const char* const expr = sReplaceExp.c_str();
	427
	428	size_t pos = sReplaceExp.find_first_of("\\&");
	429	std::string result(sReplaceExp, 0, pos);
	430	result.reserve(sReplaceExp.size()); // very rough estimate
	431
	432	while(pos != std::string::npos)
	433	{
	434	if (expr[pos] == '\\')
	435	{
	436	// string is null-terminated and current char isn't null, so it's safe to advance to next char
	437	pos++; // advance to next char
	438	const char nextChar = expr[pos];
	439	if (nextChar == '&' \|\| nextChar == '\\')
	440	{ // this is "\&" or "\\" combination
	441	result.push_back(nextChar); // add '&' or '\' to result
	442	pos++;
	443	}
	444	else if (isdigit(nextChar))
	445	{ // this is "\0" - "\9" combination
	446	int subNum = nextChar - '0';
	447	pos++; // advance to second next char
	448	const char secondNextChar = expr[pos];
	449	if (isdigit(secondNextChar))
	450	{ // this is "\00" - "\99" combination
	451	subNum = subNum * 10 + (secondNextChar - '0');
	452	pos++;
	453	}
	454	result.append(GetMatch(subNum));
	455	}
	456	}
	457	else
	458	{ // '&' char
	459	result.append(GetMatch(0));
	460	pos++;
	461	}
	462
	463	const size_t nextPos = sReplaceExp.find_first_of("\\&", pos);
	464	result.append(sReplaceExp, pos, nextPos - pos);
	465	pos = nextPos;
	466	}
	467
	468	return result;
	469	}
	470
	471	int CRegExp::GetSubStart(int iSub) const
	472	{
	473	if (!IsValidSubNumber(iSub))
	474	return -1;
	475
	476	return m_iOvector[iSub*2] + m_offset;
	477	}
	478
	479	int CRegExp::GetSubStart(const std::string& subName) const
	480	{
	481	return GetSubStart(GetNamedSubPatternNumber(subName.c_str()));
	482	}
	483
	484	int CRegExp::GetSubLength(int iSub) const
	485	{
	486	if (!IsValidSubNumber(iSub))
	487	return -1;
	488
	489	return m_iOvector[(iSub2)+1] - m_iOvector[(iSub2)];
	490	}
	491
	492	int CRegExp::GetSubLength(const std::string& subName) const
	493	{
	494	return GetSubLength(GetNamedSubPatternNumber(subName.c_str()));
	495	}
	496
	497	std::string CRegExp::GetMatch(int iSub /* = 0 */) const
	498	{
	499	if (!IsValidSubNumber(iSub))
	500	return "";
	501
	502	int pos = m_iOvector[(iSub*2)];
	503	int len = m_iOvector[(iSub*2)+1] - pos;
	504	if (pos < 0 \|\| len <= 0)
	505	return "";
	506
	507	return m_subject.substr(pos, len);
	508	}
	509
	510	std::string CRegExp::GetMatch(const std::string& subName) const
	511	{
	512	return GetMatch(GetNamedSubPatternNumber(subName.c_str()));
	513	}
	514
	515	bool CRegExp::GetNamedSubPattern(const char* strName, std::string& strMatch) const
	516	{
	517	strMatch.clear();
	518	int iSub = pcre_get_stringnumber(m_re, strName);
	519	if (!IsValidSubNumber(iSub))
	520	return false;
	521	strMatch = GetMatch(iSub);
	522	return true;
	523	}
	524
	525	int CRegExp::GetNamedSubPatternNumber(const char* strName) const
	526	{
	527	return pcre_get_stringnumber(m_re, strName);
	528	}
	529
	530	void CRegExp::DumpOvector(int iLog /* = LOGDEBUG */)
	531	{
	532	if (iLog < LOGDEBUG \|\| iLog > LOGNONE)
	533	return;
	534
	535	std::string str = "{";
	536	int size = GetSubCount(); // past the subpatterns is junk
	537	for (int i = 0; i <= size; i++)
	538	{
	539	std::string t = StringUtils::Format("[%i,%i]", m_iOvector[(i2)], m_iOvector[(i2)+1]);
	540	if (i != size)
	541	t += ",";
	542	str += t;
	543	}
	544	str += "}";
	545	CLog::Log(iLog, "regexp ovector=%s", str.c_str());
	546	}
	547
	548	void CRegExp::Cleanup()
	549	{
	550	if (m_re)
	551	{
	552	pcre_free(m_re);
	553	m_re = NULL;
	554	}
	555
	556	if (m_sd)
	557	{
	558	pcre_free_study(m_sd);
	559	m_sd = NULL;
	560	}
	561
	562	#ifdef PCRE_HAS_JIT_CODE
	563	if (m_jitStack)
	564	{
	565	pcre_jit_stack_free(m_jitStack);
	566	m_jitStack = NULL;
	567	}
	568	#endif
	569	}
	570
	571	inline bool CRegExp::IsValidSubNumber(int iSub) const
	572	{
	573	return iSub >= 0 && iSub <= m_iMatchCount && iSub <= m_MaxNumOfBackrefrences;
	574	}
	575
	576
	577	bool CRegExp::IsUtf8Supported(void)
	578	{
	579	if (m_Utf8Supported == -1)
	580	{
	581	if (pcre_config(PCRE_CONFIG_UTF8, &m_Utf8Supported) != 0)
	582	m_Utf8Supported = 0;
	583	}
	584
	585	return m_Utf8Supported == 1;
	586	}
	587
	588	bool CRegExp::AreUnicodePropertiesSupported(void)
	589	{
	590	#if defined(PCRE_CONFIG_UNICODE_PROPERTIES) && PCRE_UCP != 0
	591	if (m_UcpSupported == -1)
	592	{
	593	if (pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &m_UcpSupported) != 0)
	594	m_UcpSupported = 0;
	595	}
	596	#endif
	597
	598	return m_UcpSupported == 1;
	599	}
	600
	601	bool CRegExp::LogCheckUtf8Support(void)
	602	{
	603	bool utf8FullSupport = true;
	604
	605	if (!CRegExp::IsUtf8Supported())
	606	{
	607	utf8FullSupport = false;
	608	CLog::Log(LOGWARNING, "UTF-8 is not supported in PCRE lib, support for national symbols is limited!");
	609	}
	610
	611	if (!CRegExp::AreUnicodePropertiesSupported())
	612	{
	613	utf8FullSupport = false;
	614	CLog::Log(LOGWARNING, "Unicode properties are not enabled in PCRE lib, support for national symbols may be limited!");
	615	}
	616
	617	if (!utf8FullSupport)
	618	{
	619	CLog::Log(LOGINFO,
	620	"Consider installing PCRE lib version 8.10 or later with enabled Unicode properties "
	621	"and UTF-8 support. Your PCRE lib version: %s",
	622	PCRE::pcre_version());
	623	#if PCRE_UCP == 0
	624	CLog::Log(LOGINFO, "You will need to rebuild XBMC after PCRE lib update.");
	625	#endif
	626	}
	627
	628	return utf8FullSupport;
	629	}
	630
	631	bool CRegExp::IsJitSupported(void)
	632	{
	633	if (m_JitSupported == -1)
	634	{
	635	#ifdef PCRE_HAS_JIT_CODE
	636	if (pcre_config(PCRE_CONFIG_JIT, &m_JitSupported) != 0)
	637	#endif
	638	m_JitSupported = 0;
	639	}
	640
	641	return m_JitSupported == 1;
	642	}