sync with upstream

author: manuel <manuel@mausz.at> 2020-10-19 00:52:24 +0200
committer: manuel <manuel@mausz.at> 2020-10-19 00:52:24 +0200
commit: be933ef2241d79558f91796cc5b3a161f72ebf9c (patch)
tree: fe3ab2f130e20c99001f2d7a81d610c78c96a3f4 /xbmc/utils/ScraperParser.cpp
parent: 5f8335c1e49ce108ef3481863833c98efa00411b (diff)
download: kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.gz
kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.bz2
kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.zip
1 files changed, 616 insertions, 0 deletions
diff --git a/xbmc/utils/ScraperParser.cpp b/xbmc/utils/ScraperParser.cpp
new file mode 100644
index 0000000..81fcf37
--- /dev/null
+++ b/xbmc/utils/ScraperParser.cpp
@@ -0,0 +1,616 @@
+/*
+ *  Copyright (C) 2012-2018 Team Kodi
+ *  This file is part of Kodi - https://kodi.tv
+ *
+ *  SPDX-License-Identifier: GPL-2.0-or-later
+ *  See LICENSES/README.md for more information.
+ */
+#include "ScraperParser.h"
+#include "addons/AddonManager.h"
+#include "guilib/LocalizeStrings.h"
+#include "RegExp.h"
+#include "HTMLUtil.h"
+#include "addons/Scraper.h"
+#include "URL.h"
+#include "utils/StringUtils.h"
+#include "log.h"
+#include "CharsetConverter.h"
+#ifdef HAVE_LIBXSLT
+#include "utils/XSLTUtils.h"
+#endif
+#include "utils/XMLUtils.h"
+#include <sstream>
+#include <cstring>
+using namespace ADDON;
+using namespace XFILE;
+// Creates an empty parser: no document loaded, no scraper attached,
+// search strings encoded as UTF-8 and the parser flagged as a no-op.
+CScraperParser::CScraperParser()
+{
+  m_isNoop = true;
+  m_scraper = nullptr;
+  m_SearchStringEncoding = "UTF-8";
+  m_document = nullptr;
+  m_pRootElement = nullptr;
+}
+// Copy constructor: starts from the same empty state as the default
+// constructor and then copies `parser` through operator=.
+CScraperParser::CScraperParser(const CScraperParser& parser)
+  : CScraperParser()
+{
+  *this = parser;
+}
+// Copy assignment. Drops the current document, then either deep-copies the
+// XML document of `parser` and re-runs LoadFromXML() on the copy, or, when
+// `parser` has no document, just detaches the scraper.
+CScraperParser &CScraperParser::operator=(const CScraperParser &parser)
+{
+  // self-assignment must not destroy the document we are about to copy
+  if (this == &parser)
+    return *this;
+  Clear();
+  if (!parser.m_document)
+  {
+    m_scraper = nullptr;
+    return *this;
+  }
+  m_scraper = parser.m_scraper;
+  m_document = new CXBMCTinyXML(*parser.m_document);
+  LoadFromXML();
+  return *this;
+}
+// Releases the owned XML document by delegating to Clear().
+CScraperParser::~CScraperParser()
+{
+  Clear();
+}
+// Deletes the loaded XML document and forgets the file it came from.
+// The scraper pointer, the no-op flag and the parameter buffers are left
+// untouched.
+void CScraperParser::Clear()
+{
+  // LoadFromXML() stores the raw pointer returned by Attribute() in
+  // m_SearchStringEncoding; that string presumably lives inside m_document,
+  // so fall back to the default before the document is destroyed instead of
+  // keeping a dangling pointer.
+  m_SearchStringEncoding = "UTF-8";
+  m_pRootElement = NULL;
+  delete m_document;
+  m_document = NULL;
+  m_strFile.clear();
+}
+// Loads a scraper definition from the XML file `strXMLFile`, replacing
+// whatever document was loaded before.
+// Returns the result of LoadFromXML() when the file could be read, false
+// otherwise (the freshly created document is discarded in that case).
+bool CScraperParser::Load(const std::string& strXMLFile)
+{
+  Clear();
+  m_document = new CXBMCTinyXML();
+  if (m_document == NULL)
+    return false;
+  m_strFile = strXMLFile;
+  const bool bFileRead = m_document->LoadFile(strXMLFile);
+  if (!bFileRead)
+  {
+    delete m_document;
+    m_document = NULL;
+    return false;
+  }
+  return LoadFromXML();
+}
+// Validates the document held in m_document and caches scraper-wide settings.
+//
+// The document is accepted when its root element is named "scraper". For each
+// of the CreateSearchUrl / CreateArtistSearchUrl / CreateAlbumSearchUrl
+// functions present, the parser is marked as not being a no-op and the
+// "SearchStringEncoding" attribute (default "UTF-8") is remembered; a later
+// function overrides the value taken from an earlier one.
+//
+// Returns true on success. On failure (no document, no root element, or a
+// root element with another name) the document is deleted and false is
+// returned.
+bool CScraperParser::LoadFromXML()
+{
+  if (!m_document)
+    return false;
+  m_pRootElement = m_document->RootElement();
+  // RootElement() yields NULL for a document that contains no element, so it
+  // has to be checked before it is dereferenced.
+  if (m_pRootElement && m_pRootElement->ValueStr() == "scraper")
+  {
+    static const char* const searchFunctions[] = {"CreateSearchUrl",
+                                                  "CreateArtistSearchUrl",
+                                                  "CreateAlbumSearchUrl"};
+    for (const char* function : searchFunctions)
+    {
+      TiXmlElement* pChildElement = m_pRootElement->FirstChildElement(function);
+      if (!pChildElement)
+        continue;
+      m_isNoop = false;
+      m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding");
+      if (!m_SearchStringEncoding)
+        m_SearchStringEncoding = "UTF-8";
+    }
+    return true;
+  }
+  delete m_document;
+  m_document = NULL;
+  m_pRootElement = NULL;
+  return false;
+}
+// Expands the placeholders found in strDest, in place:
+//   "$$N"           -> content of buffer m_param[N-1], N = 1..MAX_SCRAPER_BUFFERS
+//   "$INFO[id]"     -> m_scraper->GetSetting(id), empty when no scraper is set
+//   "$LOCALIZE[n]"  -> add-on string n of the scraper, empty when no scraper is set
+//   backslash + 'n' -> a newline character
+// A "$INFO[" or "$LOCALIZE[" that has no closing ']' is left untouched.
+void CScraperParser::ReplaceBuffers(std::string& strDest)
+{
+  // insert buffers, highest number first (presumably so that a two-digit
+  // placeholder such as "$$10" is not consumed as "$$1" followed by "0")
+  size_t iIndex;
+  for (int i=MAX_SCRAPER_BUFFERS-1; i>=0; i--)
+  {
+    iIndex = 0;
+    std::string temp = StringUtils::Format("$$%i",i+1);
+    while ((iIndex = strDest.find(temp,iIndex)) != std::string::npos)
+    {
+      strDest.replace(iIndex,temp.size(),m_param[i]);
+      // continue behind the inserted text so it is not expanded again
+      iIndex += m_param[i].length();
+    }
+  }
+  // insert settings
+  iIndex = 0;
+  while ((iIndex = strDest.find("$INFO[", iIndex)) != std::string::npos)
+  {
+    size_t iEnd = strDest.find("]", iIndex);
+    if (iEnd == std::string::npos)
+      break; // unterminated placeholder: nothing sensible to replace
+    std::string strInfo = strDest.substr(iIndex+6, iEnd - iIndex - 6);
+    std::string strReplace;
+    if (m_scraper)
+      strReplace = m_scraper->GetSetting(strInfo);
+    strDest.replace(iIndex,iEnd - iIndex + 1,strReplace);
+    iIndex += strReplace.length();
+  }
+  // insert localize strings
+  iIndex = 0;
+  while ((iIndex = strDest.find("$LOCALIZE[", iIndex)) != std::string::npos)
+  {
+    size_t iEnd = strDest.find("]", iIndex);
+    if (iEnd == std::string::npos)
+      break; // unterminated placeholder: nothing sensible to replace
+    std::string strInfo = strDest.substr(iIndex+10, iEnd - iIndex - 10);
+    std::string strReplace;
+    if (m_scraper)
+      strReplace = g_localizeStrings.GetAddonString(m_scraper->ID(), strtol(strInfo.c_str(),NULL,10));
+    strDest.replace(iIndex,iEnd - iIndex + 1,strReplace);
+    iIndex += strReplace.length();
+  }
+  // turn the two-character escape "\n" into a real newline
+  iIndex = 0;
+  while ((iIndex = strDest.find("\\n",iIndex)) != std::string::npos)
+    strDest.replace(iIndex,2,"\n");
+}
+// Runs the <expression> child of `element` as a regular expression over
+// `input` and adds the expanded "output" attribute of `element` to `dest`.
+// Does nothing when `element` has no <expression> child.
+//
+// Attributes read from <expression>:
+//   cs="yes"       match case sensitively (default: case insensitive)
+//   utf8           "yes" / "no" / "auto" select the CRegExp UTF-8 mode
+//   repeat="yes"   keep matching on the rest of the input after each match
+//   clear="yes"    empty `dest` up front, even when nothing matches
+//   noclean, trim, fixchars, encode
+//                  comma separated 1-based capture numbers (see
+//                  GetBufferParams); the matching "\N" references in the
+//                  output get wrapped in marker tokens that Clean() resolves
+//   optional       capture number N; every "\(...\2...\)" section of the
+//                  output loses its "\(" and "\)" markers when capture N is
+//                  non-empty and is removed completely otherwise
+//   compare        1-based buffer number; that buffer is lower-cased and a
+//                  result is only kept when its lower-cased form contains it
+//
+// When bAppend is false, `dest` is emptied on the first match; all results
+// are appended after that. An expression that does not compile, or a
+// "compare" buffer number outside 1..MAX_SCRAPER_BUFFERS, skips the
+// expression.
+void CScraperParser::ParseExpression(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend)
+{
+  std::string strOutput = XMLUtils::GetAttribute(element, "output");
+  TiXmlElement* pExpression = element->FirstChildElement("expression");
+  if (pExpression)
+  {
+    bool bInsensitive=true;
+    const char* sensitive = pExpression->Attribute("cs");
+    if (sensitive)
+      if (StringUtils::CompareNoCase(sensitive, "yes") == 0)
+        bInsensitive=false; // match case sensitive
+    CRegExp::utf8Mode eUtf8 = CRegExp::autoUtf8;
+    const char* const strUtf8 = pExpression->Attribute("utf8");
+    if (strUtf8)
+    {
+      if (StringUtils::CompareNoCase(strUtf8, "yes") == 0)
+        eUtf8 = CRegExp::forceUtf8;
+      else if (StringUtils::CompareNoCase(strUtf8, "no") == 0)
+        eUtf8 = CRegExp::asciiOnly;
+      else if (StringUtils::CompareNoCase(strUtf8, "auto") == 0)
+        eUtf8 = CRegExp::autoUtf8;
+    }
+    CRegExp reg(bInsensitive, eUtf8);
+    // an <expression> without text matches the whole input
+    std::string strExpression;
+    if (pExpression->FirstChild())
+      strExpression = pExpression->FirstChild()->Value();
+    else
+      strExpression = "(.*)";
+    ReplaceBuffers(strExpression);
+    ReplaceBuffers(strOutput);
+    if (!reg.RegComp(strExpression.c_str()))
+    {
+      return;
+    }
+    bool bRepeat = false;
+    const char* szRepeat = pExpression->Attribute("repeat");
+    if (szRepeat)
+      if (StringUtils::CompareNoCase(szRepeat, "yes") == 0)
+        bRepeat = true;
+    const char* szClear = pExpression->Attribute("clear");
+    if (szClear)
+      if (StringUtils::CompareNoCase(szClear, "yes") == 0)
+        dest=""; // clear no matter if regexp fails
+    bool bClean[MAX_SCRAPER_BUFFERS];
+    GetBufferParams(bClean,pExpression->Attribute("noclean"),true);
+    bool bTrim[MAX_SCRAPER_BUFFERS];
+    GetBufferParams(bTrim,pExpression->Attribute("trim"),false);
+    bool bFixChars[MAX_SCRAPER_BUFFERS];
+    GetBufferParams(bFixChars,pExpression->Attribute("fixchars"),false);
+    bool bEncode[MAX_SCRAPER_BUFFERS];
+    GetBufferParams(bEncode,pExpression->Attribute("encode"),false);
+    int iOptional = -1;
+    pExpression->QueryIntAttribute("optional",&iOptional);
+    int iCompare = -1;
+    pExpression->QueryIntAttribute("compare",&iCompare);
+    if (iCompare > -1)
+    {
+      // the attribute is 1-based and indexes m_param, so reject anything
+      // that would read or write outside of the buffer array
+      if (iCompare < 1 || iCompare > MAX_SCRAPER_BUFFERS)
+      {
+        CLog::Log(LOGERROR,"CScraperParser::ParseExpression: compare buffer "
+                           "out of bounds, skipping expression");
+        return;
+      }
+      StringUtils::ToLower(m_param[iCompare-1]);
+    }
+    std::string curInput = input;
+    for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
+    {
+      if (bClean[iBuf])
+        InsertToken(strOutput,iBuf+1,"!!!CLEAN!!!");
+      if (bTrim[iBuf])
+        InsertToken(strOutput,iBuf+1,"!!!TRIM!!!");
+      if (bFixChars[iBuf])
+        InsertToken(strOutput,iBuf+1,"!!!FIXCHARS!!!");
+      if (bEncode[iBuf])
+        InsertToken(strOutput,iBuf+1,"!!!ENCODE!!!");
+    }
+    int i = reg.RegFind(curInput.c_str());
+    while (i > -1 && (i < (int)curInput.size() || curInput.empty()))
+    {
+      if (!bAppend)
+      {
+        // first match replaces the old content, later matches append
+        dest = "";
+        bAppend = true;
+      }
+      std::string strCurOutput=strOutput;
+      if (iOptional > -1) // check that required param is there
+      {
+        char temp[12];
+        snprintf(temp,sizeof(temp),"\\%i",iOptional);
+        std::string szParam = reg.GetReplaceString(temp);
+        CRegExp reg2;
+        reg2.RegComp("(.*)(\\\\\\(.*\\\\2.*)\\\\\\)(.*)");
+        int i2=reg2.RegFind(strCurOutput.c_str());
+        while (i2 > -1)
+        {
+          // szRemove is the section starting at "\(" up to, but not
+          // including, the closing "\)"
+          std::string szRemove(reg2.GetMatch(2));
+          int iRemove = szRemove.size();
+          int i3 = strCurOutput.find(szRemove);
+          if (!szParam.empty())
+          {
+            // keep the section, drop only the "\)" and "\(" markers
+            strCurOutput.erase(i3+iRemove,2);
+            strCurOutput.erase(i3,2);
+          }
+          else
+            strCurOutput.replace(strCurOutput.begin()+i3,strCurOutput.begin()+i3+iRemove+2,"");
+          i2 = reg2.RegFind(strCurOutput.c_str());
+        }
+      }
+      int iLen = reg.GetFindLen();
+      // nasty hack #1 - & means \0 in a replace string
+      StringUtils::Replace(strCurOutput, "&","!!!AMPAMP!!!");
+      std::string result = reg.GetReplaceString(strCurOutput.c_str());
+      if (!result.empty())
+      {
+        std::string strResult(result);
+        StringUtils::Replace(strResult, "!!!AMPAMP!!!","&");
+        Clean(strResult);
+        ReplaceBuffers(strResult);
+        if (iCompare > -1)
+        {
+          std::string strResultNoCase = strResult;
+          StringUtils::ToLower(strResultNoCase);
+          if (strResultNoCase.find(m_param[iCompare-1]) != std::string::npos)
+            dest += strResult;
+        }
+        else
+          dest += strResult;
+      }
+      if (bRepeat && iLen > 0)
+      {
+        // drop everything up to the end of this match and search again
+        curInput.erase(0,i+iLen>(int)curInput.size()?curInput.size():i+iLen);
+        i = reg.RegFind(curInput.c_str());
+      }
+      else
+        i = -1;
+    }
+  }
+}
+// Applies the stylesheet stored in the first child element of `element` to
+// `input` and hands `dest` to the transform. Buffer placeholders inside the
+// stylesheet are expanded first. bAppend is not used by this implementation,
+// and the function body is empty when built without HAVE_LIBXSLT.
+void CScraperParser::ParseXSLT(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend)
+{
+#ifdef HAVE_LIBXSLT
+  TiXmlElement* pStylesheet = element->FirstChildElement();
+  if (!pStylesheet)
+    return;
+  XSLTUtils xsltUtils;
+  // serialise the stylesheet element back to text
+  std::string strStylesheet;
+  strStylesheet << *pStylesheet;
+  ReplaceBuffers(strStylesheet);
+  const bool bInputOk = xsltUtils.SetInput(input);
+  if (!bInputOk)
+    CLog::Log(LOGDEBUG, "could not parse input XML");
+  const bool bSheetOk = xsltUtils.SetStylesheet(strStylesheet);
+  if (!bSheetOk)
+    CLog::Log(LOGDEBUG, "could not parse stylesheet XML");
+  xsltUtils.XSLTTransform(dest);
+#endif
+}
+// Returns the first child element of `element` named "RegExp" (or "XSLT"
+// when built with HAVE_LIBXSLT), or NULL when there is none.
+TiXmlElement *FirstChildScraperElement(TiXmlElement *element)
+{
+  TiXmlElement *candidate = element->FirstChildElement();
+  while (candidate)
+  {
+    const std::string tag = candidate->ValueStr();
+#ifdef HAVE_LIBXSLT
+    if (tag == "XSLT")
+      return candidate;
+#endif
+    if (tag == "RegExp")
+      return candidate;
+    candidate = candidate->NextSiblingElement();
+  }
+  return NULL;
+}
+// Returns the next sibling element after `element` named "RegExp" (or "XSLT"
+// when built with HAVE_LIBXSLT), or NULL when there is none.
+TiXmlElement *NextSiblingScraperElement(TiXmlElement *element)
+{
+  TiXmlElement *candidate = element->NextSiblingElement();
+  while (candidate)
+  {
+    const std::string tag = candidate->ValueStr();
+#ifdef HAVE_LIBXSLT
+    if (tag == "XSLT")
+      return candidate;
+#endif
+    if (tag == "RegExp")
+      return candidate;
+    candidate = candidate->NextSiblingElement();
+  }
+  return NULL;
+}
+// Evaluates `element` and all following scraper sibling elements.
+//
+// For every element, nested scraper elements are evaluated first (or, when
+// there are none, a "clear" child if present). Then the element's own
+// attributes are read:
+//   dest         1-based destination buffer, default 1; a trailing '+'
+//                requests appending
+//   input        text to run on (placeholders are expanded); defaults to
+//                the content of buffer 1
+//   conditional  name of a scraper setting that has to be "true"; a leading
+//                '!' inverts the test
+// and the element is executed through ParseXSLT or ParseExpression.
+void CScraperParser::ParseNext(TiXmlElement* element)
+{
+  for (TiXmlElement* pNode = element; pNode; pNode = NextSiblingScraperElement(pNode))
+  {
+    // depth first: inner elements fill the buffers the outer one reads
+    TiXmlElement* pNested = FirstChildScraperElement(pNode);
+    if (!pNested)
+      pNested = pNode->FirstChildElement("clear");
+    if (pNested)
+      ParseNext(pNested);
+    int iDest = 1;
+    bool bAppend = false;
+    const char* szDest = pNode->Attribute("dest");
+    if (szDest && *szDest)
+    {
+      const size_t destLen = strlen(szDest);
+      bAppend = (szDest[destLen-1] == '+');
+      iDest = atoi(szDest);
+    }
+    std::string strInput;
+    if (const char* szInput = pNode->Attribute("input"))
+    {
+      strInput = szInput;
+      ReplaceBuffers(strInput);
+    }
+    else
+      strInput = m_param[0];
+    bool bExecute = true;
+    if (const char* szConditional = pNode->Attribute("conditional"))
+    {
+      const bool bInverse = (szConditional[0] == '!');
+      if (bInverse)
+        ++szConditional; // skip the '!' to get at the setting name
+      std::string strSetting;
+      if (m_scraper && m_scraper->HasSettings())
+        strSetting = m_scraper->GetSetting(szConditional);
+      bExecute = bInverse != (strSetting == "true");
+    }
+    if (!bExecute)
+      continue;
+    const int iBuffer = iDest - 1;
+    if (iBuffer <= -1 || iBuffer >= MAX_SCRAPER_BUFFERS)
+    {
+      CLog::Log(LOGERROR,"CScraperParser::ParseNext: destination buffer "
+                         "out of bounds, skipping expression");
+      continue;
+    }
+#ifdef HAVE_LIBXSLT
+    if (pNode->ValueStr() == "XSLT")
+    {
+      ParseXSLT(strInput, m_param[iBuffer], pNode, bAppend);
+      continue;
+    }
+#endif
+    ParseExpression(strInput, m_param[iBuffer], pNode, bAppend);
+  }
+}
+// Runs the scraper function `strTag` (a child element of the root element)
+// on behalf of `scraper` and returns the content of its result buffer.
+//
+// The result buffer is selected by the function's 1-based "dest" attribute
+// (default 1). Unless the function carries clearbuffers="no", all buffers
+// are cleared afterwards.
+//
+// Returns an empty string when no scraper definition is loaded, when the
+// function does not exist, or when "dest" does not name a valid buffer.
+const std::string CScraperParser::Parse(const std::string& strTag,
+                                       CScraper* scraper)
+{
+  if (m_pRootElement == NULL)
+  {
+    CLog::Log(LOGERROR,"%s: No scraper loaded, cannot run function %s",__FUNCTION__,strTag.c_str());
+    return "";
+  }
+  TiXmlElement* pChildElement = m_pRootElement->FirstChildElement(strTag.c_str());
+  if(pChildElement == NULL)
+  {
+    CLog::Log(LOGERROR,"%s: Could not find scraper function %s",__FUNCTION__,strTag.c_str());
+    return "";
+  }
+  int iResult = 1; // default to param 1
+  pChildElement->QueryIntAttribute("dest",&iResult);
+  TiXmlElement* pChildStart = FirstChildScraperElement(pChildElement);
+  m_scraper = scraper;
+  ParseNext(pChildStart);
+  // "dest" comes straight from the XML, so validate it before indexing
+  std::string tmp;
+  if (iResult >= 1 && iResult <= MAX_SCRAPER_BUFFERS)
+    tmp = m_param[iResult-1];
+  else
+    CLog::Log(LOGERROR,"%s: result buffer %i of scraper function %s is out of bounds",__FUNCTION__,iResult,strTag.c_str());
+  const char* szClearBuffers = pChildElement->Attribute("clearbuffers");
+  if (!szClearBuffers || StringUtils::CompareNoCase(szClearBuffers, "no") != 0)
+    ClearBuffers();
+  return tmp;
+}
+// Resolves the marker pairs that ParseExpression wrapped around capture
+// references. Each pass looks for text enclosed by two identical markers,
+// replaces "marker text marker" by the processed text and stops at the first
+// marker that has no partner. Passes run in this order:
+//   !!!CLEAN!!!     HTML tags removed, then trimmed
+//   !!!TRIM!!!      trimmed
+//   !!!FIXCHARS!!!  converted via ConvertHTMLToW, trimmed, then ConvertJSON
+//   !!!ENCODE!!!    passed through CURL::Encode
+void CScraperParser::Clean(std::string& strDirty)
+{
+  const std::string::size_type npos = std::string::npos;
+  const std::string cleanTag = "!!!CLEAN!!!";
+  for (size_t open = strDirty.find(cleanTag, 0); open != npos; open = strDirty.find(cleanTag, open))
+  {
+    const size_t close = strDirty.find(cleanTag, open + cleanTag.size());
+    if (close == npos)
+      break;
+    std::string strInner = strDirty.substr(open + cleanTag.size(), close - open - cleanTag.size());
+    HTML::CHTMLUtil::RemoveTags(strInner);
+    StringUtils::Trim(strInner);
+    strDirty.replace(open, close - open + cleanTag.size(), strInner);
+    open += strInner.size();
+  }
+  const std::string trimTag = "!!!TRIM!!!";
+  for (size_t open = strDirty.find(trimTag, 0); open != npos; open = strDirty.find(trimTag, open))
+  {
+    const size_t close = strDirty.find(trimTag, open + trimTag.size());
+    if (close == npos)
+      break;
+    std::string strInner = strDirty.substr(open + trimTag.size(), close - open - trimTag.size());
+    StringUtils::Trim(strInner);
+    strDirty.replace(open, close - open + trimTag.size(), strInner);
+    open += strInner.size();
+  }
+  const std::string fixTag = "!!!FIXCHARS!!!";
+  for (size_t open = strDirty.find(fixTag, 0); open != npos; open = strDirty.find(fixTag, open))
+  {
+    const size_t close = strDirty.find(fixTag, open + fixTag.size());
+    if (close == npos)
+      break;
+    std::string strInner = strDirty.substr(open + fixTag.size(), close - open - fixTag.size());
+    // round trip through wide characters to run the HTML conversion
+    std::wstring wInner;
+    g_charsetConverter.utf8ToW(strInner, wInner, false, false, false);
+    std::wstring wFixed;
+    HTML::CHTMLUtil::ConvertHTMLToW(wInner,wFixed);
+    g_charsetConverter.wToUTF8(wFixed, strInner, false);
+    StringUtils::Trim(strInner);
+    ConvertJSON(strInner);
+    strDirty.replace(open, close - open + fixTag.size(), strInner);
+    open += strInner.size();
+  }
+  const std::string encodeTag = "!!!ENCODE!!!";
+  for (size_t open = strDirty.find(encodeTag, 0); open != npos; open = strDirty.find(encodeTag, open))
+  {
+    const size_t close = strDirty.find(encodeTag, open + encodeTag.size());
+    if (close == npos)
+      break;
+    const std::string strEncoded = CURL::Encode(strDirty.substr(open + encodeTag.size(), close - open - encodeTag.size()));
+    strDirty.replace(open, close - open + encodeTag.size(), strEncoded);
+    open += strEncoded.size();
+  }
+}
+// Rewrites JSON-style escapes inside `string`, in place:
+//   1. backslash + 'u' + four hex digits becomes the entity "&#xHHHH;"
+//   2. backslash + 'x' + two decimal digits, together with the following
+//      run of non-backslash characters up to and including a ';', becomes
+//      the decimal value of those two digits read as hexadecimal
+//   3. backslash + double quote becomes a plain double quote
+void CScraperParser::ConvertJSON(std::string &string)
+{
+  CRegExp reg;
+  // explicit hex digit class: the range 0-f would also accept every ASCII
+  // character between '9' and 'a' (':' .. '`'), i.e. non-hex input
+  reg.RegComp("\\\\u([0-9a-fA-F]{4})");
+  while (reg.RegFind(string.c_str()) > -1)
+  {
+    int pos = reg.GetSubStart(1);
+    std::string szReplace(reg.GetMatch(1));
+    std::string replace = StringUtils::Format("&#x%s;", szReplace.c_str());
+    // pos points at the first hex digit; the escape starts two chars earlier
+    string.replace(string.begin()+pos-2, string.begin()+pos+4, replace);
+  }
+  CRegExp reg2;
+  reg2.RegComp("\\\\x([0-9]{2})([^\\\\]+;)");
+  while (reg2.RegFind(string.c_str()) > -1)
+  {
+    int pos1 = reg2.GetSubStart(1);
+    int pos2 = reg2.GetSubStart(2);
+    std::string szHexValue(reg2.GetMatch(1));
+    std::string replace = StringUtils::Format("%li", strtol(szHexValue.c_str(), NULL, 16));
+    string.replace(string.begin()+pos1-2, string.begin()+pos2+reg2.GetSubLength(2), replace);
+  }
+  StringUtils::Replace(string, "\\\"","\"");
+}
+// Empties every parameter buffer.
+void CScraperParser::ClearBuffers()
+{
+  for (int iBuf = 0; iBuf < MAX_SCRAPER_BUFFERS; ++iBuf)
+    m_param[iBuf].clear();
+}
+// Builds a per-buffer flag table.
+//
+// result    array of MAX_SCRAPER_BUFFERS flags, all set to `defvalue` first
+// attribute comma separated list of 1-based buffer numbers whose flag is
+//           set to !defvalue; may be NULL, in which case only the defaults
+//           are written
+// defvalue  value for every buffer not listed in `attribute`
+//
+// Entries of the list that do not name a buffer in
+// 1..MAX_SCRAPER_BUFFERS are ignored.
+void CScraperParser::GetBufferParams(bool* result, const char* attribute, bool defvalue)
+{
+  for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
+    result[iBuf] = defvalue;
+  if (attribute)
+  {
+    std::vector<std::string> vecBufs;
+    StringUtils::Tokenize(attribute,vecBufs,",");
+    for (size_t nToken=0; nToken < vecBufs.size(); nToken++)
+    {
+      // atoi() yields 0 for "0" and for non-numeric text, which would turn
+      // into index -1, so the lower bound has to be checked as well
+      int index = atoi(vecBufs[nToken].c_str())-1;
+      if (index >= 0 && index < MAX_SCRAPER_BUFFERS)
+        result[index] = !defvalue;
+    }
+  }
+}
+// Wraps every occurrence of the capture reference "\<buf>" in strOutput with
+// `token` on both sides, e.g. "\1" becomes "<token>\1<token>".
+void CScraperParser::InsertToken(std::string& strOutput, int buf, const char* token)
+{
+  // build the reference as a std::string; a fixed four byte buffer cannot
+  // hold a number with more than two digits
+  const std::string strReference = StringUtils::Format("\\%i",buf);
+  const size_t tokenLen = strlen(token);
+  size_t i2=0;
+  while ((i2 = strOutput.find(strReference,i2)) != std::string::npos)
+  {
+    strOutput.insert(i2,token);
+    // step over the leading token and the reference itself
+    i2 += tokenLen + strReference.size();
+    strOutput.insert(i2,token);
+  }
+}
+// Appends a copy of every child node of the root element of `doc` to the
+// root element of the loaded scraper document. Does nothing when `doc` is
+// NULL or has no root element, or when no scraper document is loaded.
+void CScraperParser::AddDocument(const CXBMCTinyXML* doc)
+{
+  if (!doc || !m_pRootElement)
+    return;
+  const TiXmlElement* pSourceRoot = doc->RootElement();
+  if (!pSourceRoot)
+    return;
+  const TiXmlNode* node = pSourceRoot->FirstChild();
+  while (node)
+  {
+    m_pRootElement->InsertEndChild(*node);
+    node = node->NextSibling();
+  }
+}
author	manuel <manuel@mausz.at>	2020-10-19 00:52:24 +0200
committer	manuel <manuel@mausz.at>	2020-10-19 00:52:24 +0200
commit	be933ef2241d79558f91796cc5b3a161f72ebf9c (patch)
tree	fe3ab2f130e20c99001f2d7a81d610c78c96a3f4 /xbmc/utils/ScraperParser.cpp
parent	5f8335c1e49ce108ef3481863833c98efa00411b (diff)
download	kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.gz kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.bz2 kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.zip

diff --git a/xbmc/utils/ScraperParser.cpp b/xbmc/utils/ScraperParser.cpp new file mode 100644 index 0000000..81fcf37 --- /dev/null +++ b/xbmc/utils/ScraperParser.cpp
@@ -0,0 +1,616 @@
	1	/*
	2	* Copyright (C) 2012-2018 Team Kodi
	3	* This file is part of Kodi - https://kodi.tv
	4	*
	5	* SPDX-License-Identifier: GPL-2.0-or-later
	6	* See LICENSES/README.md for more information.
	7	*/
	8
	9	#include "ScraperParser.h"
	10
	11	#include "addons/AddonManager.h"
	12	#include "guilib/LocalizeStrings.h"
	13	#include "RegExp.h"
	14	#include "HTMLUtil.h"
	15	#include "addons/Scraper.h"
	16	#include "URL.h"
	17	#include "utils/StringUtils.h"
	18	#include "log.h"
	19	#include "CharsetConverter.h"
	20	#ifdef HAVE_LIBXSLT
	21	#include "utils/XSLTUtils.h"
	22	#endif
	23	#include "utils/XMLUtils.h"
	24	#include <sstream>
	25	#include <cstring>
	26
	27	using namespace ADDON;
	28	using namespace XFILE;
	29
	30	CScraperParser::CScraperParser()
	31	{
	32	m_pRootElement = NULL;
	33	m_document = NULL;
	34	m_SearchStringEncoding = "UTF-8";
	35	m_scraper = NULL;
	36	m_isNoop = true;
	37	}
	38
	39	CScraperParser::CScraperParser(const CScraperParser& parser)
	40	{
	41	m_pRootElement = NULL;
	42	m_document = NULL;
	43	m_SearchStringEncoding = "UTF-8";
	44	m_scraper = NULL;
	45	m_isNoop = true;
	46	*this = parser;
	47	}
	48
	49	CScraperParser &CScraperParser::operator=(const CScraperParser &parser)
	50	{
	51	if (this != &parser)
	52	{
	53	Clear();
	54	if (parser.m_document)
	55	{
	56	m_scraper = parser.m_scraper;
	57	m_document = new CXBMCTinyXML(*parser.m_document);
	58	LoadFromXML();
	59	}
	60	else
	61	m_scraper = NULL;
	62	}
	63	return *this;
	64	}
	65
	66	CScraperParser::~CScraperParser()
	67	{
	68	Clear();
	69	}
	70
	71	void CScraperParser::Clear()
	72	{
	73	m_pRootElement = NULL;
	74	delete m_document;
	75
	76	m_document = NULL;
	77	m_strFile.clear();
	78	}
	79
	80	bool CScraperParser::Load(const std::string& strXMLFile)
	81	{
	82	Clear();
	83
	84	m_document = new CXBMCTinyXML();
	85
	86	if (!m_document)
	87	return false;
	88
	89	m_strFile = strXMLFile;
	90
	91	if (m_document->LoadFile(strXMLFile))
	92	return LoadFromXML();
	93
	94	delete m_document;
	95	m_document = NULL;
	96	return false;
	97	}
	98
	99	bool CScraperParser::LoadFromXML()
	100	{
	101	if (!m_document)
	102	return false;
	103
	104	m_pRootElement = m_document->RootElement();
	105	std::string strValue = m_pRootElement->ValueStr();
	106	if (strValue == "scraper")
	107	{
	108	TiXmlElement* pChildElement = m_pRootElement->FirstChildElement("CreateSearchUrl");
	109	if (pChildElement)
	110	{
	111	m_isNoop = false;
	112	if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
	113	m_SearchStringEncoding = "UTF-8";
	114	}
	115
	116	pChildElement = m_pRootElement->FirstChildElement("CreateArtistSearchUrl");
	117	if (pChildElement)
	118	{
	119	m_isNoop = false;
	120	if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
	121	m_SearchStringEncoding = "UTF-8";
	122	}
	123	pChildElement = m_pRootElement->FirstChildElement("CreateAlbumSearchUrl");
	124	if (pChildElement)
	125	{
	126	m_isNoop = false;
	127	if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
	128	m_SearchStringEncoding = "UTF-8";
	129	}
	130
	131	return true;
	132	}
	133
	134	delete m_document;
	135	m_document = NULL;
	136	m_pRootElement = NULL;
	137	return false;
	138	}
	139
	140	void CScraperParser::ReplaceBuffers(std::string& strDest)
	141	{
	142	// insert buffers
	143	size_t iIndex;
	144	for (int i=MAX_SCRAPER_BUFFERS-1; i>=0; i--)
	145	{
	146	iIndex = 0;
	147	std::string temp = StringUtils::Format("$$%i",i+1);
	148	while ((iIndex = strDest.find(temp,iIndex)) != std::string::npos)
	149	{
	150	strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+temp.size(),m_param[i]);
	151	iIndex += m_param[i].length();
	152	}
	153	}
	154	// insert settings
	155	iIndex = 0;
	156	while ((iIndex = strDest.find("$INFO[", iIndex)) != std::string::npos)
	157	{
	158	size_t iEnd = strDest.find("]", iIndex);
	159	std::string strInfo = strDest.substr(iIndex+6, iEnd - iIndex - 6);
	160	std::string strReplace;
	161	if (m_scraper)
	162	strReplace = m_scraper->GetSetting(strInfo);
	163	strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
	164	iIndex += strReplace.length();
	165	}
	166	// insert localize strings
	167	iIndex = 0;
	168	while ((iIndex = strDest.find("$LOCALIZE[", iIndex)) != std::string::npos)
	169	{
	170	size_t iEnd = strDest.find("]", iIndex);
	171	std::string strInfo = strDest.substr(iIndex+10, iEnd - iIndex - 10);
	172	std::string strReplace;
	173	if (m_scraper)
	174	strReplace = g_localizeStrings.GetAddonString(m_scraper->ID(), strtol(strInfo.c_str(),NULL,10));
	175	strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
	176	iIndex += strReplace.length();
	177	}
	178	iIndex = 0;
	179	while ((iIndex = strDest.find("\\n",iIndex)) != std::string::npos)
	180	strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+2,"\n");
	181	}
	182
	183	void CScraperParser::ParseExpression(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend)
	184	{
	185	std::string strOutput = XMLUtils::GetAttribute(element, "output");
	186
	187	TiXmlElement* pExpression = element->FirstChildElement("expression");
	188	if (pExpression)
	189	{
	190	bool bInsensitive=true;
	191	const char* sensitive = pExpression->Attribute("cs");
	192	if (sensitive)
	193	if (StringUtils::CompareNoCase(sensitive, "yes") == 0)
	194	bInsensitive=false; // match case sensitive
	195
	196	CRegExp::utf8Mode eUtf8 = CRegExp::autoUtf8;
	197	const char* const strUtf8 = pExpression->Attribute("utf8");
	198	if (strUtf8)
	199	{
	200	if (StringUtils::CompareNoCase(strUtf8, "yes") == 0)
	201	eUtf8 = CRegExp::forceUtf8;
	202	else if (StringUtils::CompareNoCase(strUtf8, "no") == 0)
	203	eUtf8 = CRegExp::asciiOnly;
	204	else if (StringUtils::CompareNoCase(strUtf8, "auto") == 0)
	205	eUtf8 = CRegExp::autoUtf8;
	206	}
	207
	208	CRegExp reg(bInsensitive, eUtf8);
	209	std::string strExpression;
	210	if (pExpression->FirstChild())
	211	strExpression = pExpression->FirstChild()->Value();
	212	else
	213	strExpression = "(.*)";
	214	ReplaceBuffers(strExpression);
	215	ReplaceBuffers(strOutput);
	216
	217	if (!reg.RegComp(strExpression.c_str()))
	218	{
	219	return;
	220	}
	221
	222	bool bRepeat = false;
	223	const char* szRepeat = pExpression->Attribute("repeat");
	224	if (szRepeat)
	225	if (StringUtils::CompareNoCase(szRepeat, "yes") == 0)
	226	bRepeat = true;
	227
	228	const char* szClear = pExpression->Attribute("clear");
	229	if (szClear)
	230	if (StringUtils::CompareNoCase(szClear, "yes") == 0)
	231	dest=""; // clear no matter if regexp fails
	232
	233	bool bClean[MAX_SCRAPER_BUFFERS];
	234	GetBufferParams(bClean,pExpression->Attribute("noclean"),true);
	235
	236	bool bTrim[MAX_SCRAPER_BUFFERS];
	237	GetBufferParams(bTrim,pExpression->Attribute("trim"),false);
	238
	239	bool bFixChars[MAX_SCRAPER_BUFFERS];
	240	GetBufferParams(bFixChars,pExpression->Attribute("fixchars"),false);
	241
	242	bool bEncode[MAX_SCRAPER_BUFFERS];
	243	GetBufferParams(bEncode,pExpression->Attribute("encode"),false);
	244
	245	int iOptional = -1;
	246	pExpression->QueryIntAttribute("optional",&iOptional);
	247
	248	int iCompare = -1;
	249	pExpression->QueryIntAttribute("compare",&iCompare);
	250	if (iCompare > -1)
	251	StringUtils::ToLower(m_param[iCompare-1]);
	252	std::string curInput = input;
	253	for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
	254	{
	255	if (bClean[iBuf])
	256	InsertToken(strOutput,iBuf+1,"!!!CLEAN!!!");
	257	if (bTrim[iBuf])
	258	InsertToken(strOutput,iBuf+1,"!!!TRIM!!!");
	259	if (bFixChars[iBuf])
	260	InsertToken(strOutput,iBuf+1,"!!!FIXCHARS!!!");
	261	if (bEncode[iBuf])
	262	InsertToken(strOutput,iBuf+1,"!!!ENCODE!!!");
	263	}
	264	int i = reg.RegFind(curInput.c_str());
	265	while (i > -1 && (i < (int)curInput.size() || curInput.empty()))
	266	{
	267	if (!bAppend)
	268	{
	269	dest = "";
	270	bAppend = true;
	271	}
	272	std::string strCurOutput=strOutput;
	273
	274	if (iOptional > -1) // check that required param is there
	275	{
	276	char temp[12];
	277	sprintf(temp,"\\%i",iOptional);
	278	std::string szParam = reg.GetReplaceString(temp);
	279	CRegExp reg2;
	280	reg2.RegComp("(.*)(\\\\\\(.*\\\\2.*)\\\\\\)(.*)");
	281	int i2=reg2.RegFind(strCurOutput.c_str());
	282	while (i2 > -1)
	283	{
	284	std::string szRemove(reg2.GetMatch(2));
	285	int iRemove = szRemove.size();
	286	int i3 = strCurOutput.find(szRemove);
	287	if (!szParam.empty())
	288	{
	289	strCurOutput.erase(i3+iRemove,2);
	290	strCurOutput.erase(i3,2);
	291	}
	292	else
	293	strCurOutput.replace(strCurOutput.begin()+i3,strCurOutput.begin()+i3+iRemove+2,"");
	294
	295	i2 = reg2.RegFind(strCurOutput.c_str());
	296	}
	297	}
	298
	299	int iLen = reg.GetFindLen();
	300	// nasty hack #1 - & means \0 in a replace string
	301	StringUtils::Replace(strCurOutput, "&","!!!AMPAMP!!!");
	302	std::string result = reg.GetReplaceString(strCurOutput.c_str());
	303	if (!result.empty())
	304	{
	305	std::string strResult(result);
	306	StringUtils::Replace(strResult, "!!!AMPAMP!!!","&");
	307	Clean(strResult);
	308	ReplaceBuffers(strResult);
	309	if (iCompare > -1)
	310	{
	311	std::string strResultNoCase = strResult;
	312	StringUtils::ToLower(strResultNoCase);
	313	if (strResultNoCase.find(m_param[iCompare-1]) != std::string::npos)
	314	dest += strResult;
	315	}
	316	else
	317	dest += strResult;
	318	}
	319	if (bRepeat && iLen > 0)
	320	{
	321	curInput.erase(0,i+iLen>(int)curInput.size()?curInput.size():i+iLen);
	322	i = reg.RegFind(curInput.c_str());
	323	}
	324	else
	325	i = -1;
	326	}
	327	}
	328	}
	329
	330	void CScraperParser::ParseXSLT(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend)
	331	{
	332	#ifdef HAVE_LIBXSLT
	333	TiXmlElement* pSheet = element->FirstChildElement();
	334	if (pSheet)
	335	{
	336	XSLTUtils xsltUtils;
	337	std::string strXslt;
	338	strXslt << *pSheet;
	339	ReplaceBuffers(strXslt);
	340
	341	if (!xsltUtils.SetInput(input))
	342	CLog::Log(LOGDEBUG, "could not parse input XML");
	343
	344	if (!xsltUtils.SetStylesheet(strXslt))
	345	CLog::Log(LOGDEBUG, "could not parse stylesheet XML");
	346
	347	xsltUtils.XSLTTransform(dest);
	348	}
	349	#endif
	350	}
	351
	352	TiXmlElement *FirstChildScraperElement(TiXmlElement *element)
	353	{
	354	for (TiXmlElement *child = element->FirstChildElement(); child; child = child->NextSiblingElement())
	355	{
	356	#ifdef HAVE_LIBXSLT
	357	if (child->ValueStr() == "XSLT")
	358	return child;
	359	#endif
	360	if (child->ValueStr() == "RegExp")
	361	return child;
	362	}
	363	return NULL;
	364	}
	365
	366	TiXmlElement *NextSiblingScraperElement(TiXmlElement *element)
	367	{
	368	for (TiXmlElement *next = element->NextSiblingElement(); next; next = next->NextSiblingElement())
	369	{
	370	#ifdef HAVE_LIBXSLT
	371	if (next->ValueStr() == "XSLT")
	372	return next;
	373	#endif
	374	if (next->ValueStr() == "RegExp")
	375	return next;
	376	}
	377	return NULL;
	378	}
	379
	380	void CScraperParser::ParseNext(TiXmlElement* element)
	381	{
	382	TiXmlElement* pReg = element;
	383	while (pReg)
	384	{
	385	TiXmlElement* pChildReg = FirstChildScraperElement(pReg);
	386	if (pChildReg)
	387	ParseNext(pChildReg);
	388	else
	389	{
	390	TiXmlElement* pChildReg = pReg->FirstChildElement("clear");
	391	if (pChildReg)
	392	ParseNext(pChildReg);
	393	}
	394
	395	int iDest = 1;
	396	bool bAppend = false;
	397	const char* szDest = pReg->Attribute("dest");
	398	if (szDest && strlen(szDest))
	399	{
	400	if (szDest[strlen(szDest)-1] == '+')
	401	bAppend = true;
	402
	403	iDest = atoi(szDest);
	404	}
	405
	406	const char *szInput = pReg->Attribute("input");
	407	std::string strInput;
	408	if (szInput)
	409	{
	410	strInput = szInput;
	411	ReplaceBuffers(strInput);
	412	}
	413	else
	414	strInput = m_param[0];
	415
	416	const char* szConditional = pReg->Attribute("conditional");
	417	bool bExecute = true;
	418	if (szConditional)
	419	{
	420	bool bInverse=false;
	421	if (szConditional[0] == '!')
	422	{
	423	bInverse = true;
	424	szConditional++;
	425	}
	426	std::string strSetting;
	427	if (m_scraper && m_scraper->HasSettings())
	428	strSetting = m_scraper->GetSetting(szConditional);
	429	bExecute = bInverse != (strSetting == "true");
	430	}
	431
	432	if (bExecute)
	433	{
	434	if (iDest-1 < MAX_SCRAPER_BUFFERS && iDest-1 > -1)
	435	{
	436	#ifdef HAVE_LIBXSLT
	437	if (pReg->ValueStr() == "XSLT")
	438	ParseXSLT(strInput, m_param[iDest - 1], pReg, bAppend);
	439	else
	440	#endif
	441	ParseExpression(strInput, m_param[iDest - 1],pReg,bAppend);
	442	}
	443	else
	444	CLog::Log(LOGERROR,"CScraperParser::ParseNext: destination buffer "
	445	"out of bounds, skipping expression");
	446	}
	447	pReg = NextSiblingScraperElement(pReg);
	448	}
	449	}
	450
	451	const std::string CScraperParser::Parse(const std::string& strTag,
	452	CScraper* scraper)
	453	{
	454	TiXmlElement* pChildElement = m_pRootElement->FirstChildElement(strTag.c_str());
	455	if(pChildElement == NULL)
	456	{
	457	CLog::Log(LOGERROR,"%s: Could not find scraper function %s",__FUNCTION__,strTag.c_str());
	458	return "";
	459	}
	460	int iResult = 1; // default to param 1
	461	pChildElement->QueryIntAttribute("dest",&iResult);
	462	TiXmlElement* pChildStart = FirstChildScraperElement(pChildElement);
	463	m_scraper = scraper;
	464	ParseNext(pChildStart);
	465	std::string tmp = m_param[iResult-1];
	466
	467	const char* szClearBuffers = pChildElement->Attribute("clearbuffers");
	468	if (!szClearBuffers \|\| StringUtils::CompareNoCase(szClearBuffers, "no") != 0)
	469	ClearBuffers();
	470
	471	return tmp;
	472	}
	473
	474	void CScraperParser::Clean(std::string& strDirty)
	475	{
	476	size_t i = 0;
	477	std::string strBuffer;
	478	while ((i = strDirty.find("!!!CLEAN!!!",i)) != std::string::npos)
	479	{
	480	size_t i2;
	481	if ((i2 = strDirty.find("!!!CLEAN!!!",i+11)) != std::string::npos)
	482	{
	483	strBuffer = strDirty.substr(i+11,i2-i-11);
	484	std::string strConverted(strBuffer);
	485	HTML::CHTMLUtil::RemoveTags(strConverted);
	486	StringUtils::Trim(strConverted);
	487	strDirty.replace(i, i2-i+11, strConverted);
	488	i += strConverted.size();
	489	}
	490	else
	491	break;
	492	}
	493	i=0;
	494	while ((i = strDirty.find("!!!TRIM!!!",i)) != std::string::npos)
	495	{
	496	size_t i2;
	497	if ((i2 = strDirty.find("!!!TRIM!!!",i+10)) != std::string::npos)
	498	{
	499	strBuffer = strDirty.substr(i+10,i2-i-10);
	500	StringUtils::Trim(strBuffer);
	501	strDirty.replace(i, i2-i+10, strBuffer);
	502	i += strBuffer.size();
	503	}
	504	else
	505	break;
	506	}
	507	i=0;
	508	while ((i = strDirty.find("!!!FIXCHARS!!!",i)) != std::string::npos)
	509	{
	510	size_t i2;
	511	if ((i2 = strDirty.find("!!!FIXCHARS!!!",i+14)) != std::string::npos)
	512	{
	513	strBuffer = strDirty.substr(i+14,i2-i-14);
	514	std::wstring wbuffer;
	515	g_charsetConverter.utf8ToW(strBuffer, wbuffer, false, false, false);
	516	std::wstring wConverted;
	517	HTML::CHTMLUtil::ConvertHTMLToW(wbuffer,wConverted);
	518	g_charsetConverter.wToUTF8(wConverted, strBuffer, false);
	519	StringUtils::Trim(strBuffer);
	520	ConvertJSON(strBuffer);
	521	strDirty.replace(i, i2-i+14, strBuffer);
	522	i += strBuffer.size();
	523	}
	524	else
	525	break;
	526	}
	527	i=0;
	528	while ((i=strDirty.find("!!!ENCODE!!!",i)) != std::string::npos)
	529	{
	530	size_t i2;
	531	if ((i2 = strDirty.find("!!!ENCODE!!!",i+12)) != std::string::npos)
	532	{
	533	strBuffer = CURL::Encode(strDirty.substr(i + 12, i2 - i - 12));
	534	strDirty.replace(i, i2-i+12, strBuffer);
	535	i += strBuffer.size();
	536	}
	537	else
	538	break;
	539	}
	540	}
	541
	542	void CScraperParser::ConvertJSON(std::string &string)
	543	{
	544	CRegExp reg;
	545	reg.RegComp("\\\\u([0-f]{4})");
	546	while (reg.RegFind(string.c_str()) > -1)
	547	{
	548	int pos = reg.GetSubStart(1);
	549	std::string szReplace(reg.GetMatch(1));
	550
	551	std::string replace = StringUtils::Format("&#x%s;", szReplace.c_str());
	552	string.replace(string.begin()+pos-2, string.begin()+pos+4, replace);
	553	}
	554
	555	CRegExp reg2;
	556	reg2.RegComp("\\\\x([0-9]{2})([^\\\\]+;)");
	557	while (reg2.RegFind(string.c_str()) > -1)
	558	{
	559	int pos1 = reg2.GetSubStart(1);
	560	int pos2 = reg2.GetSubStart(2);
	561	std::string szHexValue(reg2.GetMatch(1));
	562
	563	std::string replace = StringUtils::Format("%li", strtol(szHexValue.c_str(), NULL, 16));
	564	string.replace(string.begin()+pos1-2, string.begin()+pos2+reg2.GetSubLength(2), replace);
	565	}
	566
	567	StringUtils::Replace(string, "\\\"","\"");
	568	}
	569
	570	void CScraperParser::ClearBuffers()
	571	{
	572	//clear all m_param strings
	573	for (std::string& param : m_param)
	574	param.clear();
	575	}
	576
	577	void CScraperParser::GetBufferParams(bool* result, const char* attribute, bool defvalue)
	578	{
	579	for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
	580	result[iBuf] = defvalue;
	581	if (attribute)
	582	{
	583	std::vector<std::string> vecBufs;
	584	StringUtils::Tokenize(attribute,vecBufs,",");
	585	for (size_t nToken=0; nToken < vecBufs.size(); nToken++)
	586	{
	587	int index = atoi(vecBufs[nToken].c_str())-1;
	588	if (index < MAX_SCRAPER_BUFFERS)
	589	result[index] = !defvalue;
	590	}
	591	}
	592	}
	593
	594	void CScraperParser::InsertToken(std::string& strOutput, int buf, const char* token)
	595	{
	596	char temp[4];
	597	sprintf(temp,"\\%i",buf);
	598	size_t i2=0;
	599	while ((i2 = strOutput.find(temp,i2)) != std::string::npos)
	600	{
	601	strOutput.insert(i2,token);
	602	i2 += strlen(token) + strlen(temp);
	603	strOutput.insert(i2,token);
	604	}
	605	}
	606
	607	void CScraperParser::AddDocument(const CXBMCTinyXML* doc)
	608	{
	609	const TiXmlNode* node = doc->RootElement()->FirstChild();
	610	while (node)
	611	{
	612	m_pRootElement->InsertEndChild(*node);
	613	node = node->NextSibling();
	614	}
	615	}
	616