diff options
Diffstat (limited to 'xbmc/utils/ScraperParser.cpp')
| -rw-r--r-- | xbmc/utils/ScraperParser.cpp | 616 |
1 files changed, 616 insertions, 0 deletions
diff --git a/xbmc/utils/ScraperParser.cpp b/xbmc/utils/ScraperParser.cpp new file mode 100644 index 0000000..81fcf37 --- /dev/null +++ b/xbmc/utils/ScraperParser.cpp | |||
| @@ -0,0 +1,616 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012-2018 Team Kodi | ||
| 3 | * This file is part of Kodi - https://kodi.tv | ||
| 4 | * | ||
| 5 | * SPDX-License-Identifier: GPL-2.0-or-later | ||
| 6 | * See LICENSES/README.md for more information. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include "ScraperParser.h" | ||
| 10 | |||
| 11 | #include "addons/AddonManager.h" | ||
| 12 | #include "guilib/LocalizeStrings.h" | ||
| 13 | #include "RegExp.h" | ||
| 14 | #include "HTMLUtil.h" | ||
| 15 | #include "addons/Scraper.h" | ||
| 16 | #include "URL.h" | ||
| 17 | #include "utils/StringUtils.h" | ||
| 18 | #include "log.h" | ||
| 19 | #include "CharsetConverter.h" | ||
| 20 | #ifdef HAVE_LIBXSLT | ||
| 21 | #include "utils/XSLTUtils.h" | ||
| 22 | #endif | ||
| 23 | #include "utils/XMLUtils.h" | ||
| 24 | #include <sstream> | ||
| 25 | #include <cstring> | ||
| 26 | |||
| 27 | using namespace ADDON; | ||
| 28 | using namespace XFILE; | ||
| 29 | |||
| 30 | CScraperParser::CScraperParser() | ||
| 31 | { | ||
| 32 | m_pRootElement = NULL; | ||
| 33 | m_document = NULL; | ||
| 34 | m_SearchStringEncoding = "UTF-8"; | ||
| 35 | m_scraper = NULL; | ||
| 36 | m_isNoop = true; | ||
| 37 | } | ||
| 38 | |||
| 39 | CScraperParser::CScraperParser(const CScraperParser& parser) | ||
| 40 | { | ||
| 41 | m_pRootElement = NULL; | ||
| 42 | m_document = NULL; | ||
| 43 | m_SearchStringEncoding = "UTF-8"; | ||
| 44 | m_scraper = NULL; | ||
| 45 | m_isNoop = true; | ||
| 46 | *this = parser; | ||
| 47 | } | ||
| 48 | |||
| 49 | CScraperParser &CScraperParser::operator=(const CScraperParser &parser) | ||
| 50 | { | ||
| 51 | if (this != &parser) | ||
| 52 | { | ||
| 53 | Clear(); | ||
| 54 | if (parser.m_document) | ||
| 55 | { | ||
| 56 | m_scraper = parser.m_scraper; | ||
| 57 | m_document = new CXBMCTinyXML(*parser.m_document); | ||
| 58 | LoadFromXML(); | ||
| 59 | } | ||
| 60 | else | ||
| 61 | m_scraper = NULL; | ||
| 62 | } | ||
| 63 | return *this; | ||
| 64 | } | ||
| 65 | |||
| 66 | CScraperParser::~CScraperParser() | ||
| 67 | { | ||
| 68 | Clear(); | ||
| 69 | } | ||
| 70 | |||
| 71 | void CScraperParser::Clear() | ||
| 72 | { | ||
| 73 | m_pRootElement = NULL; | ||
| 74 | delete m_document; | ||
| 75 | |||
| 76 | m_document = NULL; | ||
| 77 | m_strFile.clear(); | ||
| 78 | } | ||
| 79 | |||
| 80 | bool CScraperParser::Load(const std::string& strXMLFile) | ||
| 81 | { | ||
| 82 | Clear(); | ||
| 83 | |||
| 84 | m_document = new CXBMCTinyXML(); | ||
| 85 | |||
| 86 | if (!m_document) | ||
| 87 | return false; | ||
| 88 | |||
| 89 | m_strFile = strXMLFile; | ||
| 90 | |||
| 91 | if (m_document->LoadFile(strXMLFile)) | ||
| 92 | return LoadFromXML(); | ||
| 93 | |||
| 94 | delete m_document; | ||
| 95 | m_document = NULL; | ||
| 96 | return false; | ||
| 97 | } | ||
| 98 | |||
| 99 | bool CScraperParser::LoadFromXML() | ||
| 100 | { | ||
| 101 | if (!m_document) | ||
| 102 | return false; | ||
| 103 | |||
| 104 | m_pRootElement = m_document->RootElement(); | ||
| 105 | std::string strValue = m_pRootElement->ValueStr(); | ||
| 106 | if (strValue == "scraper") | ||
| 107 | { | ||
| 108 | TiXmlElement* pChildElement = m_pRootElement->FirstChildElement("CreateSearchUrl"); | ||
| 109 | if (pChildElement) | ||
| 110 | { | ||
| 111 | m_isNoop = false; | ||
| 112 | if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding"))) | ||
| 113 | m_SearchStringEncoding = "UTF-8"; | ||
| 114 | } | ||
| 115 | |||
| 116 | pChildElement = m_pRootElement->FirstChildElement("CreateArtistSearchUrl"); | ||
| 117 | if (pChildElement) | ||
| 118 | { | ||
| 119 | m_isNoop = false; | ||
| 120 | if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding"))) | ||
| 121 | m_SearchStringEncoding = "UTF-8"; | ||
| 122 | } | ||
| 123 | pChildElement = m_pRootElement->FirstChildElement("CreateAlbumSearchUrl"); | ||
| 124 | if (pChildElement) | ||
| 125 | { | ||
| 126 | m_isNoop = false; | ||
| 127 | if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding"))) | ||
| 128 | m_SearchStringEncoding = "UTF-8"; | ||
| 129 | } | ||
| 130 | |||
| 131 | return true; | ||
| 132 | } | ||
| 133 | |||
| 134 | delete m_document; | ||
| 135 | m_document = NULL; | ||
| 136 | m_pRootElement = NULL; | ||
| 137 | return false; | ||
| 138 | } | ||
| 139 | |||
| 140 | void CScraperParser::ReplaceBuffers(std::string& strDest) | ||
| 141 | { | ||
| 142 | // insert buffers | ||
| 143 | size_t iIndex; | ||
| 144 | for (int i=MAX_SCRAPER_BUFFERS-1; i>=0; i--) | ||
| 145 | { | ||
| 146 | iIndex = 0; | ||
| 147 | std::string temp = StringUtils::Format("$$%i",i+1); | ||
| 148 | while ((iIndex = strDest.find(temp,iIndex)) != std::string::npos) | ||
| 149 | { | ||
| 150 | strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+temp.size(),m_param[i]); | ||
| 151 | iIndex += m_param[i].length(); | ||
| 152 | } | ||
| 153 | } | ||
| 154 | // insert settings | ||
| 155 | iIndex = 0; | ||
| 156 | while ((iIndex = strDest.find("$INFO[", iIndex)) != std::string::npos) | ||
| 157 | { | ||
| 158 | size_t iEnd = strDest.find("]", iIndex); | ||
| 159 | std::string strInfo = strDest.substr(iIndex+6, iEnd - iIndex - 6); | ||
| 160 | std::string strReplace; | ||
| 161 | if (m_scraper) | ||
| 162 | strReplace = m_scraper->GetSetting(strInfo); | ||
| 163 | strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace); | ||
| 164 | iIndex += strReplace.length(); | ||
| 165 | } | ||
| 166 | // insert localize strings | ||
| 167 | iIndex = 0; | ||
| 168 | while ((iIndex = strDest.find("$LOCALIZE[", iIndex)) != std::string::npos) | ||
| 169 | { | ||
| 170 | size_t iEnd = strDest.find("]", iIndex); | ||
| 171 | std::string strInfo = strDest.substr(iIndex+10, iEnd - iIndex - 10); | ||
| 172 | std::string strReplace; | ||
| 173 | if (m_scraper) | ||
| 174 | strReplace = g_localizeStrings.GetAddonString(m_scraper->ID(), strtol(strInfo.c_str(),NULL,10)); | ||
| 175 | strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace); | ||
| 176 | iIndex += strReplace.length(); | ||
| 177 | } | ||
| 178 | iIndex = 0; | ||
| 179 | while ((iIndex = strDest.find("\\n",iIndex)) != std::string::npos) | ||
| 180 | strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+2,"\n"); | ||
| 181 | } | ||
| 182 | |||
| 183 | void CScraperParser::ParseExpression(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend) | ||
| 184 | { | ||
| 185 | std::string strOutput = XMLUtils::GetAttribute(element, "output"); | ||
| 186 | |||
| 187 | TiXmlElement* pExpression = element->FirstChildElement("expression"); | ||
| 188 | if (pExpression) | ||
| 189 | { | ||
| 190 | bool bInsensitive=true; | ||
| 191 | const char* sensitive = pExpression->Attribute("cs"); | ||
| 192 | if (sensitive) | ||
| 193 | if (StringUtils::CompareNoCase(sensitive, "yes") == 0) | ||
| 194 | bInsensitive=false; // match case sensitive | ||
| 195 | |||
| 196 | CRegExp::utf8Mode eUtf8 = CRegExp::autoUtf8; | ||
| 197 | const char* const strUtf8 = pExpression->Attribute("utf8"); | ||
| 198 | if (strUtf8) | ||
| 199 | { | ||
| 200 | if (StringUtils::CompareNoCase(strUtf8, "yes") == 0) | ||
| 201 | eUtf8 = CRegExp::forceUtf8; | ||
| 202 | else if (StringUtils::CompareNoCase(strUtf8, "no") == 0) | ||
| 203 | eUtf8 = CRegExp::asciiOnly; | ||
| 204 | else if (StringUtils::CompareNoCase(strUtf8, "auto") == 0) | ||
| 205 | eUtf8 = CRegExp::autoUtf8; | ||
| 206 | } | ||
| 207 | |||
| 208 | CRegExp reg(bInsensitive, eUtf8); | ||
| 209 | std::string strExpression; | ||
| 210 | if (pExpression->FirstChild()) | ||
| 211 | strExpression = pExpression->FirstChild()->Value(); | ||
| 212 | else | ||
| 213 | strExpression = "(.*)"; | ||
| 214 | ReplaceBuffers(strExpression); | ||
| 215 | ReplaceBuffers(strOutput); | ||
| 216 | |||
| 217 | if (!reg.RegComp(strExpression.c_str())) | ||
| 218 | { | ||
| 219 | return; | ||
| 220 | } | ||
| 221 | |||
| 222 | bool bRepeat = false; | ||
| 223 | const char* szRepeat = pExpression->Attribute("repeat"); | ||
| 224 | if (szRepeat) | ||
| 225 | if (StringUtils::CompareNoCase(szRepeat, "yes") == 0) | ||
| 226 | bRepeat = true; | ||
| 227 | |||
| 228 | const char* szClear = pExpression->Attribute("clear"); | ||
| 229 | if (szClear) | ||
| 230 | if (StringUtils::CompareNoCase(szClear, "yes") == 0) | ||
| 231 | dest=""; // clear no matter if regexp fails | ||
| 232 | |||
| 233 | bool bClean[MAX_SCRAPER_BUFFERS]; | ||
| 234 | GetBufferParams(bClean,pExpression->Attribute("noclean"),true); | ||
| 235 | |||
| 236 | bool bTrim[MAX_SCRAPER_BUFFERS]; | ||
| 237 | GetBufferParams(bTrim,pExpression->Attribute("trim"),false); | ||
| 238 | |||
| 239 | bool bFixChars[MAX_SCRAPER_BUFFERS]; | ||
| 240 | GetBufferParams(bFixChars,pExpression->Attribute("fixchars"),false); | ||
| 241 | |||
| 242 | bool bEncode[MAX_SCRAPER_BUFFERS]; | ||
| 243 | GetBufferParams(bEncode,pExpression->Attribute("encode"),false); | ||
| 244 | |||
| 245 | int iOptional = -1; | ||
| 246 | pExpression->QueryIntAttribute("optional",&iOptional); | ||
| 247 | |||
| 248 | int iCompare = -1; | ||
| 249 | pExpression->QueryIntAttribute("compare",&iCompare); | ||
| 250 | if (iCompare > -1) | ||
| 251 | StringUtils::ToLower(m_param[iCompare-1]); | ||
| 252 | std::string curInput = input; | ||
| 253 | for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf) | ||
| 254 | { | ||
| 255 | if (bClean[iBuf]) | ||
| 256 | InsertToken(strOutput,iBuf+1,"!!!CLEAN!!!"); | ||
| 257 | if (bTrim[iBuf]) | ||
| 258 | InsertToken(strOutput,iBuf+1,"!!!TRIM!!!"); | ||
| 259 | if (bFixChars[iBuf]) | ||
| 260 | InsertToken(strOutput,iBuf+1,"!!!FIXCHARS!!!"); | ||
| 261 | if (bEncode[iBuf]) | ||
| 262 | InsertToken(strOutput,iBuf+1,"!!!ENCODE!!!"); | ||
| 263 | } | ||
| 264 | int i = reg.RegFind(curInput.c_str()); | ||
| 265 | while (i > -1 && (i < (int)curInput.size() || curInput.empty())) | ||
| 266 | { | ||
| 267 | if (!bAppend) | ||
| 268 | { | ||
| 269 | dest = ""; | ||
| 270 | bAppend = true; | ||
| 271 | } | ||
| 272 | std::string strCurOutput=strOutput; | ||
| 273 | |||
| 274 | if (iOptional > -1) // check that required param is there | ||
| 275 | { | ||
| 276 | char temp[12]; | ||
| 277 | sprintf(temp,"\\%i",iOptional); | ||
| 278 | std::string szParam = reg.GetReplaceString(temp); | ||
| 279 | CRegExp reg2; | ||
| 280 | reg2.RegComp("(.*)(\\\\\\(.*\\\\2.*)\\\\\\)(.*)"); | ||
| 281 | int i2=reg2.RegFind(strCurOutput.c_str()); | ||
| 282 | while (i2 > -1) | ||
| 283 | { | ||
| 284 | std::string szRemove(reg2.GetMatch(2)); | ||
| 285 | int iRemove = szRemove.size(); | ||
| 286 | int i3 = strCurOutput.find(szRemove); | ||
| 287 | if (!szParam.empty()) | ||
| 288 | { | ||
| 289 | strCurOutput.erase(i3+iRemove,2); | ||
| 290 | strCurOutput.erase(i3,2); | ||
| 291 | } | ||
| 292 | else | ||
| 293 | strCurOutput.replace(strCurOutput.begin()+i3,strCurOutput.begin()+i3+iRemove+2,""); | ||
| 294 | |||
| 295 | i2 = reg2.RegFind(strCurOutput.c_str()); | ||
| 296 | } | ||
| 297 | } | ||
| 298 | |||
| 299 | int iLen = reg.GetFindLen(); | ||
| 300 | // nasty hack #1 - & means \0 in a replace string | ||
| 301 | StringUtils::Replace(strCurOutput, "&","!!!AMPAMP!!!"); | ||
| 302 | std::string result = reg.GetReplaceString(strCurOutput.c_str()); | ||
| 303 | if (!result.empty()) | ||
| 304 | { | ||
| 305 | std::string strResult(result); | ||
| 306 | StringUtils::Replace(strResult, "!!!AMPAMP!!!","&"); | ||
| 307 | Clean(strResult); | ||
| 308 | ReplaceBuffers(strResult); | ||
| 309 | if (iCompare > -1) | ||
| 310 | { | ||
| 311 | std::string strResultNoCase = strResult; | ||
| 312 | StringUtils::ToLower(strResultNoCase); | ||
| 313 | if (strResultNoCase.find(m_param[iCompare-1]) != std::string::npos) | ||
| 314 | dest += strResult; | ||
| 315 | } | ||
| 316 | else | ||
| 317 | dest += strResult; | ||
| 318 | } | ||
| 319 | if (bRepeat && iLen > 0) | ||
| 320 | { | ||
| 321 | curInput.erase(0,i+iLen>(int)curInput.size()?curInput.size():i+iLen); | ||
| 322 | i = reg.RegFind(curInput.c_str()); | ||
| 323 | } | ||
| 324 | else | ||
| 325 | i = -1; | ||
| 326 | } | ||
| 327 | } | ||
| 328 | } | ||
| 329 | |||
| 330 | void CScraperParser::ParseXSLT(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend) | ||
| 331 | { | ||
| 332 | #ifdef HAVE_LIBXSLT | ||
| 333 | TiXmlElement* pSheet = element->FirstChildElement(); | ||
| 334 | if (pSheet) | ||
| 335 | { | ||
| 336 | XSLTUtils xsltUtils; | ||
| 337 | std::string strXslt; | ||
| 338 | strXslt << *pSheet; | ||
| 339 | ReplaceBuffers(strXslt); | ||
| 340 | |||
| 341 | if (!xsltUtils.SetInput(input)) | ||
| 342 | CLog::Log(LOGDEBUG, "could not parse input XML"); | ||
| 343 | |||
| 344 | if (!xsltUtils.SetStylesheet(strXslt)) | ||
| 345 | CLog::Log(LOGDEBUG, "could not parse stylesheet XML"); | ||
| 346 | |||
| 347 | xsltUtils.XSLTTransform(dest); | ||
| 348 | } | ||
| 349 | #endif | ||
| 350 | } | ||
| 351 | |||
| 352 | TiXmlElement *FirstChildScraperElement(TiXmlElement *element) | ||
| 353 | { | ||
| 354 | for (TiXmlElement *child = element->FirstChildElement(); child; child = child->NextSiblingElement()) | ||
| 355 | { | ||
| 356 | #ifdef HAVE_LIBXSLT | ||
| 357 | if (child->ValueStr() == "XSLT") | ||
| 358 | return child; | ||
| 359 | #endif | ||
| 360 | if (child->ValueStr() == "RegExp") | ||
| 361 | return child; | ||
| 362 | } | ||
| 363 | return NULL; | ||
| 364 | } | ||
| 365 | |||
| 366 | TiXmlElement *NextSiblingScraperElement(TiXmlElement *element) | ||
| 367 | { | ||
| 368 | for (TiXmlElement *next = element->NextSiblingElement(); next; next = next->NextSiblingElement()) | ||
| 369 | { | ||
| 370 | #ifdef HAVE_LIBXSLT | ||
| 371 | if (next->ValueStr() == "XSLT") | ||
| 372 | return next; | ||
| 373 | #endif | ||
| 374 | if (next->ValueStr() == "RegExp") | ||
| 375 | return next; | ||
| 376 | } | ||
| 377 | return NULL; | ||
| 378 | } | ||
| 379 | |||
| 380 | void CScraperParser::ParseNext(TiXmlElement* element) | ||
| 381 | { | ||
| 382 | TiXmlElement* pReg = element; | ||
| 383 | while (pReg) | ||
| 384 | { | ||
| 385 | TiXmlElement* pChildReg = FirstChildScraperElement(pReg); | ||
| 386 | if (pChildReg) | ||
| 387 | ParseNext(pChildReg); | ||
| 388 | else | ||
| 389 | { | ||
| 390 | TiXmlElement* pChildReg = pReg->FirstChildElement("clear"); | ||
| 391 | if (pChildReg) | ||
| 392 | ParseNext(pChildReg); | ||
| 393 | } | ||
| 394 | |||
| 395 | int iDest = 1; | ||
| 396 | bool bAppend = false; | ||
| 397 | const char* szDest = pReg->Attribute("dest"); | ||
| 398 | if (szDest && strlen(szDest)) | ||
| 399 | { | ||
| 400 | if (szDest[strlen(szDest)-1] == '+') | ||
| 401 | bAppend = true; | ||
| 402 | |||
| 403 | iDest = atoi(szDest); | ||
| 404 | } | ||
| 405 | |||
| 406 | const char *szInput = pReg->Attribute("input"); | ||
| 407 | std::string strInput; | ||
| 408 | if (szInput) | ||
| 409 | { | ||
| 410 | strInput = szInput; | ||
| 411 | ReplaceBuffers(strInput); | ||
| 412 | } | ||
| 413 | else | ||
| 414 | strInput = m_param[0]; | ||
| 415 | |||
| 416 | const char* szConditional = pReg->Attribute("conditional"); | ||
| 417 | bool bExecute = true; | ||
| 418 | if (szConditional) | ||
| 419 | { | ||
| 420 | bool bInverse=false; | ||
| 421 | if (szConditional[0] == '!') | ||
| 422 | { | ||
| 423 | bInverse = true; | ||
| 424 | szConditional++; | ||
| 425 | } | ||
| 426 | std::string strSetting; | ||
| 427 | if (m_scraper && m_scraper->HasSettings()) | ||
| 428 | strSetting = m_scraper->GetSetting(szConditional); | ||
| 429 | bExecute = bInverse != (strSetting == "true"); | ||
| 430 | } | ||
| 431 | |||
| 432 | if (bExecute) | ||
| 433 | { | ||
| 434 | if (iDest-1 < MAX_SCRAPER_BUFFERS && iDest-1 > -1) | ||
| 435 | { | ||
| 436 | #ifdef HAVE_LIBXSLT | ||
| 437 | if (pReg->ValueStr() == "XSLT") | ||
| 438 | ParseXSLT(strInput, m_param[iDest - 1], pReg, bAppend); | ||
| 439 | else | ||
| 440 | #endif | ||
| 441 | ParseExpression(strInput, m_param[iDest - 1],pReg,bAppend); | ||
| 442 | } | ||
| 443 | else | ||
| 444 | CLog::Log(LOGERROR,"CScraperParser::ParseNext: destination buffer " | ||
| 445 | "out of bounds, skipping expression"); | ||
| 446 | } | ||
| 447 | pReg = NextSiblingScraperElement(pReg); | ||
| 448 | } | ||
| 449 | } | ||
| 450 | |||
| 451 | const std::string CScraperParser::Parse(const std::string& strTag, | ||
| 452 | CScraper* scraper) | ||
| 453 | { | ||
| 454 | TiXmlElement* pChildElement = m_pRootElement->FirstChildElement(strTag.c_str()); | ||
| 455 | if(pChildElement == NULL) | ||
| 456 | { | ||
| 457 | CLog::Log(LOGERROR,"%s: Could not find scraper function %s",__FUNCTION__,strTag.c_str()); | ||
| 458 | return ""; | ||
| 459 | } | ||
| 460 | int iResult = 1; // default to param 1 | ||
| 461 | pChildElement->QueryIntAttribute("dest",&iResult); | ||
| 462 | TiXmlElement* pChildStart = FirstChildScraperElement(pChildElement); | ||
| 463 | m_scraper = scraper; | ||
| 464 | ParseNext(pChildStart); | ||
| 465 | std::string tmp = m_param[iResult-1]; | ||
| 466 | |||
| 467 | const char* szClearBuffers = pChildElement->Attribute("clearbuffers"); | ||
| 468 | if (!szClearBuffers || StringUtils::CompareNoCase(szClearBuffers, "no") != 0) | ||
| 469 | ClearBuffers(); | ||
| 470 | |||
| 471 | return tmp; | ||
| 472 | } | ||
| 473 | |||
| 474 | void CScraperParser::Clean(std::string& strDirty) | ||
| 475 | { | ||
| 476 | size_t i = 0; | ||
| 477 | std::string strBuffer; | ||
| 478 | while ((i = strDirty.find("!!!CLEAN!!!",i)) != std::string::npos) | ||
| 479 | { | ||
| 480 | size_t i2; | ||
| 481 | if ((i2 = strDirty.find("!!!CLEAN!!!",i+11)) != std::string::npos) | ||
| 482 | { | ||
| 483 | strBuffer = strDirty.substr(i+11,i2-i-11); | ||
| 484 | std::string strConverted(strBuffer); | ||
| 485 | HTML::CHTMLUtil::RemoveTags(strConverted); | ||
| 486 | StringUtils::Trim(strConverted); | ||
| 487 | strDirty.replace(i, i2-i+11, strConverted); | ||
| 488 | i += strConverted.size(); | ||
| 489 | } | ||
| 490 | else | ||
| 491 | break; | ||
| 492 | } | ||
| 493 | i=0; | ||
| 494 | while ((i = strDirty.find("!!!TRIM!!!",i)) != std::string::npos) | ||
| 495 | { | ||
| 496 | size_t i2; | ||
| 497 | if ((i2 = strDirty.find("!!!TRIM!!!",i+10)) != std::string::npos) | ||
| 498 | { | ||
| 499 | strBuffer = strDirty.substr(i+10,i2-i-10); | ||
| 500 | StringUtils::Trim(strBuffer); | ||
| 501 | strDirty.replace(i, i2-i+10, strBuffer); | ||
| 502 | i += strBuffer.size(); | ||
| 503 | } | ||
| 504 | else | ||
| 505 | break; | ||
| 506 | } | ||
| 507 | i=0; | ||
| 508 | while ((i = strDirty.find("!!!FIXCHARS!!!",i)) != std::string::npos) | ||
| 509 | { | ||
| 510 | size_t i2; | ||
| 511 | if ((i2 = strDirty.find("!!!FIXCHARS!!!",i+14)) != std::string::npos) | ||
| 512 | { | ||
| 513 | strBuffer = strDirty.substr(i+14,i2-i-14); | ||
| 514 | std::wstring wbuffer; | ||
| 515 | g_charsetConverter.utf8ToW(strBuffer, wbuffer, false, false, false); | ||
| 516 | std::wstring wConverted; | ||
| 517 | HTML::CHTMLUtil::ConvertHTMLToW(wbuffer,wConverted); | ||
| 518 | g_charsetConverter.wToUTF8(wConverted, strBuffer, false); | ||
| 519 | StringUtils::Trim(strBuffer); | ||
| 520 | ConvertJSON(strBuffer); | ||
| 521 | strDirty.replace(i, i2-i+14, strBuffer); | ||
| 522 | i += strBuffer.size(); | ||
| 523 | } | ||
| 524 | else | ||
| 525 | break; | ||
| 526 | } | ||
| 527 | i=0; | ||
| 528 | while ((i=strDirty.find("!!!ENCODE!!!",i)) != std::string::npos) | ||
| 529 | { | ||
| 530 | size_t i2; | ||
| 531 | if ((i2 = strDirty.find("!!!ENCODE!!!",i+12)) != std::string::npos) | ||
| 532 | { | ||
| 533 | strBuffer = CURL::Encode(strDirty.substr(i + 12, i2 - i - 12)); | ||
| 534 | strDirty.replace(i, i2-i+12, strBuffer); | ||
| 535 | i += strBuffer.size(); | ||
| 536 | } | ||
| 537 | else | ||
| 538 | break; | ||
| 539 | } | ||
| 540 | } | ||
| 541 | |||
| 542 | void CScraperParser::ConvertJSON(std::string &string) | ||
| 543 | { | ||
| 544 | CRegExp reg; | ||
| 545 | reg.RegComp("\\\\u([0-f]{4})"); | ||
| 546 | while (reg.RegFind(string.c_str()) > -1) | ||
| 547 | { | ||
| 548 | int pos = reg.GetSubStart(1); | ||
| 549 | std::string szReplace(reg.GetMatch(1)); | ||
| 550 | |||
| 551 | std::string replace = StringUtils::Format("&#x%s;", szReplace.c_str()); | ||
| 552 | string.replace(string.begin()+pos-2, string.begin()+pos+4, replace); | ||
| 553 | } | ||
| 554 | |||
| 555 | CRegExp reg2; | ||
| 556 | reg2.RegComp("\\\\x([0-9]{2})([^\\\\]+;)"); | ||
| 557 | while (reg2.RegFind(string.c_str()) > -1) | ||
| 558 | { | ||
| 559 | int pos1 = reg2.GetSubStart(1); | ||
| 560 | int pos2 = reg2.GetSubStart(2); | ||
| 561 | std::string szHexValue(reg2.GetMatch(1)); | ||
| 562 | |||
| 563 | std::string replace = StringUtils::Format("%li", strtol(szHexValue.c_str(), NULL, 16)); | ||
| 564 | string.replace(string.begin()+pos1-2, string.begin()+pos2+reg2.GetSubLength(2), replace); | ||
| 565 | } | ||
| 566 | |||
| 567 | StringUtils::Replace(string, "\\\"","\""); | ||
| 568 | } | ||
| 569 | |||
| 570 | void CScraperParser::ClearBuffers() | ||
| 571 | { | ||
| 572 | //clear all m_param strings | ||
| 573 | for (std::string& param : m_param) | ||
| 574 | param.clear(); | ||
| 575 | } | ||
| 576 | |||
| 577 | void CScraperParser::GetBufferParams(bool* result, const char* attribute, bool defvalue) | ||
| 578 | { | ||
| 579 | for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf) | ||
| 580 | result[iBuf] = defvalue; | ||
| 581 | if (attribute) | ||
| 582 | { | ||
| 583 | std::vector<std::string> vecBufs; | ||
| 584 | StringUtils::Tokenize(attribute,vecBufs,","); | ||
| 585 | for (size_t nToken=0; nToken < vecBufs.size(); nToken++) | ||
| 586 | { | ||
| 587 | int index = atoi(vecBufs[nToken].c_str())-1; | ||
| 588 | if (index < MAX_SCRAPER_BUFFERS) | ||
| 589 | result[index] = !defvalue; | ||
| 590 | } | ||
| 591 | } | ||
| 592 | } | ||
| 593 | |||
| 594 | void CScraperParser::InsertToken(std::string& strOutput, int buf, const char* token) | ||
| 595 | { | ||
| 596 | char temp[4]; | ||
| 597 | sprintf(temp,"\\%i",buf); | ||
| 598 | size_t i2=0; | ||
| 599 | while ((i2 = strOutput.find(temp,i2)) != std::string::npos) | ||
| 600 | { | ||
| 601 | strOutput.insert(i2,token); | ||
| 602 | i2 += strlen(token) + strlen(temp); | ||
| 603 | strOutput.insert(i2,token); | ||
| 604 | } | ||
| 605 | } | ||
| 606 | |||
| 607 | void CScraperParser::AddDocument(const CXBMCTinyXML* doc) | ||
| 608 | { | ||
| 609 | const TiXmlNode* node = doc->RootElement()->FirstChild(); | ||
| 610 | while (node) | ||
| 611 | { | ||
| 612 | m_pRootElement->InsertEndChild(*node); | ||
| 613 | node = node->NextSibling(); | ||
| 614 | } | ||
| 615 | } | ||
| 616 | |||
