summaryrefslogtreecommitdiffstats
path: root/xbmc/utils/ScraperParser.cpp
diff options
context:
space:
mode:
author manuel <manuel@mausz.at> 2020-10-19 00:52:24 +0200
committer manuel <manuel@mausz.at> 2020-10-19 00:52:24 +0200
commit be933ef2241d79558f91796cc5b3a161f72ebf9c (patch)
tree fe3ab2f130e20c99001f2d7a81d610c78c96a3f4 /xbmc/utils/ScraperParser.cpp
parent 5f8335c1e49ce108ef3481863833c98efa00411b (diff)
downloadkodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.gz
kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.tar.bz2
kodi-pvr-build-be933ef2241d79558f91796cc5b3a161f72ebf9c.zip
sync with upstream
Diffstat (limited to 'xbmc/utils/ScraperParser.cpp')
-rw-r--r--xbmc/utils/ScraperParser.cpp616
1 files changed, 616 insertions, 0 deletions
diff --git a/xbmc/utils/ScraperParser.cpp b/xbmc/utils/ScraperParser.cpp
new file mode 100644
index 0000000..81fcf37
--- /dev/null
+++ b/xbmc/utils/ScraperParser.cpp
@@ -0,0 +1,616 @@
1/*
2 * Copyright (C) 2012-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
4 *
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
7 */
8
9#include "ScraperParser.h"
10
11#include "addons/AddonManager.h"
12#include "guilib/LocalizeStrings.h"
13#include "RegExp.h"
14#include "HTMLUtil.h"
15#include "addons/Scraper.h"
16#include "URL.h"
17#include "utils/StringUtils.h"
18#include "log.h"
19#include "CharsetConverter.h"
20#ifdef HAVE_LIBXSLT
21#include "utils/XSLTUtils.h"
22#endif
23#include "utils/XMLUtils.h"
24#include <sstream>
25#include <cstring>
26
27using namespace ADDON;
28using namespace XFILE;
29
30CScraperParser::CScraperParser()
31{
32 m_pRootElement = NULL;
33 m_document = NULL;
34 m_SearchStringEncoding = "UTF-8";
35 m_scraper = NULL;
36 m_isNoop = true;
37}
38
39CScraperParser::CScraperParser(const CScraperParser& parser)
40{
41 m_pRootElement = NULL;
42 m_document = NULL;
43 m_SearchStringEncoding = "UTF-8";
44 m_scraper = NULL;
45 m_isNoop = true;
46 *this = parser;
47}
48
49CScraperParser &CScraperParser::operator=(const CScraperParser &parser)
50{
51 if (this != &parser)
52 {
53 Clear();
54 if (parser.m_document)
55 {
56 m_scraper = parser.m_scraper;
57 m_document = new CXBMCTinyXML(*parser.m_document);
58 LoadFromXML();
59 }
60 else
61 m_scraper = NULL;
62 }
63 return *this;
64}
65
66CScraperParser::~CScraperParser()
67{
68 Clear();
69}
70
71void CScraperParser::Clear()
72{
73 m_pRootElement = NULL;
74 delete m_document;
75
76 m_document = NULL;
77 m_strFile.clear();
78}
79
80bool CScraperParser::Load(const std::string& strXMLFile)
81{
82 Clear();
83
84 m_document = new CXBMCTinyXML();
85
86 if (!m_document)
87 return false;
88
89 m_strFile = strXMLFile;
90
91 if (m_document->LoadFile(strXMLFile))
92 return LoadFromXML();
93
94 delete m_document;
95 m_document = NULL;
96 return false;
97}
98
99bool CScraperParser::LoadFromXML()
100{
101 if (!m_document)
102 return false;
103
104 m_pRootElement = m_document->RootElement();
105 std::string strValue = m_pRootElement->ValueStr();
106 if (strValue == "scraper")
107 {
108 TiXmlElement* pChildElement = m_pRootElement->FirstChildElement("CreateSearchUrl");
109 if (pChildElement)
110 {
111 m_isNoop = false;
112 if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
113 m_SearchStringEncoding = "UTF-8";
114 }
115
116 pChildElement = m_pRootElement->FirstChildElement("CreateArtistSearchUrl");
117 if (pChildElement)
118 {
119 m_isNoop = false;
120 if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
121 m_SearchStringEncoding = "UTF-8";
122 }
123 pChildElement = m_pRootElement->FirstChildElement("CreateAlbumSearchUrl");
124 if (pChildElement)
125 {
126 m_isNoop = false;
127 if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
128 m_SearchStringEncoding = "UTF-8";
129 }
130
131 return true;
132 }
133
134 delete m_document;
135 m_document = NULL;
136 m_pRootElement = NULL;
137 return false;
138}
139
140void CScraperParser::ReplaceBuffers(std::string& strDest)
141{
142 // insert buffers
143 size_t iIndex;
144 for (int i=MAX_SCRAPER_BUFFERS-1; i>=0; i--)
145 {
146 iIndex = 0;
147 std::string temp = StringUtils::Format("$$%i",i+1);
148 while ((iIndex = strDest.find(temp,iIndex)) != std::string::npos)
149 {
150 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+temp.size(),m_param[i]);
151 iIndex += m_param[i].length();
152 }
153 }
154 // insert settings
155 iIndex = 0;
156 while ((iIndex = strDest.find("$INFO[", iIndex)) != std::string::npos)
157 {
158 size_t iEnd = strDest.find("]", iIndex);
159 std::string strInfo = strDest.substr(iIndex+6, iEnd - iIndex - 6);
160 std::string strReplace;
161 if (m_scraper)
162 strReplace = m_scraper->GetSetting(strInfo);
163 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
164 iIndex += strReplace.length();
165 }
166 // insert localize strings
167 iIndex = 0;
168 while ((iIndex = strDest.find("$LOCALIZE[", iIndex)) != std::string::npos)
169 {
170 size_t iEnd = strDest.find("]", iIndex);
171 std::string strInfo = strDest.substr(iIndex+10, iEnd - iIndex - 10);
172 std::string strReplace;
173 if (m_scraper)
174 strReplace = g_localizeStrings.GetAddonString(m_scraper->ID(), strtol(strInfo.c_str(),NULL,10));
175 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
176 iIndex += strReplace.length();
177 }
178 iIndex = 0;
179 while ((iIndex = strDest.find("\\n",iIndex)) != std::string::npos)
180 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+2,"\n");
181}
182
183void CScraperParser::ParseExpression(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend)
184{
185 std::string strOutput = XMLUtils::GetAttribute(element, "output");
186
187 TiXmlElement* pExpression = element->FirstChildElement("expression");
188 if (pExpression)
189 {
190 bool bInsensitive=true;
191 const char* sensitive = pExpression->Attribute("cs");
192 if (sensitive)
193 if (StringUtils::CompareNoCase(sensitive, "yes") == 0)
194 bInsensitive=false; // match case sensitive
195
196 CRegExp::utf8Mode eUtf8 = CRegExp::autoUtf8;
197 const char* const strUtf8 = pExpression->Attribute("utf8");
198 if (strUtf8)
199 {
200 if (StringUtils::CompareNoCase(strUtf8, "yes") == 0)
201 eUtf8 = CRegExp::forceUtf8;
202 else if (StringUtils::CompareNoCase(strUtf8, "no") == 0)
203 eUtf8 = CRegExp::asciiOnly;
204 else if (StringUtils::CompareNoCase(strUtf8, "auto") == 0)
205 eUtf8 = CRegExp::autoUtf8;
206 }
207
208 CRegExp reg(bInsensitive, eUtf8);
209 std::string strExpression;
210 if (pExpression->FirstChild())
211 strExpression = pExpression->FirstChild()->Value();
212 else
213 strExpression = "(.*)";
214 ReplaceBuffers(strExpression);
215 ReplaceBuffers(strOutput);
216
217 if (!reg.RegComp(strExpression.c_str()))
218 {
219 return;
220 }
221
222 bool bRepeat = false;
223 const char* szRepeat = pExpression->Attribute("repeat");
224 if (szRepeat)
225 if (StringUtils::CompareNoCase(szRepeat, "yes") == 0)
226 bRepeat = true;
227
228 const char* szClear = pExpression->Attribute("clear");
229 if (szClear)
230 if (StringUtils::CompareNoCase(szClear, "yes") == 0)
231 dest=""; // clear no matter if regexp fails
232
233 bool bClean[MAX_SCRAPER_BUFFERS];
234 GetBufferParams(bClean,pExpression->Attribute("noclean"),true);
235
236 bool bTrim[MAX_SCRAPER_BUFFERS];
237 GetBufferParams(bTrim,pExpression->Attribute("trim"),false);
238
239 bool bFixChars[MAX_SCRAPER_BUFFERS];
240 GetBufferParams(bFixChars,pExpression->Attribute("fixchars"),false);
241
242 bool bEncode[MAX_SCRAPER_BUFFERS];
243 GetBufferParams(bEncode,pExpression->Attribute("encode"),false);
244
245 int iOptional = -1;
246 pExpression->QueryIntAttribute("optional",&iOptional);
247
248 int iCompare = -1;
249 pExpression->QueryIntAttribute("compare",&iCompare);
250 if (iCompare > -1)
251 StringUtils::ToLower(m_param[iCompare-1]);
252 std::string curInput = input;
253 for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
254 {
255 if (bClean[iBuf])
256 InsertToken(strOutput,iBuf+1,"!!!CLEAN!!!");
257 if (bTrim[iBuf])
258 InsertToken(strOutput,iBuf+1,"!!!TRIM!!!");
259 if (bFixChars[iBuf])
260 InsertToken(strOutput,iBuf+1,"!!!FIXCHARS!!!");
261 if (bEncode[iBuf])
262 InsertToken(strOutput,iBuf+1,"!!!ENCODE!!!");
263 }
264 int i = reg.RegFind(curInput.c_str());
265 while (i > -1 && (i < (int)curInput.size() || curInput.empty()))
266 {
267 if (!bAppend)
268 {
269 dest = "";
270 bAppend = true;
271 }
272 std::string strCurOutput=strOutput;
273
274 if (iOptional > -1) // check that required param is there
275 {
276 char temp[12];
277 sprintf(temp,"\\%i",iOptional);
278 std::string szParam = reg.GetReplaceString(temp);
279 CRegExp reg2;
280 reg2.RegComp("(.*)(\\\\\\(.*\\\\2.*)\\\\\\)(.*)");
281 int i2=reg2.RegFind(strCurOutput.c_str());
282 while (i2 > -1)
283 {
284 std::string szRemove(reg2.GetMatch(2));
285 int iRemove = szRemove.size();
286 int i3 = strCurOutput.find(szRemove);
287 if (!szParam.empty())
288 {
289 strCurOutput.erase(i3+iRemove,2);
290 strCurOutput.erase(i3,2);
291 }
292 else
293 strCurOutput.replace(strCurOutput.begin()+i3,strCurOutput.begin()+i3+iRemove+2,"");
294
295 i2 = reg2.RegFind(strCurOutput.c_str());
296 }
297 }
298
299 int iLen = reg.GetFindLen();
300 // nasty hack #1 - & means \0 in a replace string
301 StringUtils::Replace(strCurOutput, "&","!!!AMPAMP!!!");
302 std::string result = reg.GetReplaceString(strCurOutput.c_str());
303 if (!result.empty())
304 {
305 std::string strResult(result);
306 StringUtils::Replace(strResult, "!!!AMPAMP!!!","&");
307 Clean(strResult);
308 ReplaceBuffers(strResult);
309 if (iCompare > -1)
310 {
311 std::string strResultNoCase = strResult;
312 StringUtils::ToLower(strResultNoCase);
313 if (strResultNoCase.find(m_param[iCompare-1]) != std::string::npos)
314 dest += strResult;
315 }
316 else
317 dest += strResult;
318 }
319 if (bRepeat && iLen > 0)
320 {
321 curInput.erase(0,i+iLen>(int)curInput.size()?curInput.size():i+iLen);
322 i = reg.RegFind(curInput.c_str());
323 }
324 else
325 i = -1;
326 }
327 }
328}
329
330void CScraperParser::ParseXSLT(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend)
331{
332#ifdef HAVE_LIBXSLT
333 TiXmlElement* pSheet = element->FirstChildElement();
334 if (pSheet)
335 {
336 XSLTUtils xsltUtils;
337 std::string strXslt;
338 strXslt << *pSheet;
339 ReplaceBuffers(strXslt);
340
341 if (!xsltUtils.SetInput(input))
342 CLog::Log(LOGDEBUG, "could not parse input XML");
343
344 if (!xsltUtils.SetStylesheet(strXslt))
345 CLog::Log(LOGDEBUG, "could not parse stylesheet XML");
346
347 xsltUtils.XSLTTransform(dest);
348 }
349#endif
350}
351
352TiXmlElement *FirstChildScraperElement(TiXmlElement *element)
353{
354 for (TiXmlElement *child = element->FirstChildElement(); child; child = child->NextSiblingElement())
355 {
356#ifdef HAVE_LIBXSLT
357 if (child->ValueStr() == "XSLT")
358 return child;
359#endif
360 if (child->ValueStr() == "RegExp")
361 return child;
362 }
363 return NULL;
364}
365
366TiXmlElement *NextSiblingScraperElement(TiXmlElement *element)
367{
368 for (TiXmlElement *next = element->NextSiblingElement(); next; next = next->NextSiblingElement())
369 {
370#ifdef HAVE_LIBXSLT
371 if (next->ValueStr() == "XSLT")
372 return next;
373#endif
374 if (next->ValueStr() == "RegExp")
375 return next;
376 }
377 return NULL;
378}
379
380void CScraperParser::ParseNext(TiXmlElement* element)
381{
382 TiXmlElement* pReg = element;
383 while (pReg)
384 {
385 TiXmlElement* pChildReg = FirstChildScraperElement(pReg);
386 if (pChildReg)
387 ParseNext(pChildReg);
388 else
389 {
390 TiXmlElement* pChildReg = pReg->FirstChildElement("clear");
391 if (pChildReg)
392 ParseNext(pChildReg);
393 }
394
395 int iDest = 1;
396 bool bAppend = false;
397 const char* szDest = pReg->Attribute("dest");
398 if (szDest && strlen(szDest))
399 {
400 if (szDest[strlen(szDest)-1] == '+')
401 bAppend = true;
402
403 iDest = atoi(szDest);
404 }
405
406 const char *szInput = pReg->Attribute("input");
407 std::string strInput;
408 if (szInput)
409 {
410 strInput = szInput;
411 ReplaceBuffers(strInput);
412 }
413 else
414 strInput = m_param[0];
415
416 const char* szConditional = pReg->Attribute("conditional");
417 bool bExecute = true;
418 if (szConditional)
419 {
420 bool bInverse=false;
421 if (szConditional[0] == '!')
422 {
423 bInverse = true;
424 szConditional++;
425 }
426 std::string strSetting;
427 if (m_scraper && m_scraper->HasSettings())
428 strSetting = m_scraper->GetSetting(szConditional);
429 bExecute = bInverse != (strSetting == "true");
430 }
431
432 if (bExecute)
433 {
434 if (iDest-1 < MAX_SCRAPER_BUFFERS && iDest-1 > -1)
435 {
436#ifdef HAVE_LIBXSLT
437 if (pReg->ValueStr() == "XSLT")
438 ParseXSLT(strInput, m_param[iDest - 1], pReg, bAppend);
439 else
440#endif
441 ParseExpression(strInput, m_param[iDest - 1],pReg,bAppend);
442 }
443 else
444 CLog::Log(LOGERROR,"CScraperParser::ParseNext: destination buffer "
445 "out of bounds, skipping expression");
446 }
447 pReg = NextSiblingScraperElement(pReg);
448 }
449}
450
451const std::string CScraperParser::Parse(const std::string& strTag,
452 CScraper* scraper)
453{
454 TiXmlElement* pChildElement = m_pRootElement->FirstChildElement(strTag.c_str());
455 if(pChildElement == NULL)
456 {
457 CLog::Log(LOGERROR,"%s: Could not find scraper function %s",__FUNCTION__,strTag.c_str());
458 return "";
459 }
460 int iResult = 1; // default to param 1
461 pChildElement->QueryIntAttribute("dest",&iResult);
462 TiXmlElement* pChildStart = FirstChildScraperElement(pChildElement);
463 m_scraper = scraper;
464 ParseNext(pChildStart);
465 std::string tmp = m_param[iResult-1];
466
467 const char* szClearBuffers = pChildElement->Attribute("clearbuffers");
468 if (!szClearBuffers || StringUtils::CompareNoCase(szClearBuffers, "no") != 0)
469 ClearBuffers();
470
471 return tmp;
472}
473
474void CScraperParser::Clean(std::string& strDirty)
475{
476 size_t i = 0;
477 std::string strBuffer;
478 while ((i = strDirty.find("!!!CLEAN!!!",i)) != std::string::npos)
479 {
480 size_t i2;
481 if ((i2 = strDirty.find("!!!CLEAN!!!",i+11)) != std::string::npos)
482 {
483 strBuffer = strDirty.substr(i+11,i2-i-11);
484 std::string strConverted(strBuffer);
485 HTML::CHTMLUtil::RemoveTags(strConverted);
486 StringUtils::Trim(strConverted);
487 strDirty.replace(i, i2-i+11, strConverted);
488 i += strConverted.size();
489 }
490 else
491 break;
492 }
493 i=0;
494 while ((i = strDirty.find("!!!TRIM!!!",i)) != std::string::npos)
495 {
496 size_t i2;
497 if ((i2 = strDirty.find("!!!TRIM!!!",i+10)) != std::string::npos)
498 {
499 strBuffer = strDirty.substr(i+10,i2-i-10);
500 StringUtils::Trim(strBuffer);
501 strDirty.replace(i, i2-i+10, strBuffer);
502 i += strBuffer.size();
503 }
504 else
505 break;
506 }
507 i=0;
508 while ((i = strDirty.find("!!!FIXCHARS!!!",i)) != std::string::npos)
509 {
510 size_t i2;
511 if ((i2 = strDirty.find("!!!FIXCHARS!!!",i+14)) != std::string::npos)
512 {
513 strBuffer = strDirty.substr(i+14,i2-i-14);
514 std::wstring wbuffer;
515 g_charsetConverter.utf8ToW(strBuffer, wbuffer, false, false, false);
516 std::wstring wConverted;
517 HTML::CHTMLUtil::ConvertHTMLToW(wbuffer,wConverted);
518 g_charsetConverter.wToUTF8(wConverted, strBuffer, false);
519 StringUtils::Trim(strBuffer);
520 ConvertJSON(strBuffer);
521 strDirty.replace(i, i2-i+14, strBuffer);
522 i += strBuffer.size();
523 }
524 else
525 break;
526 }
527 i=0;
528 while ((i=strDirty.find("!!!ENCODE!!!",i)) != std::string::npos)
529 {
530 size_t i2;
531 if ((i2 = strDirty.find("!!!ENCODE!!!",i+12)) != std::string::npos)
532 {
533 strBuffer = CURL::Encode(strDirty.substr(i + 12, i2 - i - 12));
534 strDirty.replace(i, i2-i+12, strBuffer);
535 i += strBuffer.size();
536 }
537 else
538 break;
539 }
540}
541
542void CScraperParser::ConvertJSON(std::string &string)
543{
544 CRegExp reg;
545 reg.RegComp("\\\\u([0-f]{4})");
546 while (reg.RegFind(string.c_str()) > -1)
547 {
548 int pos = reg.GetSubStart(1);
549 std::string szReplace(reg.GetMatch(1));
550
551 std::string replace = StringUtils::Format("&#x%s;", szReplace.c_str());
552 string.replace(string.begin()+pos-2, string.begin()+pos+4, replace);
553 }
554
555 CRegExp reg2;
556 reg2.RegComp("\\\\x([0-9]{2})([^\\\\]+;)");
557 while (reg2.RegFind(string.c_str()) > -1)
558 {
559 int pos1 = reg2.GetSubStart(1);
560 int pos2 = reg2.GetSubStart(2);
561 std::string szHexValue(reg2.GetMatch(1));
562
563 std::string replace = StringUtils::Format("%li", strtol(szHexValue.c_str(), NULL, 16));
564 string.replace(string.begin()+pos1-2, string.begin()+pos2+reg2.GetSubLength(2), replace);
565 }
566
567 StringUtils::Replace(string, "\\\"","\"");
568}
569
570void CScraperParser::ClearBuffers()
571{
572 //clear all m_param strings
573 for (std::string& param : m_param)
574 param.clear();
575}
576
577void CScraperParser::GetBufferParams(bool* result, const char* attribute, bool defvalue)
578{
579 for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
580 result[iBuf] = defvalue;
581 if (attribute)
582 {
583 std::vector<std::string> vecBufs;
584 StringUtils::Tokenize(attribute,vecBufs,",");
585 for (size_t nToken=0; nToken < vecBufs.size(); nToken++)
586 {
587 int index = atoi(vecBufs[nToken].c_str())-1;
588 if (index < MAX_SCRAPER_BUFFERS)
589 result[index] = !defvalue;
590 }
591 }
592}
593
594void CScraperParser::InsertToken(std::string& strOutput, int buf, const char* token)
595{
596 char temp[4];
597 sprintf(temp,"\\%i",buf);
598 size_t i2=0;
599 while ((i2 = strOutput.find(temp,i2)) != std::string::npos)
600 {
601 strOutput.insert(i2,token);
602 i2 += strlen(token) + strlen(temp);
603 strOutput.insert(i2,token);
604 }
605}
606
607void CScraperParser::AddDocument(const CXBMCTinyXML* doc)
608{
609 const TiXmlNode* node = doc->RootElement()->FirstChild();
610 while (node)
611 {
612 m_pRootElement->InsertEndChild(*node);
613 node = node->NextSibling();
614 }
615}
616