Diffstat (limited to 'xbmc/addons/Scraper.cpp')
-rw-r--r--  xbmc/addons/Scraper.cpp  1033
1 files changed, 1033 insertions, 0 deletions
diff --git a/xbmc/addons/Scraper.cpp b/xbmc/addons/Scraper.cpp
new file mode 100644
index 0000000..06f34f2
--- /dev/null
+++ b/xbmc/addons/Scraper.cpp
@@ -0,0 +1,1033 @@
1/*
2* Copyright (C) 2005-2013 Team XBMC
3* http://xbmc.org
4*
5* This Program is free software; you can redistribute it and/or modify
6* it under the terms of the GNU General Public License as published by
7* the Free Software Foundation; either version 2, or (at your option)
8* any later version.
9*
10* This Program is distributed in the hope that it will be useful,
11* but WITHOUT ANY WARRANTY; without even the implied warranty of
12* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13* GNU General Public License for more details.
14*
15* You should have received a copy of the GNU General Public License
16* along with XBMC; see the file COPYING. If not, see
17* <http://www.gnu.org/licenses/>.
18*
19*/
20#include "Scraper.h"
21#include "filesystem/File.h"
22#include "filesystem/Directory.h"
23#include "filesystem/CurlFile.h"
24#include "AddonManager.h"
25#include "utils/ScraperParser.h"
26#include "utils/ScraperUrl.h"
27#include "utils/CharsetConverter.h"
28#include "utils/log.h"
29#include "music/infoscanner/MusicAlbumInfo.h"
30#include "music/infoscanner/MusicArtistInfo.h"
31#include "utils/fstrcmp.h"
32#include "settings/AdvancedSettings.h"
33#include "FileItem.h"
34#include "utils/URIUtils.h"
35#include "utils/XMLUtils.h"
36#include "utils/StringUtils.h"
37#include "music/MusicDatabase.h"
38#include "video/VideoDatabase.h"
39#include "music/Album.h"
40#include "music/Artist.h"
41#include "Util.h"
42#include "URL.h"
43
44#include <sstream>
45#include <algorithm>
46
47using namespace std;
48using namespace XFILE;
49using namespace MUSIC_GRABBER;
50using namespace VIDEO;
51
52namespace ADDON
53{
54
55typedef struct
56{
57 const char* name;
58 CONTENT_TYPE type;
59 int pretty;
60} ContentMapping;
61
62static const ContentMapping content[] =
63 {{"unknown", CONTENT_NONE, 231 },
64 {"albums", CONTENT_ALBUMS, 132 },
65 {"music", CONTENT_ALBUMS, 132 },
66 {"artists", CONTENT_ARTISTS, 133 },
67 {"movies", CONTENT_MOVIES, 20342 },
68 {"tvshows", CONTENT_TVSHOWS, 20343 },
69 {"musicvideos", CONTENT_MUSICVIDEOS, 20389 }};
70
71std::string TranslateContent(const CONTENT_TYPE &type, bool pretty/*=false*/)
72{
73 for (unsigned int index=0; index < ARRAY_SIZE(content); ++index)
74 {
75 const ContentMapping &map = content[index];
76 if (type == map.type)
77 {
78 if (pretty && map.pretty)
79 return g_localizeStrings.Get(map.pretty);
80 else
81 return map.name;
82 }
83 }
84 return "";
85}
86
87CONTENT_TYPE TranslateContent(const std::string &string)
88{
89 for (unsigned int index=0; index < ARRAY_SIZE(content); ++index)
90 {
91 const ContentMapping &map = content[index];
92 if (string == map.name)
93 return map.type;
94 }
95 return CONTENT_NONE;
96}
97
98TYPE ScraperTypeFromContent(const CONTENT_TYPE &content)
99{
100 switch (content)
101 {
102 case CONTENT_ALBUMS:
103 return ADDON_SCRAPER_ALBUMS;
104 case CONTENT_ARTISTS:
105 return ADDON_SCRAPER_ARTISTS;
106 case CONTENT_MOVIES:
107 return ADDON_SCRAPER_MOVIES;
108 case CONTENT_MUSICVIDEOS:
109 return ADDON_SCRAPER_MUSICVIDEOS;
110 case CONTENT_TVSHOWS:
111 return ADDON_SCRAPER_TVSHOWS;
112 default:
113 return ADDON_UNKNOWN;
114 }
115}
116
117// if the XML root is <error>, throw CScraperError with enclosed <title>/<message> values
118static void CheckScraperError(const TiXmlElement *pxeRoot)
119{
120 if (!pxeRoot || stricmp(pxeRoot->Value(), "error"))
121 return;
122 std::string sTitle;
123 std::string sMessage;
124 XMLUtils::GetString(pxeRoot, "title", sTitle);
125 XMLUtils::GetString(pxeRoot, "message", sMessage);
126 throw CScraperError(sTitle, sMessage);
127}
128
129CScraper::CScraper(const cp_extension_t *ext) : CAddon(ext), m_fLoaded(false)
130{
131 if (ext)
132 {
133 m_language = CAddonMgr::Get().GetExtValue(ext->configuration, "@language");
134 m_requiressettings = CAddonMgr::Get().GetExtValue(ext->configuration,"@requiressettings") == "true";
135 std::string persistence = CAddonMgr::Get().GetExtValue(ext->configuration, "@cachepersistence");
136 if (!persistence.empty())
137 m_persistence.SetFromTimeString(persistence);
138 }
139 switch (Type())
140 {
141 case ADDON_SCRAPER_ALBUMS:
142 m_pathContent = CONTENT_ALBUMS;
143 break;
144 case ADDON_SCRAPER_ARTISTS:
145 m_pathContent = CONTENT_ARTISTS;
146 break;
147 case ADDON_SCRAPER_MOVIES:
148 m_pathContent = CONTENT_MOVIES;
149 break;
150 case ADDON_SCRAPER_MUSICVIDEOS:
151 m_pathContent = CONTENT_MUSICVIDEOS;
152 break;
153 case ADDON_SCRAPER_TVSHOWS:
154 m_pathContent = CONTENT_TVSHOWS;
155 break;
156 default:
157 m_pathContent = CONTENT_NONE;
158 break;
159 }
160}
161
162AddonPtr CScraper::Clone() const
163{
164 return AddonPtr(new CScraper(*this));
165}
166
167CScraper::CScraper(const CScraper &rhs)
168 : CAddon(rhs), m_fLoaded(false),
169 m_language(rhs.m_language),
170 m_requiressettings(rhs.m_requiressettings),
171 m_persistence(rhs.m_persistence),
172 m_pathContent(rhs.m_pathContent)
173{
174}
175
176bool CScraper::Supports(const CONTENT_TYPE &content) const
177{
178 return Type() == ScraperTypeFromContent(content);
179}
180
181bool CScraper::SetPathSettings(CONTENT_TYPE content, const std::string& xml)
182{
183 m_pathContent = content;
184 if (!LoadSettings())
185 return false;
186
187 if (xml.empty())
188 return true;
189
190 CXBMCTinyXML doc;
191 doc.Parse(xml);
192 m_userSettingsLoaded = SettingsFromXML(doc);
193
194 return m_userSettingsLoaded;
195}
196
197std::string CScraper::GetPathSettings()
198{
199 if (!LoadSettings())
200 return "";
201
202 stringstream stream;
203 CXBMCTinyXML doc;
204 SettingsToXML(doc);
205 if (doc.RootElement())
206 stream << *doc.RootElement();
207
208 return stream.str();
209}
210
211void CScraper::ClearCache()
212{
213 std::string strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath, "scrapers");
214
215 // create scraper cache dir if needed
216 if (!CDirectory::Exists(strCachePath))
217 CDirectory::Create(strCachePath);
218
219 strCachePath = URIUtils::AddFileToFolder(strCachePath, ID());
220 URIUtils::AddSlashAtEnd(strCachePath);
221
222 if (CDirectory::Exists(strCachePath))
223 {
224 CFileItemList items;
225 CDirectory::GetDirectory(strCachePath,items);
226 for (int i=0;i<items.Size();++i)
227 {
228 // wipe cache
229 if (items[i]->m_dateTime + m_persistence <= CDateTime::GetCurrentDateTime())
230 CFile::Delete(items[i]->GetPath());
231 }
232 }
233 else
234 CDirectory::Create(strCachePath);
235}
236
237// returns a vector of strings: the first is the XML output by the function; the rest
238// is XML output by chained functions, possibly recursively
239// the CCurlFile object is passed in so that URL fetches can be canceled from other threads
240// throws CScraperError on abort or on internal failures (e.g., parse errors)
241vector<string> CScraper::Run(const std::string& function,
242 const CScraperUrl& scrURL,
243 CCurlFile& http,
244 const vector<string>* extras)
245{
246 if (!Load())
247 throw CScraperError();
248
249 std::string strXML = InternalRun(function,scrURL,http,extras);
250 if (strXML.empty())
251 {
252 if (function != "NfoUrl" && function != "ResolveIDToUrl")
253 CLog::Log(LOGERROR, "%s: Unable to parse web site",__FUNCTION__);
254 throw CScraperError();
255 }
256
257 CLog::Log(LOGDEBUG,"scraper: %s returned %s",function.c_str(),strXML.c_str());
258
259 CXBMCTinyXML doc;
260 /* all data was converted to UTF-8 before being processed by scraper */
261 doc.Parse(strXML, TIXML_ENCODING_UTF8);
262 if (!doc.RootElement())
263 {
264 CLog::Log(LOGERROR, "%s: Unable to parse XML",__FUNCTION__);
265 throw CScraperError();
266 }
267
268 vector<string> result;
269 result.push_back(strXML);
270 TiXmlElement* xchain = doc.RootElement()->FirstChildElement();
271 // skip children of the root element until <url> or <chain>
272 while (xchain && strcmp(xchain->Value(),"url") && strcmp(xchain->Value(),"chain"))
273 xchain = xchain->NextSiblingElement();
274 while (xchain)
275 {
276 // <chain|url function="...">param</>
277 const char* szFunction = xchain->Attribute("function");
278 if (szFunction)
279 {
280 CScraperUrl scrURL2;
281 vector<string> extras;
282 // for <chain>, pass the contained text as a parameter; for <url>, as URL content
283 if (strcmp(xchain->Value(),"chain")==0)
284 {
285 if (xchain->FirstChild())
286 extras.push_back(xchain->FirstChild()->Value());
287 }
288 else
289 scrURL2.ParseElement(xchain);
290 // Fix for empty chains: $$1 would still contain the
291 // previous value, since the XML node has no child.
292 // Because $$1 always holds either the data fetched from a URL
293 // or the parameters passed to a chain, it is safe to clear it
294 // here to fix this issue.
295 m_parser.m_param[0].clear();
296 vector<string> result2 = RunNoThrow(szFunction,scrURL2,http,&extras);
297 result.insert(result.end(),result2.begin(),result2.end());
298 }
299 xchain = xchain->NextSiblingElement();
300 // continue to skip past non-<url> or <chain> elements
301 while (xchain && strcmp(xchain->Value(),"url") && strcmp(xchain->Value(),"chain"))
302 xchain = xchain->NextSiblingElement();
303 }
304
305 return result;
306}
307
308// just like Run, but returns an empty list instead of throwing in case of error
309// don't use in new code; errors should be handled appropriately
310vector<string> CScraper::RunNoThrow(const std::string& function,
311 const CScraperUrl& url,
312 XFILE::CCurlFile& http,
313 const vector<string>* extras)
314{
315 vector<string> vcs;
316 try
317 {
318 vcs = Run(function, url, http, extras);
319 }
320 catch (const CScraperError &sce)
321 {
322 assert(sce.FAborted()); // the only kind we should get
323 }
324 return vcs;
325}
326
327std::string CScraper::InternalRun(const std::string& function,
328 const CScraperUrl& scrURL,
329 CCurlFile& http,
330 const vector<string>* extras)
331{
332 // walk the list of input URLs and fetch each into parser parameters
333 unsigned int i;
334 for (i=0;i<scrURL.m_url.size();++i)
335 {
336 if (!CScraperUrl::Get(scrURL.m_url[i],m_parser.m_param[i],http,ID()) || m_parser.m_param[i].size() == 0)
337 return "";
338 }
339 // put the 'extra' parameters into the parser parameter list too
340 if (extras)
341 {
342 for (unsigned int j=0;j<extras->size();++j)
343 m_parser.m_param[j+i] = (*extras)[j];
344 }
345
346 return m_parser.Parse(function,this);
347}
348
349bool CScraper::Load()
350{
351 if (m_fLoaded)
352 return true;
353
354 bool result=m_parser.Load(LibPath());
355 if (result)
356 {
357 // TODO: this routine assumes that deps are a single level, and assumes the dep is installed.
358 // 1. Does it make sense to have recursive dependencies?
359 // 2. Should we be checking the dep versions or do we assume it is ok?
360 ADDONDEPS deps = GetDeps();
361 ADDONDEPS::iterator itr = deps.begin();
362 while (itr != deps.end())
363 {
364 if (itr->first == "xbmc.metadata")
365 {
366 ++itr;
367 continue;
368 }
369 AddonPtr dep;
370
371 bool bOptional = itr->second.second;
372
373 if (CAddonMgr::Get().GetAddon((*itr).first, dep))
374 {
375 CXBMCTinyXML doc;
376 if (dep->Type() == ADDON_SCRAPER_LIBRARY && doc.LoadFile(dep->LibPath()))
377 m_parser.AddDocument(&doc);
378 }
379 else
380 {
381 if (!bOptional)
382 {
383 result = false;
384 break;
385 }
386 }
387 ++itr;
388 }
389 }
390
391 if (!result)
392 CLog::Log(LOGWARNING, "failed to load scraper XML from %s", LibPath().c_str());
393 return m_fLoaded = result;
394}
395
396bool CScraper::IsInUse() const
397{
398 if (Supports(CONTENT_ALBUMS) || Supports(CONTENT_ARTISTS))
399 { // music scraper
400 CMusicDatabase db;
401 if (db.Open() && db.ScraperInUse(ID()))
402 return true;
403 }
404 else
405 { // video scraper
406 CVideoDatabase db;
407 if (db.Open() && db.ScraperInUse(ID()))
408 return true;
409 }
410 return false;
411}
412
413bool CScraper::IsNoop()
414{
415 if (!Load())
416 throw CScraperError();
417
418 return m_parser.IsNoop();
419}
420
421// pass in contents of .nfo file; returns URL (possibly empty if none found)
422// and may populate strId, or throws CScraperError on error
423CScraperUrl CScraper::NfoUrl(const std::string &sNfoContent)
424{
425 CScraperUrl scurlRet;
426
427 if (IsNoop())
428 return scurlRet;
429
430 // scraper function takes contents of .nfo file, returns XML (see below)
431 vector<string> vcsIn;
432 vcsIn.push_back(sNfoContent);
433 CScraperUrl scurl;
434 CCurlFile fcurl;
435 vector<string> vcsOut = Run("NfoUrl", scurl, fcurl, &vcsIn);
436 if (vcsOut.empty() || vcsOut[0].empty())
437 return scurlRet;
438 if (vcsOut.size() > 1)
439 CLog::Log(LOGWARNING, "%s: scraper returned multiple results; using first", __FUNCTION__);
440
441 // parse returned XML: either <error> element on error, blank on failure,
442 // or <url>...</url> or <url>...</url><id>...</id> on success
443 for (unsigned int i=0; i < vcsOut.size(); ++i)
444 {
445 CXBMCTinyXML doc;
446 doc.Parse(vcsOut[i], TIXML_ENCODING_UTF8);
447 CheckScraperError(doc.RootElement());
448
449 if (doc.RootElement())
450 {
451 /*
452 NOTE: Scrapers might return invalid XML with some loose
453 elements (e.g. '<url>http://some.url</url><id>123</id>').
454 Since XMLUtils::GetString() assumes well-formed XML
455 with start and end tags, we cannot use it here.
456 Check for the desired elements instead.
457 */
458 TiXmlElement* pxeUrl=NULL;
459 TiXmlElement* pId=NULL;
460 if (!strcmp(doc.RootElement()->Value(),"details"))
461 {
462 pxeUrl = doc.RootElement()->FirstChildElement("url");
463 pId = doc.RootElement()->FirstChildElement("id");
464 }
465 else
466 {
467 pId = doc.FirstChildElement("id");
468 pxeUrl = doc.FirstChildElement("url");
469 }
470 if (pId && pId->FirstChild())
471 scurlRet.strId = pId->FirstChild()->Value();
472
473 if (pxeUrl && pxeUrl->Attribute("function"))
474 continue;
475
476 if (pxeUrl)
477 scurlRet.ParseElement(pxeUrl);
478 else if (!strcmp(doc.RootElement()->Value(), "url"))
479 scurlRet.ParseElement(doc.RootElement());
480 else
481 continue;
482 break;
483 }
484 }
485 return scurlRet;
486}
487
488CScraperUrl CScraper::ResolveIDToUrl(const std::string& externalID)
489{
490 CScraperUrl scurlRet;
491
492 // scraper function takes an external ID, returns XML (see below)
493 vector<string> vcsIn;
494 vcsIn.push_back(externalID);
495 CScraperUrl scurl;
496 CCurlFile fcurl;
497 vector<string> vcsOut = Run("ResolveIDToUrl", scurl, fcurl, &vcsIn);
498 if (vcsOut.empty() || vcsOut[0].empty())
499 return scurlRet;
500 if (vcsOut.size() > 1)
501 CLog::Log(LOGWARNING, "%s: scraper returned multiple results; using first", __FUNCTION__);
502
503 // parse returned XML: either <error> element on error, blank on failure,
504 // or <url>...</url> or <url>...</url><id>...</id> on success
505 for (unsigned int i=0; i < vcsOut.size(); ++i)
506 {
507 CXBMCTinyXML doc;
508 doc.Parse(vcsOut[i], TIXML_ENCODING_UTF8);
509 CheckScraperError(doc.RootElement());
510
511 if (doc.RootElement())
512 {
513 /*
514 NOTE: Scrapers might return invalid XML with some loose
515 elements (e.g. '<url>http://some.url</url><id>123</id>').
516 Since XMLUtils::GetString() assumes well-formed XML
517 with start and end tags, we cannot use it here.
518 Check for the desired elements instead.
519 */
520 TiXmlElement* pxeUrl=NULL;
521 TiXmlElement* pId=NULL;
522 if (!strcmp(doc.RootElement()->Value(),"details"))
523 {
524 pxeUrl = doc.RootElement()->FirstChildElement("url");
525 pId = doc.RootElement()->FirstChildElement("id");
526 }
527 else
528 {
529 pId = doc.FirstChildElement("id");
530 pxeUrl = doc.FirstChildElement("url");
531 }
532 if (pId && pId->FirstChild())
533 scurlRet.strId = pId->FirstChild()->Value();
534
535 if (pxeUrl && pxeUrl->Attribute("function"))
536 continue;
537
538 if (pxeUrl)
539 scurlRet.ParseElement(pxeUrl);
540 else if (!strcmp(doc.RootElement()->Value(), "url"))
541 scurlRet.ParseElement(doc.RootElement());
542 else
543 continue;
544 break;
545 }
546 }
547 return scurlRet;
548}
549
550static bool RelevanceSortFunction(const CScraperUrl &left, const CScraperUrl &right)
551{
552 return left.relevance > right.relevance;
553}
554
555// fetch list of matching movies sorted by relevance (may be empty);
556// throws CScraperError on error; first called with fFirst set, then unset if first try fails
557std::vector<CScraperUrl> CScraper::FindMovie(XFILE::CCurlFile &fcurl, const std::string &sMovie,
558 bool fFirst)
559{
560 // prepare parameters for URL creation
561 std::string sTitle, sTitleYear, sYear;
562 CUtil::CleanString(sMovie, sTitle, sTitleYear, sYear, true/*fRemoveExt*/, fFirst);
563
564 CLog::Log(LOGDEBUG, "%s: Searching for '%s' using %s scraper "
565 "(path: '%s', content: '%s', version: '%s')", __FUNCTION__, sTitle.c_str(),
566 Name().c_str(), Path().c_str(),
567 ADDON::TranslateContent(Content()).c_str(), Version().asString().c_str());
568
569 std::vector<CScraperUrl> vcscurl;
570 if (IsNoop())
571 return vcscurl;
572
573 if (!fFirst)
574 StringUtils::Replace(sTitle, '-',' ');
575
576 vector<string> vcsIn(1);
577 g_charsetConverter.utf8To(SearchStringEncoding(), sTitle, vcsIn[0]);
578 vcsIn[0] = CURL::Encode(vcsIn[0]);
579 if (fFirst && !sYear.empty())
580 vcsIn.push_back(sYear);
581
582 // request a search URL from the title/filename/etc.
583 CScraperUrl scurl;
584 vector<string> vcsOut = Run("CreateSearchUrl", scurl, fcurl, &vcsIn);
585 if (vcsOut.empty())
586 {
587 CLog::Log(LOGDEBUG, "%s: CreateSearchUrl failed", __FUNCTION__);
588 throw CScraperError();
589 }
590 scurl.ParseString(vcsOut[0]);
591
592 // do the search, and parse the result into a list
593 vcsIn.clear();
594 vcsIn.push_back(scurl.m_url[0].m_url);
595 vcsOut = Run("GetSearchResults", scurl, fcurl, &vcsIn);
596
597 bool fSort(true);
598 std::set<std::string> stsDupeCheck;
599 bool fResults(false);
600 for (vector<string>::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
601 {
602 CXBMCTinyXML doc;
603 doc.Parse(*i, TIXML_ENCODING_UTF8);
604 if (!doc.RootElement())
605 {
606 CLog::Log(LOGERROR, "%s: Unable to parse XML", __FUNCTION__);
607 continue; // might have more valid results later
608 }
609
610 CheckScraperError(doc.RootElement());
611
612 TiXmlHandle xhDoc(&doc);
613 TiXmlHandle xhResults = xhDoc.FirstChild("results");
614 if (!xhResults.Element())
615 continue;
616 fResults = true; // even if empty
617
618 // we need to sort if returned results don't specify 'sorted="yes"'
619 if (fSort)
620 {
621 const char *sorted = xhResults.Element()->Attribute("sorted");
622 if (sorted != NULL)
623 fSort = !StringUtils::EqualsNoCase(sorted, "yes");
624 }
625
626 for (TiXmlElement *pxeMovie = xhResults.FirstChild("entity").Element();
627 pxeMovie; pxeMovie = pxeMovie->NextSiblingElement())
628 {
629 CScraperUrl scurlMovie;
630 TiXmlNode *pxnTitle = pxeMovie->FirstChild("title");
631 TiXmlElement *pxeLink = pxeMovie->FirstChildElement("url");
632 if (pxnTitle && pxnTitle->FirstChild() && pxeLink && pxeLink->FirstChild())
633 {
634 scurlMovie.strTitle = pxnTitle->FirstChild()->Value();
635 XMLUtils::GetString(pxeMovie, "id", scurlMovie.strId);
636
637 for ( ; pxeLink && pxeLink->FirstChild(); pxeLink = pxeLink->NextSiblingElement("url"))
638 scurlMovie.ParseElement(pxeLink);
639
640 // calculate the relevance of this hit
641 std::string sCompareTitle = scurlMovie.strTitle;
642 StringUtils::ToLower(sCompareTitle);
643 std::string sMatchTitle = sTitle;
644 StringUtils::ToLower(sMatchTitle);
645
646 /*
647 * Identify the best match by performing a fuzzy string compare on the search term and
648 * the result. Additionally, use the year (if available) to further refine the best match.
649 * An exact match scores 1, a match off by a year scores 0.5 (release dates can vary between
650 * countries), otherwise it scores 0.
651 */
652 std::string sCompareYear;
653 XMLUtils::GetString(pxeMovie, "year", sCompareYear);
654
655 double yearScore = 0;
656 if (!sYear.empty() && !sCompareYear.empty())
657 yearScore = std::max(0.0, 1-0.5*abs(atoi(sYear.c_str())-atoi(sCompareYear.c_str())));
658
659 scurlMovie.relevance = fstrcmp(sMatchTitle.c_str(), sCompareTitle.c_str(), 0.0) + yearScore;
660
661 // reconstruct a title for the user
662 if (!sCompareYear.empty())
663 scurlMovie.strTitle += StringUtils::Format(" (%s)", sCompareYear.c_str());
664
665 std::string sLanguage;
666 if (XMLUtils::GetString(pxeMovie, "language", sLanguage) && !sLanguage.empty())
667 scurlMovie.strTitle += StringUtils::Format(" (%s)", sLanguage.c_str());
668
669 // filter for dupes from naughty scrapers
670 if (stsDupeCheck.insert(scurlMovie.m_url[0].m_url + " " + scurlMovie.strTitle).second)
671 vcscurl.push_back(scurlMovie);
672 }
673 }
674 }
675
676 if (!fResults)
677 throw CScraperError(); // scraper aborted
678
679 if (fSort)
680 std::stable_sort(vcscurl.begin(), vcscurl.end(), RelevanceSortFunction);
681
682 return vcscurl;
683}
684
685// find album by title and artist, using fcurl for web fetches
686// returns a list of albums (empty if no match or failure)
687std::vector<CMusicAlbumInfo> CScraper::FindAlbum(CCurlFile &fcurl, const std::string &sAlbum,
688 const std::string &sArtist)
689{
690 CLog::Log(LOGDEBUG, "%s: Searching for '%s - %s' using %s scraper "
691 "(path: '%s', content: '%s', version: '%s')", __FUNCTION__, sArtist.c_str(),
692 sAlbum.c_str(), Name().c_str(), Path().c_str(),
693 ADDON::TranslateContent(Content()).c_str(), Version().asString().c_str());
694
695 std::vector<CMusicAlbumInfo> vcali;
696 if (IsNoop())
697 return vcali;
698
699 // scraper function is given the album and artist as parameters and
700 // returns an XML <url> element parseable by CScraperUrl
701 std::vector<string> extras(2);
702 g_charsetConverter.utf8To(SearchStringEncoding(), sAlbum, extras[0]);
703 g_charsetConverter.utf8To(SearchStringEncoding(), sArtist, extras[1]);
704 extras[0] = CURL::Encode(extras[0]);
705 extras[1] = CURL::Encode(extras[1]);
706 CScraperUrl scurl;
707 vector<string> vcsOut = RunNoThrow("CreateAlbumSearchUrl", scurl, fcurl, &extras);
708 if (vcsOut.size() > 1)
709 CLog::Log(LOGWARNING, "%s: scraper returned multiple results; using first", __FUNCTION__);
710
711 if (vcsOut.empty() || vcsOut[0].empty())
712 return vcali;
713 scurl.ParseString(vcsOut[0]);
714
715 // the next function is passed the contents of the returned URL, and returns
716 // an empty string on failure; on success, returns XML matches in the form:
717 // <results>
718 // <entity>
719 // <title>...</title>
720 // <url>...</url> (with the usual CScraperUrl decorations like post or spoof)
721 // <artist>...</artist>
722 // <year>...</year>
723 // <relevance [scale="..."]>...</relevance> (scale defaults to 1; score is divided by it)
724 // </entity>
725 // ...
726 // </results>
727 vcsOut = RunNoThrow("GetAlbumSearchResults", scurl, fcurl);
728
729 // parse the returned XML into a vector of album objects
730 for (vector<string>::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
731 {
732 CXBMCTinyXML doc;
733 doc.Parse(*i, TIXML_ENCODING_UTF8);
734 TiXmlHandle xhDoc(&doc);
735
736 for (TiXmlElement* pxeAlbum = xhDoc.FirstChild("results").FirstChild("entity").Element();
737 pxeAlbum; pxeAlbum = pxeAlbum->NextSiblingElement())
738 {
739 std::string sTitle;
740 if (XMLUtils::GetString(pxeAlbum, "title", sTitle) && !sTitle.empty())
741 {
742 std::string sArtist;
743 std::string sAlbumName;
744 if (XMLUtils::GetString(pxeAlbum, "artist", sArtist) && !sArtist.empty())
745 sAlbumName = StringUtils::Format("%s - %s", sArtist.c_str(), sTitle.c_str());
746 else
747 sAlbumName = sTitle;
748
749 std::string sYear;
750 if (XMLUtils::GetString(pxeAlbum, "year", sYear) && !sYear.empty())
751 sAlbumName = StringUtils::Format("%s (%s)", sAlbumName.c_str(), sYear.c_str());
752
753 // if no URL is provided, use the URL we got back from CreateAlbumSearchUrl
754 // (e.g., in case we only got one result back and were sent to the detail page)
755 TiXmlElement* pxeLink = pxeAlbum->FirstChildElement("url");
756 CScraperUrl scurlAlbum;
757 if (!pxeLink)
758 scurlAlbum.ParseString(scurl.m_xml);
759 for ( ; pxeLink && pxeLink->FirstChild(); pxeLink = pxeLink->NextSiblingElement("url"))
760 scurlAlbum.ParseElement(pxeLink);
761
762 if (!scurlAlbum.m_url.size())
763 continue;
764
765 CMusicAlbumInfo ali(sTitle, sArtist, sAlbumName, scurlAlbum);
766
767 TiXmlElement* pxeRel = pxeAlbum->FirstChildElement("relevance");
768 if (pxeRel && pxeRel->FirstChild())
769 {
770 const char* szScale = pxeRel->Attribute("scale");
771 float flScale = szScale ? float(atof(szScale)) : 1;
772 ali.SetRelevance(float(atof(pxeRel->FirstChild()->Value())) / flScale);
773 }
774
775 vcali.push_back(ali);
776 }
777 }
778 }
779 return vcali;
780}
781
782// find artist, using fcurl for web fetches
783// returns a list of artists (empty if no match or failure)
784std::vector<CMusicArtistInfo> CScraper::FindArtist(CCurlFile &fcurl,
785 const std::string &sArtist)
786{
787 CLog::Log(LOGDEBUG, "%s: Searching for '%s' using %s scraper "
788 "(file: '%s', content: '%s', version: '%s')", __FUNCTION__, sArtist.c_str(),
789 Name().c_str(), Path().c_str(),
790 ADDON::TranslateContent(Content()).c_str(), Version().asString().c_str());
791
792 std::vector<CMusicArtistInfo> vcari;
793 if (IsNoop())
794 return vcari;
795
796 // scraper function is given the artist as parameter and
797 // returns an XML <url> element parseable by CScraperUrl
798 std::vector<string> extras(1);
799 g_charsetConverter.utf8To(SearchStringEncoding(), sArtist, extras[0]);
800 extras[0] = CURL::Encode(extras[0]);
801 CScraperUrl scurl;
802 vector<string> vcsOut = RunNoThrow("CreateArtistSearchUrl", scurl, fcurl, &extras);
803
804 if (vcsOut.empty() || vcsOut[0].empty())
805 return vcari;
806 scurl.ParseString(vcsOut[0]);
807
808 // the next function is passed the contents of the returned URL, and returns
809 // an empty string on failure; on success, returns XML matches in the form:
810 // <results>
811 // <entity>
812 // <title>...</title>
813 // <year>...</year>
814 // <genre>...</genre>
815 // <url>...</url> (with the usual CScraperUrl decorations like post or spoof)
816 // </entity>
817 // ...
818 // </results>
819 vcsOut = RunNoThrow("GetArtistSearchResults", scurl, fcurl);
820
821 // parse the returned XML into a vector of artist objects
822 for (vector<string>::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
823 {
824 CXBMCTinyXML doc;
825 doc.Parse(*i, TIXML_ENCODING_UTF8);
826 if (!doc.RootElement())
827 {
828 CLog::Log(LOGERROR, "%s: Unable to parse XML", __FUNCTION__);
829 return vcari;
830 }
831 TiXmlHandle xhDoc(&doc);
832 for (TiXmlElement* pxeArtist = xhDoc.FirstChild("results").FirstChild("entity").Element();
833 pxeArtist; pxeArtist = pxeArtist->NextSiblingElement())
834 {
835 TiXmlNode* pxnTitle = pxeArtist->FirstChild("title");
836 if (pxnTitle && pxnTitle->FirstChild())
837 {
838 CScraperUrl scurlArtist;
839
840 TiXmlElement* pxeLink = pxeArtist->FirstChildElement("url");
841 if (!pxeLink)
842 scurlArtist.ParseString(scurl.m_xml);
843 for ( ; pxeLink && pxeLink->FirstChild(); pxeLink = pxeLink->NextSiblingElement("url"))
844 scurlArtist.ParseElement(pxeLink);
845
846 if (!scurlArtist.m_url.size())
847 continue;
848
849 CMusicArtistInfo ari(pxnTitle->FirstChild()->Value(), scurlArtist);
850 std::string genre;
851 XMLUtils::GetString(pxeArtist, "genre", genre);
852 if (!genre.empty())
853 ari.GetArtist().genre = StringUtils::Split(genre, g_advancedSettings.m_musicItemSeparator);
854 XMLUtils::GetString(pxeArtist, "year", ari.GetArtist().strBorn);
855
856 vcari.push_back(ari);
857 }
858 }
859 }
860 return vcari;
861}
862
863// fetch list of episodes from URL (from video database)
864EPISODELIST CScraper::GetEpisodeList(XFILE::CCurlFile &fcurl, const CScraperUrl &scurl)
865{
866 EPISODELIST vcep;
867 if (scurl.m_url.empty())
868 return vcep;
869
870 CLog::Log(LOGDEBUG, "%s: Searching '%s' using %s scraper "
871 "(file: '%s', content: '%s', version: '%s')", __FUNCTION__,
872 scurl.m_url[0].m_url.c_str(), Name().c_str(), Path().c_str(),
873 ADDON::TranslateContent(Content()).c_str(), Version().asString().c_str());
874
875 vector<string> vcsIn;
876 vcsIn.push_back(scurl.m_url[0].m_url);
877 vector<string> vcsOut = RunNoThrow("GetEpisodeList", scurl, fcurl, &vcsIn);
878
879 // parse the XML response
880 for (vector<string>::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
881 {
882 CXBMCTinyXML doc;
883 doc.Parse(*i);
884 if (!doc.RootElement())
885 {
886 CLog::Log(LOGERROR, "%s: Unable to parse XML",__FUNCTION__);
887 continue;
888 }
889
890 TiXmlHandle xhDoc(&doc);
891 for (TiXmlElement *pxeMovie = xhDoc.FirstChild("episodeguide").FirstChild("episode").
892 Element(); pxeMovie; pxeMovie = pxeMovie->NextSiblingElement())
893 {
894 EPISODE ep;
895 TiXmlElement *pxeLink = pxeMovie->FirstChildElement("url");
896 std::string strEpNum;
897 if (pxeLink && XMLUtils::GetInt(pxeMovie, "season", ep.iSeason) &&
898 XMLUtils::GetString(pxeMovie, "epnum", strEpNum) && !strEpNum.empty())
899 {
900 CScraperUrl &scurlEp(ep.cScraperUrl);
901 size_t dot = strEpNum.find(".");
902 ep.iEpisode = atoi(strEpNum.c_str());
903 ep.iSubepisode = (dot != std::string::npos) ? atoi(strEpNum.substr(dot + 1).c_str()) : 0;
904 if (!XMLUtils::GetString(pxeMovie, "title", scurlEp.strTitle) || scurlEp.strTitle.empty() )
905 scurlEp.strTitle = g_localizeStrings.Get(416);
906 XMLUtils::GetString(pxeMovie, "id", scurlEp.strId);
907
908 for ( ; pxeLink && pxeLink->FirstChild(); pxeLink = pxeLink->NextSiblingElement("url"))
909 scurlEp.ParseElement(pxeLink);
910
911 // the date must be in the format yyyy-mm-dd
912 ep.cDate.SetValid(FALSE);
913 std::string sDate;
914 if (XMLUtils::GetString(pxeMovie, "aired", sDate) && sDate.length() == 10)
915 {
916 tm tm;
917 if (strptime(sDate.c_str(), "%Y-%m-%d", &tm))
918 ep.cDate.SetDate(1900+tm.tm_year, tm.tm_mon + 1, tm.tm_mday);
919 }
920 vcep.push_back(ep);
921 }
922 }
923 }
924
925 return vcep;
926}
927
928// takes URL; returns true and populates video details on success, false otherwise
929bool CScraper::GetVideoDetails(XFILE::CCurlFile &fcurl, const CScraperUrl &scurl,
930 bool fMovie/*else episode*/, CVideoInfoTag &video)
931{
932 CLog::Log(LOGDEBUG, "%s: Reading %s '%s' using %s scraper "
933 "(file: '%s', content: '%s', version: '%s')", __FUNCTION__,
934 fMovie ? MediaTypeMovie : MediaTypeEpisode, scurl.m_url[0].m_url.c_str(), Name().c_str(), Path().c_str(),
935 ADDON::TranslateContent(Content()).c_str(), Version().asString().c_str());
936
937 video.Reset();
938 std::string sFunc = fMovie ? "GetDetails" : "GetEpisodeDetails";
939 vector<string> vcsIn;
940 vcsIn.push_back(scurl.strId);
941 vcsIn.push_back(scurl.m_url[0].m_url);
942 vector<string> vcsOut = RunNoThrow(sFunc, scurl, fcurl, &vcsIn);
943
944 // parse XML output
945 bool fRet(false);
946 for (vector<string>::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
947 {
948 CXBMCTinyXML doc;
949 doc.Parse(*i, TIXML_ENCODING_UTF8);
950 if (!doc.RootElement())
951 {
952 CLog::Log(LOGERROR, "%s: Unable to parse XML", __FUNCTION__);
953 continue;
954 }
955
956 TiXmlHandle xhDoc(&doc);
957 TiXmlElement *pxeDetails = xhDoc.FirstChild("details").Element();
958 if (!pxeDetails)
959 {
960 CLog::Log(LOGERROR, "%s: Invalid XML file (want <details>)", __FUNCTION__);
961 continue;
962 }
963 video.Load(pxeDetails, true/*fChain*/);
964 fRet = true; // but don't exit in case of chaining
965 }
966 return fRet;
967}
968
969// takes a URL; returns true and populates album on success, false otherwise
970bool CScraper::GetAlbumDetails(CCurlFile &fcurl, const CScraperUrl &scurl, CAlbum &album)
971{
972 CLog::Log(LOGDEBUG, "%s: Reading '%s' using %s scraper "
973 "(file: '%s', content: '%s', version: '%s')", __FUNCTION__,
974 scurl.m_url[0].m_url.c_str(), Name().c_str(), Path().c_str(),
975 ADDON::TranslateContent(Content()).c_str(), Version().asString().c_str());
976
977 vector<string> vcsOut = RunNoThrow("GetAlbumDetails", scurl, fcurl);
978
979 // parse the returned XML into an album object (see CAlbum::Load for details)
980 bool fRet(false);
981 for (vector<string>::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
982 {
983 CXBMCTinyXML doc;
984 doc.Parse(*i, TIXML_ENCODING_UTF8);
985 if (!doc.RootElement())
986 {
987 CLog::Log(LOGERROR, "%s: Unable to parse XML", __FUNCTION__);
988 return false;
989 }
990 fRet = album.Load(doc.RootElement(), i != vcsOut.begin());
991 }
992 return fRet;
993}
994
995// takes a URL (one returned from FindArtist), the original search string, and
996// returns true and populates artist on success, false on failure
997bool CScraper::GetArtistDetails(CCurlFile &fcurl, const CScraperUrl &scurl,
998 const std::string &sSearch, CArtist &artist)
999{
1000 if (!scurl.m_url.size())
1001 return false;
1002
1003 CLog::Log(LOGDEBUG, "%s: Reading '%s' ('%s') using %s scraper "
1004 "(file: '%s', content: '%s', version: '%s')", __FUNCTION__,
1005 scurl.m_url[0].m_url.c_str(), sSearch.c_str(), Name().c_str(), Path().c_str(),
1006 ADDON::TranslateContent(Content()).c_str(), Version().asString().c_str());
1007
1008 // pass in the original search string for chaining to search other sites
1009 vector<string> vcIn;
1010 vcIn.push_back(sSearch);
1011 vcIn[0] = CURL::Encode(vcIn[0]);
1012
1013 vector<string> vcsOut = RunNoThrow("GetArtistDetails", scurl, fcurl, &vcIn);
1014
1015 // parse the returned XML into an artist object
1016 bool fRet(false);
1017 for (vector<string>::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
1018 {
1019 CXBMCTinyXML doc;
1020 doc.Parse(*i, TIXML_ENCODING_UTF8);
1021 if (!doc.RootElement())
1022 {
1023 CLog::Log(LOGERROR, "%s: Unable to parse XML", __FUNCTION__);
1024 return false;
1025 }
1026
1027 fRet = artist.Load(doc.RootElement(), i != vcsOut.begin());
1028 }
1029 return fRet;
1030}
1031
1032}
1033