diff options
Diffstat (limited to 'xbmc/utils/ScraperUrl.cpp')
| -rw-r--r-- | xbmc/utils/ScraperUrl.cpp | 432 |
1 files changed, 432 insertions, 0 deletions
diff --git a/xbmc/utils/ScraperUrl.cpp b/xbmc/utils/ScraperUrl.cpp new file mode 100644 index 0000000..f242a40 --- /dev/null +++ b/xbmc/utils/ScraperUrl.cpp | |||
| @@ -0,0 +1,432 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2005-2018 Team Kodi | ||
| 3 | * This file is part of Kodi - https://kodi.tv | ||
| 4 | * | ||
| 5 | * SPDX-License-Identifier: GPL-2.0-or-later | ||
| 6 | * See LICENSES/README.md for more information. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include "ScraperUrl.h" | ||
| 10 | |||
| 11 | #include "CharsetConverter.h" | ||
| 12 | #include "ServiceBroker.h" | ||
| 13 | #include "URIUtils.h" | ||
| 14 | #include "URL.h" | ||
| 15 | #include "XMLUtils.h" | ||
| 16 | #include "filesystem/CurlFile.h" | ||
| 17 | #include "filesystem/ZipFile.h" | ||
| 18 | #include "settings/AdvancedSettings.h" | ||
| 19 | #include "settings/SettingsComponent.h" | ||
| 20 | #include "utils/CharsetDetection.h" | ||
| 21 | #include "utils/Mime.h" | ||
| 22 | #include "utils/StringUtils.h" | ||
| 23 | #include "utils/XBMCTinyXML.h" | ||
| 24 | #include "utils/log.h" | ||
| 25 | |||
| 26 | #include <algorithm> | ||
| 27 | #include <cstring> | ||
| 28 | #include <sstream> | ||
| 29 | |||
| 30 | CScraperUrl::CScraperUrl() : m_relevance(0.0), m_parsed(false) | ||
| 31 | { | ||
| 32 | } | ||
| 33 | |||
| 34 | CScraperUrl::CScraperUrl(std::string strUrl) : CScraperUrl() | ||
| 35 | { | ||
| 36 | ParseFromData(std::move(strUrl)); | ||
| 37 | } | ||
| 38 | |||
| 39 | CScraperUrl::CScraperUrl(const TiXmlElement* element) : CScraperUrl() | ||
| 40 | { | ||
| 41 | ParseAndAppendUrl(element); | ||
| 42 | } | ||
| 43 | |||
| 44 | CScraperUrl::~CScraperUrl() = default; | ||
| 45 | |||
| 46 | void CScraperUrl::Clear() | ||
| 47 | { | ||
| 48 | m_urls.clear(); | ||
| 49 | m_data.clear(); | ||
| 50 | m_relevance = 0.0; | ||
| 51 | m_parsed = false; | ||
| 52 | } | ||
| 53 | |||
| 54 | void CScraperUrl::SetData(std::string data) | ||
| 55 | { | ||
| 56 | m_data = std::move(data); | ||
| 57 | m_parsed = false; | ||
| 58 | } | ||
| 59 | |||
| 60 | const CScraperUrl::SUrlEntry CScraperUrl::GetFirstUrlByType(const std::string& type) const | ||
| 61 | { | ||
| 62 | const auto url = std::find_if(m_urls.begin(), m_urls.end(), [type](const SUrlEntry& url) { | ||
| 63 | return url.m_type == UrlType::General && (type.empty() || url.m_aspect == type); | ||
| 64 | }); | ||
| 65 | if (url != m_urls.end()) | ||
| 66 | return *url; | ||
| 67 | |||
| 68 | return SUrlEntry(); | ||
| 69 | } | ||
| 70 | |||
| 71 | const CScraperUrl::SUrlEntry CScraperUrl::GetSeasonUrl(int season, const std::string& type) const | ||
| 72 | { | ||
| 73 | const auto url = std::find_if(m_urls.begin(), m_urls.end(), [season, type](const SUrlEntry& url) { | ||
| 74 | return url.m_type == UrlType::Season && url.m_season == season && | ||
| 75 | (type.empty() || type == "thumb" || url.m_aspect == type); | ||
| 76 | }); | ||
| 77 | if (url != m_urls.end()) | ||
| 78 | return *url; | ||
| 79 | |||
| 80 | return SUrlEntry(); | ||
| 81 | } | ||
| 82 | |||
| 83 | unsigned int CScraperUrl::GetMaxSeasonUrl() const | ||
| 84 | { | ||
| 85 | unsigned int maxSeason = 0; | ||
| 86 | for (const auto& url : m_urls) | ||
| 87 | { | ||
| 88 | if (url.m_type == UrlType::Season && url.m_season > 0 && | ||
| 89 | static_cast<unsigned int>(url.m_season) > maxSeason) | ||
| 90 | maxSeason = url.m_season; | ||
| 91 | } | ||
| 92 | return maxSeason; | ||
| 93 | } | ||
| 94 | |||
| 95 | std::string CScraperUrl::GetFirstThumbUrl() const | ||
| 96 | { | ||
| 97 | if (m_urls.empty()) | ||
| 98 | return {}; | ||
| 99 | |||
| 100 | return GetThumbUrl(m_urls.front()); | ||
| 101 | } | ||
| 102 | |||
| 103 | void CScraperUrl::GetThumbUrls(std::vector<std::string>& thumbs, | ||
| 104 | const std::string& type, | ||
| 105 | int season, | ||
| 106 | bool unique) const | ||
| 107 | { | ||
| 108 | for (const auto& url : m_urls) | ||
| 109 | { | ||
| 110 | if (url.m_aspect == type || type.empty() || url.m_aspect.empty()) | ||
| 111 | { | ||
| 112 | if ((url.m_type == CScraperUrl::UrlType::General && season == -1) || | ||
| 113 | (url.m_type == CScraperUrl::UrlType::Season && url.m_season == season)) | ||
| 114 | { | ||
| 115 | std::string thumbUrl = GetThumbUrl(url); | ||
| 116 | if (!unique || std::find(thumbs.begin(), thumbs.end(), thumbUrl) == thumbs.end()) | ||
| 117 | thumbs.push_back(thumbUrl); | ||
| 118 | } | ||
| 119 | } | ||
| 120 | } | ||
| 121 | } | ||
| 122 | |||
| 123 | bool CScraperUrl::Parse() | ||
| 124 | { | ||
| 125 | if (m_parsed) | ||
| 126 | return true; | ||
| 127 | |||
| 128 | auto dataToParse = m_data; | ||
| 129 | m_data.clear(); | ||
| 130 | return ParseFromData(std::move(dataToParse)); | ||
| 131 | } | ||
| 132 | |||
| 133 | bool CScraperUrl::ParseFromData(std::string data) | ||
| 134 | { | ||
| 135 | if (data.empty()) | ||
| 136 | return false; | ||
| 137 | |||
| 138 | CXBMCTinyXML doc; | ||
| 139 | /* strUrl is coming from internal sources (usually generated by scraper or from database) | ||
| 140 | * so strUrl is always in UTF-8 */ | ||
| 141 | doc.Parse(data, TIXML_ENCODING_UTF8); | ||
| 142 | |||
| 143 | auto pElement = doc.RootElement(); | ||
| 144 | if (pElement == nullptr) | ||
| 145 | { | ||
| 146 | m_urls.emplace_back(data); | ||
| 147 | m_data = data; | ||
| 148 | } | ||
| 149 | else | ||
| 150 | { | ||
| 151 | while (pElement != nullptr) | ||
| 152 | { | ||
| 153 | ParseAndAppendUrl(pElement); | ||
| 154 | pElement = pElement->NextSiblingElement(pElement->Value()); | ||
| 155 | } | ||
| 156 | } | ||
| 157 | |||
| 158 | m_parsed = true; | ||
| 159 | return true; | ||
| 160 | } | ||
| 161 | |||
| 162 | bool CScraperUrl::ParseAndAppendUrl(const TiXmlElement* element) | ||
| 163 | { | ||
| 164 | if (element == nullptr || element->FirstChild() == nullptr || | ||
| 165 | element->FirstChild()->Value() == nullptr) | ||
| 166 | return false; | ||
| 167 | |||
| 168 | bool wasEmpty = m_data.empty(); | ||
| 169 | |||
| 170 | std::stringstream stream; | ||
| 171 | stream << *element; | ||
| 172 | m_data += stream.str(); | ||
| 173 | |||
| 174 | SUrlEntry url(element->FirstChild()->ValueStr()); | ||
| 175 | url.m_spoof = XMLUtils::GetAttribute(element, "spoof"); | ||
| 176 | |||
| 177 | const char* szPost = element->Attribute("post"); | ||
| 178 | if (szPost && StringUtils::CompareNoCase(szPost, "yes") == 0) | ||
| 179 | url.m_post = true; | ||
| 180 | else | ||
| 181 | url.m_post = false; | ||
| 182 | |||
| 183 | const char* szIsGz = element->Attribute("gzip"); | ||
| 184 | if (szIsGz && StringUtils::CompareNoCase(szIsGz, "yes") == 0) | ||
| 185 | url.m_isgz = true; | ||
| 186 | else | ||
| 187 | url.m_isgz = false; | ||
| 188 | |||
| 189 | url.m_cache = XMLUtils::GetAttribute(element, "cache"); | ||
| 190 | |||
| 191 | const char* szType = element->Attribute("type"); | ||
| 192 | if (szType && StringUtils::CompareNoCase(szType, "season") == 0) | ||
| 193 | { | ||
| 194 | url.m_type = UrlType::Season; | ||
| 195 | const char* szSeason = element->Attribute("season"); | ||
| 196 | if (szSeason) | ||
| 197 | url.m_season = atoi(szSeason); | ||
| 198 | } | ||
| 199 | |||
| 200 | url.m_aspect = XMLUtils::GetAttribute(element, "aspect"); | ||
| 201 | |||
| 202 | m_urls.push_back(url); | ||
| 203 | |||
| 204 | if (wasEmpty) | ||
| 205 | m_parsed = true; | ||
| 206 | |||
| 207 | return true; | ||
| 208 | } | ||
| 209 | |||
| 210 | // XML format is of strUrls is: | ||
| 211 | // <TAG><url>...</url>...</TAG> (parsed by ParseElement) or <url>...</url> (ditto) | ||
| 212 | bool CScraperUrl::ParseAndAppendUrlsFromEpisodeGuide(std::string episodeGuide) | ||
| 213 | { | ||
| 214 | if (episodeGuide.empty()) | ||
| 215 | return false; | ||
| 216 | |||
| 217 | // ok, now parse the xml file | ||
| 218 | CXBMCTinyXML doc; | ||
| 219 | /* strUrls is coming from internal sources so strUrls is always in UTF-8 */ | ||
| 220 | doc.Parse(episodeGuide, TIXML_ENCODING_UTF8); | ||
| 221 | if (doc.RootElement() == nullptr) | ||
| 222 | return false; | ||
| 223 | |||
| 224 | bool wasEmpty = m_data.empty(); | ||
| 225 | |||
| 226 | TiXmlHandle docHandle(&doc); | ||
| 227 | auto link = docHandle.FirstChild("episodeguide").Element(); | ||
| 228 | if (link->FirstChildElement("url")) | ||
| 229 | { | ||
| 230 | for (link = link->FirstChildElement("url"); link; link = link->NextSiblingElement("url")) | ||
| 231 | ParseAndAppendUrl(link); | ||
| 232 | } | ||
| 233 | else if (link->FirstChild() && link->FirstChild()->Value()) | ||
| 234 | ParseAndAppendUrl(link); | ||
| 235 | |||
| 236 | if (wasEmpty) | ||
| 237 | m_parsed = true; | ||
| 238 | |||
| 239 | return true; | ||
| 240 | } | ||
| 241 | |||
| 242 | void CScraperUrl::AddParsedUrl(std::string url, | ||
| 243 | std::string aspect, | ||
| 244 | std::string preview, | ||
| 245 | std::string referrer, | ||
| 246 | std::string cache, | ||
| 247 | bool post, | ||
| 248 | bool isgz, | ||
| 249 | int season) | ||
| 250 | { | ||
| 251 | bool wasEmpty = m_data.empty(); | ||
| 252 | |||
| 253 | TiXmlElement thumb("thumb"); | ||
| 254 | thumb.SetAttribute("spoof", referrer); | ||
| 255 | thumb.SetAttribute("cache", cache); | ||
| 256 | if (post) | ||
| 257 | thumb.SetAttribute("post", "yes"); | ||
| 258 | if (isgz) | ||
| 259 | thumb.SetAttribute("gzip", "yes"); | ||
| 260 | if (season >= 0) | ||
| 261 | { | ||
| 262 | thumb.SetAttribute("season", StringUtils::Format("%i", season)); | ||
| 263 | thumb.SetAttribute("type", "season"); | ||
| 264 | } | ||
| 265 | thumb.SetAttribute("aspect", aspect); | ||
| 266 | thumb.SetAttribute("preview", preview); | ||
| 267 | TiXmlText text(url); | ||
| 268 | thumb.InsertEndChild(text); | ||
| 269 | |||
| 270 | m_data << thumb; | ||
| 271 | |||
| 272 | SUrlEntry nUrl(url); | ||
| 273 | nUrl.m_spoof = referrer; | ||
| 274 | nUrl.m_post = post; | ||
| 275 | nUrl.m_isgz = isgz; | ||
| 276 | nUrl.m_cache = cache; | ||
| 277 | if (season >= 0) | ||
| 278 | { | ||
| 279 | nUrl.m_type = UrlType::Season; | ||
| 280 | nUrl.m_season = season; | ||
| 281 | } | ||
| 282 | nUrl.m_aspect = aspect; | ||
| 283 | |||
| 284 | m_urls.push_back(nUrl); | ||
| 285 | |||
| 286 | if (wasEmpty) | ||
| 287 | m_parsed = true; | ||
| 288 | } | ||
| 289 | |||
| 290 | std::string CScraperUrl::GetThumbUrl(const CScraperUrl::SUrlEntry& entry) | ||
| 291 | { | ||
| 292 | if (entry.m_spoof.empty()) | ||
| 293 | return entry.m_url; | ||
| 294 | |||
| 295 | return entry.m_url + "|Referer=" + CURL::Encode(entry.m_spoof); | ||
| 296 | } | ||
| 297 | |||
| 298 | bool CScraperUrl::Get(const SUrlEntry& scrURL, | ||
| 299 | std::string& strHTML, | ||
| 300 | XFILE::CCurlFile& http, | ||
| 301 | const std::string& cacheContext) | ||
| 302 | { | ||
| 303 | CURL url(scrURL.m_url); | ||
| 304 | http.SetReferer(scrURL.m_spoof); | ||
| 305 | std::string strCachePath; | ||
| 306 | |||
| 307 | if (!scrURL.m_cache.empty()) | ||
| 308 | { | ||
| 309 | strCachePath = URIUtils::AddFileToFolder( | ||
| 310 | CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers", | ||
| 311 | cacheContext, scrURL.m_cache); | ||
| 312 | if (XFILE::CFile::Exists(strCachePath)) | ||
| 313 | { | ||
| 314 | XFILE::CFile file; | ||
| 315 | XFILE::auto_buffer buffer; | ||
| 316 | if (file.LoadFile(strCachePath, buffer) > 0) | ||
| 317 | { | ||
| 318 | strHTML.assign(buffer.get(), buffer.length()); | ||
| 319 | return true; | ||
| 320 | } | ||
| 321 | } | ||
| 322 | } | ||
| 323 | |||
| 324 | auto strHTML1 = strHTML; | ||
| 325 | |||
| 326 | if (scrURL.m_post) | ||
| 327 | { | ||
| 328 | std::string strOptions = url.GetOptions(); | ||
| 329 | strOptions = strOptions.substr(1); | ||
| 330 | url.SetOptions(""); | ||
| 331 | |||
| 332 | if (!http.Post(url.Get(), strOptions, strHTML1)) | ||
| 333 | return false; | ||
| 334 | } | ||
| 335 | else if (!http.Get(url.Get(), strHTML1)) | ||
| 336 | return false; | ||
| 337 | |||
| 338 | strHTML = strHTML1; | ||
| 339 | |||
| 340 | const auto mimeType = http.GetProperty(XFILE::FILE_PROPERTY_MIME_TYPE); | ||
| 341 | CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType); | ||
| 342 | if (ftype == CMime::FileTypeUnknown) | ||
| 343 | ftype = CMime::GetFileTypeFromContent(strHTML); | ||
| 344 | |||
| 345 | if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip) | ||
| 346 | { | ||
| 347 | XFILE::CZipFile file; | ||
| 348 | std::string strBuffer; | ||
| 349 | auto iSize = file.UnpackFromMemory( | ||
| 350 | strBuffer, strHTML, scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz? | ||
| 351 | if (iSize > 0) | ||
| 352 | { | ||
| 353 | strHTML = strBuffer; | ||
| 354 | CLog::Log(LOGDEBUG, "{}: Archive \"{}\" was unpacked in memory", __FUNCTION__, scrURL.m_url); | ||
| 355 | } | ||
| 356 | else | ||
| 357 | CLog::Log(LOGWARNING, "{}: \"{}\" looks like archive but cannot be unpacked", __FUNCTION__, | ||
| 358 | scrURL.m_url); | ||
| 359 | } | ||
| 360 | |||
| 361 | const auto reportedCharset = http.GetProperty(XFILE::FILE_PROPERTY_CONTENT_CHARSET); | ||
| 362 | if (ftype == CMime::FileTypeHtml) | ||
| 363 | { | ||
| 364 | std::string realHtmlCharset, converted; | ||
| 365 | if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset)) | ||
| 366 | CLog::Log(LOGWARNING, | ||
| 367 | "{}: Can't find precise charset for HTML \"{}\", using \"{}\" as fallback", | ||
| 368 | __FUNCTION__, scrURL.m_url, realHtmlCharset); | ||
| 369 | else | ||
| 370 | CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for HTML \"{}\"", __FUNCTION__, realHtmlCharset, | ||
| 371 | scrURL.m_url); | ||
| 372 | |||
| 373 | strHTML = converted; | ||
| 374 | } | ||
| 375 | else if (ftype == CMime::FileTypeXml) | ||
| 376 | { | ||
| 377 | CXBMCTinyXML xmlDoc; | ||
| 378 | xmlDoc.Parse(strHTML, reportedCharset); | ||
| 379 | |||
| 380 | const auto realXmlCharset = xmlDoc.GetUsedCharset(); | ||
| 381 | if (!realXmlCharset.empty()) | ||
| 382 | { | ||
| 383 | CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for XML \"{}\"", __FUNCTION__, realXmlCharset, | ||
| 384 | scrURL.m_url); | ||
| 385 | std::string converted; | ||
| 386 | g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted); | ||
| 387 | strHTML = converted; | ||
| 388 | } | ||
| 389 | } | ||
| 390 | else if (ftype == CMime::FileTypePlainText || | ||
| 391 | StringUtils::EqualsNoCase(mimeType.substr(0, 5), "text/")) | ||
| 392 | { | ||
| 393 | std::string realTextCharset; | ||
| 394 | std::string converted; | ||
| 395 | CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset); | ||
| 396 | strHTML = converted; | ||
| 397 | if (reportedCharset != realTextCharset) | ||
| 398 | CLog::Log(LOGWARNING, | ||
| 399 | "{}: Using \"{}\" charset for plain text \"{}\" instead of server reported \"{}\" " | ||
| 400 | "charset", | ||
| 401 | __FUNCTION__, realTextCharset, scrURL.m_url, reportedCharset); | ||
| 402 | else | ||
| 403 | CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for plain text \"{}\"", __FUNCTION__, | ||
| 404 | realTextCharset, scrURL.m_url); | ||
| 405 | } | ||
| 406 | else if (!reportedCharset.empty()) | ||
| 407 | { | ||
| 408 | CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for \"{}\"", __FUNCTION__, reportedCharset, | ||
| 409 | scrURL.m_url); | ||
| 410 | if (reportedCharset != "UTF-8") | ||
| 411 | { | ||
| 412 | std::string converted; | ||
| 413 | g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted); | ||
| 414 | strHTML = converted; | ||
| 415 | } | ||
| 416 | } | ||
| 417 | else | ||
| 418 | CLog::Log(LOGDEBUG, "{}: Using content of \"{}\" as binary or text with \"UTF-8\" charset", | ||
| 419 | __FUNCTION__, scrURL.m_url); | ||
| 420 | |||
| 421 | if (!scrURL.m_cache.empty()) | ||
| 422 | { | ||
| 423 | const auto strCachePath = URIUtils::AddFileToFolder( | ||
| 424 | CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers", | ||
| 425 | cacheContext, scrURL.m_cache); | ||
| 426 | XFILE::CFile file; | ||
| 427 | if (!file.OpenForWrite(strCachePath, true) || | ||
| 428 | file.Write(strHTML.data(), strHTML.size()) != static_cast<ssize_t>(strHTML.size())) | ||
| 429 | return false; | ||
| 430 | } | ||
| 431 | return true; | ||
| 432 | } | ||
