summaryrefslogtreecommitdiffstats
path: root/xbmc/utils/ScraperUrl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'xbmc/utils/ScraperUrl.cpp')
-rw-r--r--xbmc/utils/ScraperUrl.cpp432
1 files changed, 432 insertions, 0 deletions
diff --git a/xbmc/utils/ScraperUrl.cpp b/xbmc/utils/ScraperUrl.cpp
new file mode 100644
index 0000000..f242a40
--- /dev/null
+++ b/xbmc/utils/ScraperUrl.cpp
@@ -0,0 +1,432 @@
1/*
2 * Copyright (C) 2005-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
4 *
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
7 */
8
9#include "ScraperUrl.h"
10
11#include "CharsetConverter.h"
12#include "ServiceBroker.h"
13#include "URIUtils.h"
14#include "URL.h"
15#include "XMLUtils.h"
16#include "filesystem/CurlFile.h"
17#include "filesystem/ZipFile.h"
18#include "settings/AdvancedSettings.h"
19#include "settings/SettingsComponent.h"
20#include "utils/CharsetDetection.h"
21#include "utils/Mime.h"
22#include "utils/StringUtils.h"
23#include "utils/XBMCTinyXML.h"
24#include "utils/log.h"
25
26#include <algorithm>
27#include <cstring>
28#include <sstream>
29
30CScraperUrl::CScraperUrl() : m_relevance(0.0), m_parsed(false)
31{
32}
33
34CScraperUrl::CScraperUrl(std::string strUrl) : CScraperUrl()
35{
36 ParseFromData(std::move(strUrl));
37}
38
39CScraperUrl::CScraperUrl(const TiXmlElement* element) : CScraperUrl()
40{
41 ParseAndAppendUrl(element);
42}
43
44CScraperUrl::~CScraperUrl() = default;
45
46void CScraperUrl::Clear()
47{
48 m_urls.clear();
49 m_data.clear();
50 m_relevance = 0.0;
51 m_parsed = false;
52}
53
54void CScraperUrl::SetData(std::string data)
55{
56 m_data = std::move(data);
57 m_parsed = false;
58}
59
60const CScraperUrl::SUrlEntry CScraperUrl::GetFirstUrlByType(const std::string& type) const
61{
62 const auto url = std::find_if(m_urls.begin(), m_urls.end(), [type](const SUrlEntry& url) {
63 return url.m_type == UrlType::General && (type.empty() || url.m_aspect == type);
64 });
65 if (url != m_urls.end())
66 return *url;
67
68 return SUrlEntry();
69}
70
71const CScraperUrl::SUrlEntry CScraperUrl::GetSeasonUrl(int season, const std::string& type) const
72{
73 const auto url = std::find_if(m_urls.begin(), m_urls.end(), [season, type](const SUrlEntry& url) {
74 return url.m_type == UrlType::Season && url.m_season == season &&
75 (type.empty() || type == "thumb" || url.m_aspect == type);
76 });
77 if (url != m_urls.end())
78 return *url;
79
80 return SUrlEntry();
81}
82
83unsigned int CScraperUrl::GetMaxSeasonUrl() const
84{
85 unsigned int maxSeason = 0;
86 for (const auto& url : m_urls)
87 {
88 if (url.m_type == UrlType::Season && url.m_season > 0 &&
89 static_cast<unsigned int>(url.m_season) > maxSeason)
90 maxSeason = url.m_season;
91 }
92 return maxSeason;
93}
94
95std::string CScraperUrl::GetFirstThumbUrl() const
96{
97 if (m_urls.empty())
98 return {};
99
100 return GetThumbUrl(m_urls.front());
101}
102
103void CScraperUrl::GetThumbUrls(std::vector<std::string>& thumbs,
104 const std::string& type,
105 int season,
106 bool unique) const
107{
108 for (const auto& url : m_urls)
109 {
110 if (url.m_aspect == type || type.empty() || url.m_aspect.empty())
111 {
112 if ((url.m_type == CScraperUrl::UrlType::General && season == -1) ||
113 (url.m_type == CScraperUrl::UrlType::Season && url.m_season == season))
114 {
115 std::string thumbUrl = GetThumbUrl(url);
116 if (!unique || std::find(thumbs.begin(), thumbs.end(), thumbUrl) == thumbs.end())
117 thumbs.push_back(thumbUrl);
118 }
119 }
120 }
121}
122
123bool CScraperUrl::Parse()
124{
125 if (m_parsed)
126 return true;
127
128 auto dataToParse = m_data;
129 m_data.clear();
130 return ParseFromData(std::move(dataToParse));
131}
132
133bool CScraperUrl::ParseFromData(std::string data)
134{
135 if (data.empty())
136 return false;
137
138 CXBMCTinyXML doc;
139 /* strUrl is coming from internal sources (usually generated by scraper or from database)
140 * so strUrl is always in UTF-8 */
141 doc.Parse(data, TIXML_ENCODING_UTF8);
142
143 auto pElement = doc.RootElement();
144 if (pElement == nullptr)
145 {
146 m_urls.emplace_back(data);
147 m_data = data;
148 }
149 else
150 {
151 while (pElement != nullptr)
152 {
153 ParseAndAppendUrl(pElement);
154 pElement = pElement->NextSiblingElement(pElement->Value());
155 }
156 }
157
158 m_parsed = true;
159 return true;
160}
161
162bool CScraperUrl::ParseAndAppendUrl(const TiXmlElement* element)
163{
164 if (element == nullptr || element->FirstChild() == nullptr ||
165 element->FirstChild()->Value() == nullptr)
166 return false;
167
168 bool wasEmpty = m_data.empty();
169
170 std::stringstream stream;
171 stream << *element;
172 m_data += stream.str();
173
174 SUrlEntry url(element->FirstChild()->ValueStr());
175 url.m_spoof = XMLUtils::GetAttribute(element, "spoof");
176
177 const char* szPost = element->Attribute("post");
178 if (szPost && StringUtils::CompareNoCase(szPost, "yes") == 0)
179 url.m_post = true;
180 else
181 url.m_post = false;
182
183 const char* szIsGz = element->Attribute("gzip");
184 if (szIsGz && StringUtils::CompareNoCase(szIsGz, "yes") == 0)
185 url.m_isgz = true;
186 else
187 url.m_isgz = false;
188
189 url.m_cache = XMLUtils::GetAttribute(element, "cache");
190
191 const char* szType = element->Attribute("type");
192 if (szType && StringUtils::CompareNoCase(szType, "season") == 0)
193 {
194 url.m_type = UrlType::Season;
195 const char* szSeason = element->Attribute("season");
196 if (szSeason)
197 url.m_season = atoi(szSeason);
198 }
199
200 url.m_aspect = XMLUtils::GetAttribute(element, "aspect");
201
202 m_urls.push_back(url);
203
204 if (wasEmpty)
205 m_parsed = true;
206
207 return true;
208}
209
210// XML format is of strUrls is:
211// <TAG><url>...</url>...</TAG> (parsed by ParseElement) or <url>...</url> (ditto)
212bool CScraperUrl::ParseAndAppendUrlsFromEpisodeGuide(std::string episodeGuide)
213{
214 if (episodeGuide.empty())
215 return false;
216
217 // ok, now parse the xml file
218 CXBMCTinyXML doc;
219 /* strUrls is coming from internal sources so strUrls is always in UTF-8 */
220 doc.Parse(episodeGuide, TIXML_ENCODING_UTF8);
221 if (doc.RootElement() == nullptr)
222 return false;
223
224 bool wasEmpty = m_data.empty();
225
226 TiXmlHandle docHandle(&doc);
227 auto link = docHandle.FirstChild("episodeguide").Element();
228 if (link->FirstChildElement("url"))
229 {
230 for (link = link->FirstChildElement("url"); link; link = link->NextSiblingElement("url"))
231 ParseAndAppendUrl(link);
232 }
233 else if (link->FirstChild() && link->FirstChild()->Value())
234 ParseAndAppendUrl(link);
235
236 if (wasEmpty)
237 m_parsed = true;
238
239 return true;
240}
241
242void CScraperUrl::AddParsedUrl(std::string url,
243 std::string aspect,
244 std::string preview,
245 std::string referrer,
246 std::string cache,
247 bool post,
248 bool isgz,
249 int season)
250{
251 bool wasEmpty = m_data.empty();
252
253 TiXmlElement thumb("thumb");
254 thumb.SetAttribute("spoof", referrer);
255 thumb.SetAttribute("cache", cache);
256 if (post)
257 thumb.SetAttribute("post", "yes");
258 if (isgz)
259 thumb.SetAttribute("gzip", "yes");
260 if (season >= 0)
261 {
262 thumb.SetAttribute("season", StringUtils::Format("%i", season));
263 thumb.SetAttribute("type", "season");
264 }
265 thumb.SetAttribute("aspect", aspect);
266 thumb.SetAttribute("preview", preview);
267 TiXmlText text(url);
268 thumb.InsertEndChild(text);
269
270 m_data << thumb;
271
272 SUrlEntry nUrl(url);
273 nUrl.m_spoof = referrer;
274 nUrl.m_post = post;
275 nUrl.m_isgz = isgz;
276 nUrl.m_cache = cache;
277 if (season >= 0)
278 {
279 nUrl.m_type = UrlType::Season;
280 nUrl.m_season = season;
281 }
282 nUrl.m_aspect = aspect;
283
284 m_urls.push_back(nUrl);
285
286 if (wasEmpty)
287 m_parsed = true;
288}
289
290std::string CScraperUrl::GetThumbUrl(const CScraperUrl::SUrlEntry& entry)
291{
292 if (entry.m_spoof.empty())
293 return entry.m_url;
294
295 return entry.m_url + "|Referer=" + CURL::Encode(entry.m_spoof);
296}
297
298bool CScraperUrl::Get(const SUrlEntry& scrURL,
299 std::string& strHTML,
300 XFILE::CCurlFile& http,
301 const std::string& cacheContext)
302{
303 CURL url(scrURL.m_url);
304 http.SetReferer(scrURL.m_spoof);
305 std::string strCachePath;
306
307 if (!scrURL.m_cache.empty())
308 {
309 strCachePath = URIUtils::AddFileToFolder(
310 CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers",
311 cacheContext, scrURL.m_cache);
312 if (XFILE::CFile::Exists(strCachePath))
313 {
314 XFILE::CFile file;
315 XFILE::auto_buffer buffer;
316 if (file.LoadFile(strCachePath, buffer) > 0)
317 {
318 strHTML.assign(buffer.get(), buffer.length());
319 return true;
320 }
321 }
322 }
323
324 auto strHTML1 = strHTML;
325
326 if (scrURL.m_post)
327 {
328 std::string strOptions = url.GetOptions();
329 strOptions = strOptions.substr(1);
330 url.SetOptions("");
331
332 if (!http.Post(url.Get(), strOptions, strHTML1))
333 return false;
334 }
335 else if (!http.Get(url.Get(), strHTML1))
336 return false;
337
338 strHTML = strHTML1;
339
340 const auto mimeType = http.GetProperty(XFILE::FILE_PROPERTY_MIME_TYPE);
341 CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
342 if (ftype == CMime::FileTypeUnknown)
343 ftype = CMime::GetFileTypeFromContent(strHTML);
344
345 if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
346 {
347 XFILE::CZipFile file;
348 std::string strBuffer;
349 auto iSize = file.UnpackFromMemory(
350 strBuffer, strHTML, scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
351 if (iSize > 0)
352 {
353 strHTML = strBuffer;
354 CLog::Log(LOGDEBUG, "{}: Archive \"{}\" was unpacked in memory", __FUNCTION__, scrURL.m_url);
355 }
356 else
357 CLog::Log(LOGWARNING, "{}: \"{}\" looks like archive but cannot be unpacked", __FUNCTION__,
358 scrURL.m_url);
359 }
360
361 const auto reportedCharset = http.GetProperty(XFILE::FILE_PROPERTY_CONTENT_CHARSET);
362 if (ftype == CMime::FileTypeHtml)
363 {
364 std::string realHtmlCharset, converted;
365 if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
366 CLog::Log(LOGWARNING,
367 "{}: Can't find precise charset for HTML \"{}\", using \"{}\" as fallback",
368 __FUNCTION__, scrURL.m_url, realHtmlCharset);
369 else
370 CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for HTML \"{}\"", __FUNCTION__, realHtmlCharset,
371 scrURL.m_url);
372
373 strHTML = converted;
374 }
375 else if (ftype == CMime::FileTypeXml)
376 {
377 CXBMCTinyXML xmlDoc;
378 xmlDoc.Parse(strHTML, reportedCharset);
379
380 const auto realXmlCharset = xmlDoc.GetUsedCharset();
381 if (!realXmlCharset.empty())
382 {
383 CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for XML \"{}\"", __FUNCTION__, realXmlCharset,
384 scrURL.m_url);
385 std::string converted;
386 g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted);
387 strHTML = converted;
388 }
389 }
390 else if (ftype == CMime::FileTypePlainText ||
391 StringUtils::EqualsNoCase(mimeType.substr(0, 5), "text/"))
392 {
393 std::string realTextCharset;
394 std::string converted;
395 CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
396 strHTML = converted;
397 if (reportedCharset != realTextCharset)
398 CLog::Log(LOGWARNING,
399 "{}: Using \"{}\" charset for plain text \"{}\" instead of server reported \"{}\" "
400 "charset",
401 __FUNCTION__, realTextCharset, scrURL.m_url, reportedCharset);
402 else
403 CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for plain text \"{}\"", __FUNCTION__,
404 realTextCharset, scrURL.m_url);
405 }
406 else if (!reportedCharset.empty())
407 {
408 CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for \"{}\"", __FUNCTION__, reportedCharset,
409 scrURL.m_url);
410 if (reportedCharset != "UTF-8")
411 {
412 std::string converted;
413 g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted);
414 strHTML = converted;
415 }
416 }
417 else
418 CLog::Log(LOGDEBUG, "{}: Using content of \"{}\" as binary or text with \"UTF-8\" charset",
419 __FUNCTION__, scrURL.m_url);
420
421 if (!scrURL.m_cache.empty())
422 {
423 const auto strCachePath = URIUtils::AddFileToFolder(
424 CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers",
425 cacheContext, scrURL.m_cache);
426 XFILE::CFile file;
427 if (!file.OpenForWrite(strCachePath, true) ||
428 file.Write(strHTML.data(), strHTML.size()) != static_cast<ssize_t>(strHTML.size()))
429 return false;
430 }
431 return true;
432}