swisscows.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. """
  2. Swisscows (Web, Images)
  3. @website https://swisscows.ch
  4. @provide-api no
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content
  9. """
  10. from json import loads
  11. from urllib import urlencode, unquote
  12. import re
  13. from requests import get
  14. from lxml.html import fromstring
# engine dependent config
categories = ['general', 'images']  # this engine serves both web and image results
paging = True                       # request() honours params['pageno']
language_support = True             # request() honours params['language']

# search-url
base_url = 'https://swisscows.ch/'
# query string template; request() prefixes 'image' for image-category searches
search_string = '?{query}&page={page}'

# regex
# extracts the inline "initialData: {...}" javascript blob that embeds the
# result data as JSON in the HTML page; (.|\n)* lets the match span newlines
regex_json = re.compile(r'initialData: {"Request":(.|\n)*},\s*environment')
# strip the non-JSON text before and after the blob so loads() can parse it
regex_json_remove_start = re.compile(r'^initialData:\s*')
regex_json_remove_end = re.compile(r',\s*environment$')
# image URLs are wrapped in an i.swisscows.ch redirect; this strips the wrapper
regex_img_url_remove_start = re.compile(r'^https?://i\.swisscows\.ch/\?link=')
  27. # do search-request
  28. def request(query, params):
  29. if params['language'] == 'all':
  30. ui_language = 'browser'
  31. region = 'browser'
  32. elif params['language'].split('-')[0] == 'no':
  33. region = 'nb-NO'
  34. else:
  35. region = params['language']
  36. ui_language = params['language'].split('-')[0]
  37. search_path = search_string.format(
  38. query=urlencode({'query': query,
  39. 'uiLanguage': ui_language,
  40. 'region': region}),
  41. page=params['pageno'])
  42. # image search query is something like 'image?{query}&page={page}'
  43. if params['category'] == 'images':
  44. search_path = 'image' + search_path
  45. params['url'] = base_url + search_path
  46. return params
  47. # get response from search-request
  48. def response(resp):
  49. results = []
  50. json_regex = regex_json.search(resp.content)
  51. # check if results are returned
  52. if not json_regex:
  53. return []
  54. json_raw = regex_json_remove_end.sub('', regex_json_remove_start.sub('', json_regex.group()))
  55. json = loads(json_raw)
  56. # parse results
  57. for result in json['Results'].get('items', []):
  58. result_title = result['Title'].replace(u'\uE000', '').replace(u'\uE001', '')
  59. # parse image results
  60. if result.get('ContentType', '').startswith('image'):
  61. img_url = unquote(regex_img_url_remove_start.sub('', result['Url']))
  62. # append result
  63. results.append({'url': result['SourceUrl'],
  64. 'title': result['Title'],
  65. 'content': '',
  66. 'img_src': img_url,
  67. 'template': 'images.html'})
  68. # parse general results
  69. else:
  70. result_url = result['Url'].replace(u'\uE000', '').replace(u'\uE001', '')
  71. result_content = result['Description'].replace(u'\uE000', '').replace(u'\uE001', '')
  72. # append result
  73. results.append({'url': result_url,
  74. 'title': result_title,
  75. 'content': result_content})
  76. # parse images
  77. for result in json.get('Images', []):
  78. # decode image url
  79. img_url = unquote(regex_img_url_remove_start.sub('', result['Url']))
  80. # append result
  81. results.append({'url': result['SourceUrl'],
  82. 'title': result['Title'],
  83. 'content': '',
  84. 'img_src': img_url,
  85. 'template': 'images.html'})
  86. # return results
  87. return results
  88. # get supported languages from their site
  89. def fetch_supported_languages():
  90. supported_languages = []
  91. response = get(base_url)
  92. dom = fromstring(response.text)
  93. options = dom.xpath('//div[@id="regions-popup"]//ul/li/a')
  94. for option in options:
  95. code = option.xpath('./@data-val')[0]
  96. supported_languages.append(code)
  97. return supported_languages