swisscows.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. """
  2. Swisscows (Web)
  3. @website https://swisscows.ch
  4. @provide-api no
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content
  9. """
  10. from json import loads
  11. from urllib import urlencode, unquote
  12. import re
  13. # engine dependent config
  14. categories = ['general']
  15. paging = True
  16. language_support = True
  17. # search-url
  18. base_url = 'https://swisscows.ch/'
  19. search_string = '?{query}&page={page}'
  20. # regex
  21. regex_json = re.compile('initialData: {"Request":(.|\n)*}\]},\s*environment')
  22. regex_json_remove_start = re.compile('^initialData:\s*')
  23. regex_json_remove_end = re.compile(',\s*environment$')
  24. regex_img_url_remove_start = re.compile('^https?://i\.swisscows\.ch/\?link=')
  25. # do search-request
  26. def request(query, params):
  27. if params['language'] == 'all':
  28. ui_language = 'browser'
  29. region = 'browser'
  30. else:
  31. region = params['language'].replace('_', '-')
  32. ui_language = params['language'].split('_')[0]
  33. search_path = search_string.format(
  34. query=urlencode({'query': query,
  35. 'uiLanguage': ui_language,
  36. 'region': region}),
  37. page=params['pageno'])
  38. params['url'] = base_url + search_path
  39. return params
  40. # get response from search-request
  41. def response(resp):
  42. results = []
  43. json_regex = regex_json.search(resp.content)
  44. # check if results are returned
  45. if not json_regex:
  46. return []
  47. json_raw = regex_json_remove_end.sub('', regex_json_remove_start.sub('', json_regex.group()))
  48. json = loads(json_raw)
  49. # parse normal results
  50. for result in json['Results'].get('items', []):
  51. # append result
  52. results.append({'url': result['Url'].replace(u'\uE000', '').replace(u'\uE001', ''),
  53. 'title': result['Title'].replace(u'\uE000', '').replace(u'\uE001', ''),
  54. 'content': result['Description'].replace(u'\uE000', '').replace(u'\uE001', '')})
  55. # parse images
  56. for result in json.get('Images', []):
  57. # decode image url
  58. img_url = unquote(regex_img_url_remove_start.sub('', result['Url']))
  59. # append result
  60. results.append({'url': result['SourceUrl'],
  61. 'title': result['Title'],
  62. 'content': '',
  63. 'img_src': img_url,
  64. 'template': 'images.html'})
  65. # return results
  66. return results