yandex.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. """
  2. Yandex (Web)
  3. @website https://yandex.ru/
  4. @provide-api ?
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content
  9. """
  10. from cgi import escape
  11. from urllib import urlencode
  12. from lxml import html
  13. from searx.search import logger
  14. logger = logger.getChild('yandex engine')
  15. # engine dependent config
  16. categories = ['general']
  17. paging = True
  18. language_support = True # TODO
  19. default_tld = 'com'
  20. language_map = {'ru': 'ru',
  21. 'ua': 'uk',
  22. 'tr': 'com.tr'}
  23. # search-url
  24. base_url = 'https://yandex.{tld}/'
  25. search_url = 'search/?{query}&p={page}'
  26. results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]'
  27. url_xpath = './/h2/a/@href'
  28. title_xpath = './/h2/a//text()'
  29. content_xpath = './/div[@class="serp-item__text"]//text()'
  30. def request(query, params):
  31. lang = params['language'].split('_')[0]
  32. host = base_url.format(tld=language_map.get(lang) or default_tld)
  33. params['url'] = host + search_url.format(page=params['pageno'] - 1,
  34. query=urlencode({'text': query}))
  35. return params
  36. # get response from search-request
  37. def response(resp):
  38. dom = html.fromstring(resp.text)
  39. results = []
  40. for result in dom.xpath(results_xpath):
  41. try:
  42. res = {'url': result.xpath(url_xpath)[0],
  43. 'title': escape(''.join(result.xpath(title_xpath))),
  44. 'content': escape(''.join(result.xpath(content_xpath)))}
  45. except:
  46. logger.exception('yandex parse crash')
  47. continue
  48. results.append(res)
  49. return results