yandex.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
"""
 Yandex (Web)

 @website     https://yandex.ru/
 @provide-api ?
 @using-api   no
 @results     HTML (using search portal)
 @stable      no (HTML can change)
 @parse       url, title, content
"""
  10. from urllib.parse import urlencode, urlparse
  11. from lxml import html
  12. from searx import logger
  13. from searx.exceptions import SearxEngineCaptchaException
  14. logger = logger.getChild('yandex engine')
  15. # engine dependent config
  16. categories = ['general']
  17. paging = True
  18. language_support = True # TODO
  19. default_tld = 'com'
  20. language_map = {'ru': 'ru',
  21. 'ua': 'ua',
  22. 'be': 'by',
  23. 'kk': 'kz',
  24. 'tr': 'com.tr'}
  25. # search-url
  26. base_url = 'https://yandex.{tld}/'
  27. search_url = 'search/?{query}&p={page}'
  28. results_xpath = '//li[@class="serp-item"]'
  29. url_xpath = './/h2/a/@href'
  30. title_xpath = './/h2/a//text()'
  31. content_xpath = './/div[@class="text-container typo typo_text_m typo_line_m organic__text"]//text()'
  32. def request(query, params):
  33. lang = params['language'].split('-')[0]
  34. host = base_url.format(tld=language_map.get(lang) or default_tld)
  35. params['url'] = host + search_url.format(page=params['pageno'] - 1,
  36. query=urlencode({'text': query}))
  37. return params
  38. # get response from search-request
  39. def response(resp):
  40. resp_url = urlparse(resp.url)
  41. if resp_url.path.startswith('/showcaptcha'):
  42. raise SearxEngineCaptchaException()
  43. dom = html.fromstring(resp.text)
  44. results = []
  45. for result in dom.xpath(results_xpath):
  46. try:
  47. res = {'url': result.xpath(url_xpath)[0],
  48. 'title': ''.join(result.xpath(title_xpath)),
  49. 'content': ''.join(result.xpath(content_xpath))}
  50. except:
  51. logger.exception('yandex parse crash')
  52. continue
  53. results.append(res)
  54. return results