yahoo.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. """
  2. Yahoo (Web)
  3. @website https://search.yahoo.com/web
  4. @provide-api yes (https://developer.yahoo.com/boss/search/),
  5. $0.80/1000 queries
  6. @using-api no (because pricing)
  7. @results HTML (using search portal)
  8. @stable no (HTML can change)
  9. @parse url, title, content, suggestion
  10. """
  11. from urllib import urlencode
  12. from urlparse import unquote
  13. from lxml import html
  14. from requests import get
  15. from searx.engines.xpath import extract_text, extract_url
# engine dependent config
categories = ['general']
paging = True            # supported via the "b" (offset) URL parameter
language_support = True
time_range_support = True

# search-url
# query is urlencoded as "p=<query>"; b = 1-based result offset; vl = result language
base_url = 'https://search.yahoo.com/'
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'

supported_languages_url = 'https://search.yahoo.com/web/advanced'

# specific xpath variables
results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="compText aAbs"]'
suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a"

# time-range name -> [age, btf] values expected by Yahoo's "fr2=time" filter
time_range_dict = {'day': ['1d', 'd'],
                   'week': ['1w', 'w'],
                   'month': ['1m', 'm']}
  35. # remove yahoo-specific tracking-url
  36. def parse_url(url_string):
  37. endings = ['/RS', '/RK']
  38. endpositions = []
  39. start = url_string.find('http', url_string.find('/RU=') + 1)
  40. for ending in endings:
  41. endpos = url_string.rfind(ending)
  42. if endpos > -1:
  43. endpositions.append(endpos)
  44. if start == 0 or len(endpositions) == 0:
  45. return url_string
  46. else:
  47. end = min(endpositions)
  48. return unquote(url_string[start:end])
  49. def _get_url(query, offset, language, time_range):
  50. if time_range in time_range_dict:
  51. return base_url + search_url_with_time.format(offset=offset,
  52. query=urlencode({'p': query}),
  53. lang=language,
  54. age=time_range_dict[time_range][0],
  55. btf=time_range_dict[time_range][1])
  56. return base_url + search_url.format(offset=offset,
  57. query=urlencode({'p': query}),
  58. lang=language)
  59. def _get_language(params):
  60. if params['language'] == 'all':
  61. return 'en'
  62. elif params['language'][:2] == 'zh':
  63. if params['language'] == 'zh' or params['language'] == 'zh-CH':
  64. return 'szh'
  65. else:
  66. return 'tzh'
  67. else:
  68. return params['language'].split('-')[0]
  69. # do search-request
  70. def request(query, params):
  71. if params['time_range'] and params['time_range'] not in time_range_dict:
  72. return params
  73. offset = (params['pageno'] - 1) * 10 + 1
  74. language = _get_language(params)
  75. params['url'] = _get_url(query, offset, language, params['time_range'])
  76. # TODO required?
  77. params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
  78. .format(lang=language)
  79. return params
  80. # get response from search-request
  81. def response(resp):
  82. results = []
  83. dom = html.fromstring(resp.text)
  84. try:
  85. results_num = int(dom.xpath('//div[@class="compPagination"]/span[last()]/text()')[0]
  86. .split()[0].replace(',', ''))
  87. results.append({'number_of_results': results_num})
  88. except:
  89. pass
  90. # parse results
  91. for result in dom.xpath(results_xpath):
  92. try:
  93. url = parse_url(extract_url(result.xpath(url_xpath), search_url))
  94. title = extract_text(result.xpath(title_xpath)[0])
  95. except:
  96. continue
  97. content = extract_text(result.xpath(content_xpath)[0])
  98. # append result
  99. results.append({'url': url,
  100. 'title': title,
  101. 'content': content})
  102. # if no suggestion found, return results
  103. suggestions = dom.xpath(suggestion_xpath)
  104. if not suggestions:
  105. return results
  106. # parse suggestion
  107. for suggestion in suggestions:
  108. # append suggestion
  109. results.append({'suggestion': extract_text(suggestion)})
  110. # return results
  111. return results
  112. # get supported languages from their site
  113. def fetch_supported_languages():
  114. supported_languages = []
  115. response = get(supported_languages_url)
  116. dom = html.fromstring(response.text)
  117. options = dom.xpath('//div[@id="yschlang"]/span/label/input')
  118. for option in options:
  119. code = option.xpath('./@value')[0][5:]
  120. supported_languages.append(code)
  121. return supported_languages