  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. Bing (Web)
  4. """
  5. import re
  6. from urllib.parse import urlencode
  7. from lxml import html
  8. from searx import logger
  9. from searx.utils import eval_xpath, extract_text, match_language
# engine-scoped child logger so log lines are attributable to this engine
logger = logger.getChild('bing engine')

# about: metadata describing this engine for the searx "about" UI
about = {
    "website": 'https://www.bing.com',
    "wikidata_id": 'Q182496',
    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
    "use_official_api": False,  # this engine scrapes the HTML result page
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general']
paging = True
language_support = True
# page scraped by _fetch_supported_languages() to learn available markets
supported_languages_url = 'https://www.bing.com/account/general'
# Bing uses legacy zh-CHS/zh-CHT codes for simplified/traditional Chinese
language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}

# search-url: {query} is the urlencoded q= pair, {offset} the 1-based
# index of the first result (see _get_offset_from_pageno)
base_url = 'https://www.bing.com/'
search_string = 'search?{query}&first={offset}'
  29. def _get_offset_from_pageno(pageno):
  30. return (pageno - 1) * 10 + 1
  31. # do search-request
  32. def request(query, params):
  33. offset = _get_offset_from_pageno(params.get('pageno', 0))
  34. if params['language'] == 'all':
  35. lang = 'EN'
  36. else:
  37. lang = match_language(params['language'], supported_languages, language_aliases)
  38. query = 'language:{} {}'.format(lang.split('-')[0].upper(), query)
  39. search_path = search_string.format(
  40. query=urlencode({'q': query}),
  41. offset=offset)
  42. params['url'] = base_url + search_path
  43. return params
  44. # get response from search-request
  45. def response(resp):
  46. results = []
  47. result_len = 0
  48. dom = html.fromstring(resp.text)
  49. # parse results
  50. for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
  51. link = eval_xpath(result, './/h3/a')[0]
  52. url = link.attrib.get('href')
  53. title = extract_text(link)
  54. content = extract_text(eval_xpath(result, './/p'))
  55. # append result
  56. results.append({'url': url,
  57. 'title': title,
  58. 'content': content})
  59. # parse results again if nothing is found yet
  60. for result in eval_xpath(dom, '//li[@class="b_algo"]'):
  61. link = eval_xpath(result, './/h2/a')[0]
  62. url = link.attrib.get('href')
  63. title = extract_text(link)
  64. content = extract_text(eval_xpath(result, './/p'))
  65. # append result
  66. results.append({'url': url,
  67. 'title': title,
  68. 'content': content})
  69. try:
  70. result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
  71. if "-" in result_len_container:
  72. # Remove the part "from-to" for paginated request ...
  73. result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:]
  74. result_len_container = re.sub('[^0-9]', '', result_len_container)
  75. if len(result_len_container) > 0:
  76. result_len = int(result_len_container)
  77. except Exception as e:
  78. logger.debug('result error :\n%s', e)
  79. if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
  80. return []
  81. results.append({'number_of_results': result_len})
  82. return results
  83. # get supported languages from their site
  84. def _fetch_supported_languages(resp):
  85. lang_tags = set()
  86. setmkt = re.compile('setmkt=([^&]*)')
  87. dom = html.fromstring(resp.text)
  88. lang_links = eval_xpath(dom, "//li/a[contains(@href, 'setmkt')]")
  89. for a in lang_links:
  90. href = eval_xpath(a, './@href')[0]
  91. match = setmkt.search(href)
  92. l_tag = match.groups()[0]
  93. _lang, _nation = l_tag.split('-', 1)
  94. l_tag = _lang.lower() + '-' + _nation.upper()
  95. lang_tags.add(l_tag)
  96. return list(lang_tags)