"""
 Bing (News)

 @website     https://www.bing.com/news
 @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
              max. 5000 query/month

 @using-api   no (because of query limit)
 @results     HTML (using search portal)
 @stable      no (HTML can change)
 @parse       url, title, content, publishedDate
"""

from urllib import urlencode
from cgi import escape
from lxml import html
from datetime import datetime, timedelta
from dateutil import parser
import re
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}'
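# {query} is filled with the urlencoded q/setmkt pair built in request(), and
# {offset} with the 1-based index of the first result on the requested page
# (Bing News serves 10 results per page, per the offset formula below).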


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        language = 'en-US'
    else:
        language = params['language'].replace('_', '-')

    search_path = search_string.format(
        query=urlencode({'q': query, 'setmkt': language}),
        offset=offset)

    params['cookies']['_FP'] = "ui=en-US"

    params['url'] = base_url + search_path

    return params
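
# A minimal sketch (an assumption, not part of the engine) of how request()
# is driven. searx normally supplies `params`; the page number and language
# below are made up for illustration, and the order of the query-string
# parameters may vary since urlencode() receives a plain dict:
#
#     params = {'pageno': 2, 'language': 'de_DE', 'cookies': {}}
#     request('searx', params)
#     # params['url'] is now e.g.
#     # https://www.bing.com/news/search?q=searx&setmkt=de-DE&first=11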


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath('//div[@class="sn_r"]'):
        link = result.xpath('.//div[@class="newstitle"]/a')[0]
        url = link.attrib.get('href')
        title = extract_text(link)
        contentXPath = result.xpath('.//div[@class="sn_txt"]/div'
                                    '//span[@class="sn_snip"]')
        content = escape(extract_text(contentXPath))

        # parse publishedDate
        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
                                          '//span[contains(@class,"sn_ST")]'
                                          '//span[contains(@class,"sn_tm")]')
        publishedDate = escape(extract_text(publishedDateXPath))

        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(minutes=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(days=int(timeNumbers[0]))
        else:
            try:
                publishedDate = parser.parse(publishedDate, dayfirst=False)
            except TypeError:
                publishedDate = datetime.now()
            except ValueError:
                publishedDate = datetime.now()

        # append result
        results.append({'url': url,
                        'title': title,
                        'publishedDate': publishedDate,
                        'content': content})

    # return results
    return results
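

# A minimal sketch (an assumption, not shipped with searx) for exercising
# response() outside the engine framework. FakeResponse and the sample file
# name are made up for illustration; point it at a saved copy of a Bing News
# result page to eyeball the parsed fields.
if __name__ == '__main__':
    class FakeResponse(object):
        """Mimics the only attribute of the HTTP response that response() reads."""
        def __init__(self, content):
            self.content = content

    with open('bing_news_sample.html', 'rb') as f:
        for r in response(FakeResponse(f.read())):
            print('%s | %s' % (r['title'], r['url']))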