bing_news.py

## Bing (News)
#
# @website     https://www.bing.com/news
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
#              max. 5000 query/month
#
# @using-api   no (because of query limit)
# @results     HTML (using search portal)
# @stable      no (HTML can change)
# @parse       url, title, content, publishedDate

from urllib import urlencode
from cgi import escape
from lxml import html
from datetime import datetime, timedelta
from dateutil import parser
import re

# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        language = 'en-US'
    else:
        language = params['language'].replace('_', '-')

    search_path = search_string.format(
        query=urlencode({'q': query, 'setmkt': language}),
        offset=offset)

    params['cookies']['_FP'] = "ui=en-US"

    params['url'] = base_url + search_path

    return params
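
# Illustrative example (hypothetical values, not from the source): for
# query 'climate' on page 2 with language 'fr_FR', request() builds
#   https://www.bing.com/news/search?q=climate&setmkt=fr-FR&first=11
# since offset = (2 - 1) * 10 + 1 = 11; urlencode may emit the two query
# parameters in either order.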


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath('//div[@class="sn_r"]'):
        link = result.xpath('.//div[@class="newstitle"]/a')[0]
        url = link.attrib.get('href')
        title = ' '.join(link.xpath('.//text()'))
        contentXPath = result.xpath('.//div[@class="sn_txt"]/div'
                                    '//span[@class="sn_snip"]//text()')
        if contentXPath is not None:
            content = escape(' '.join(contentXPath))

        # parse publishedDate
        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
                                          '//span[contains(@class,"sn_ST")]'
                                          '//span[contains(@class,"sn_tm")]'
                                          '//text()')
        if publishedDateXPath is not None:
            publishedDate = escape(' '.join(publishedDateXPath))

        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(minutes=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|),"
                      " [0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(days=int(timeNumbers[0]))
        else:
            try:
                # FIXME use params['language'] to parse either mm/dd or dd/mm
                publishedDate = parser.parse(publishedDate, dayfirst=False)
            except TypeError:
                # FIXME
                publishedDate = datetime.now()

        # append result
        results.append({'url': url,
                        'title': title,
                        'publishedDate': publishedDate,
                        'content': content})

    # return results
    return results
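
# Quick illustration of the relative-date branches above (interpreter
# session, assumed output):
#
#   >>> import re
#   >>> bool(re.match("^[0-9]+ minute(s|) ago$", "25 minutes ago"))
#   True
#   >>> re.findall(r'\d+', "2 hours, 15 minutes ago")
#   ['2', '15']
#
# So "25 minutes ago" becomes datetime.now() - timedelta(minutes=25), and
# any string not matching a relative pattern falls through to dateutil's
# parser.parse().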