# Startpage (Web)
#
# @website     https://startpage.com
# @provide-api no (nothing found)
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, content
#
# @todo        paging

from lxml import html
from cgi import escape
from dateutil import parser
from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['general']
# there is a mechanism to block "bot" searches
# (probably the parameter qid); it requires
# storing qids between multiple search calls
# paging = False
language_support = True

# search-url
base_url = 'https://startpage.com/'
search_url = base_url + 'do/search'

# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct children of div[@id="results"]
results_xpath = '//div[@class="result"]'
link_xpath = './/h3/a'
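# a single result is assumed to look roughly like this
# (inferred from the xpaths used in this module):
#   <div class="result">
#     <h3><a href="...">title</a></h3>
#     <p class="desc clk">snippet</p>
#   </div>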


# do search-request
def request(query, params):
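    # startpage counts hits with an absolute result offset
    # ('startat'), ten results per page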
    offset = (params['pageno'] - 1) * 10

    params['url'] = search_url
    params['method'] = 'POST'
    params['data'] = {'query': query,
                      'startat': offset}

    # set language if specified
    if params['language'] != 'all':
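        # e.g. 'en_US' -> 'lang_en' (only the language part is used)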
        params['data']['with_language'] = ('lang_' + params['language'].split('_')[0])

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(link_xpath)
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        # block google-ad url's
        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
            continue

        # block startpage search url's
        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
            continue

        # block ixquick search url's
        if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
            continue

        title = escape(extract_text(link))

        if result.xpath('./p[@class="desc clk"]'):
            content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
        else:
            content = ''

        published_date = None

        # check if search result starts with something like: "2 Sep 2014 ... "
        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
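            # e.g. content "2 Sep 2014 ... rest" yields
            # date_string "2 Sep 2014" and content "rest"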
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]
            published_date = parser.parse(date_string, dayfirst=True)

            # fix content string
            content = content[date_pos:]

        # check if search result starts with something like: "5 days ago ... "
        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
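            # e.g. content "5 days ago ... rest" yields
            # published_date = now - 5 days and content "rest"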
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]

            # calculate datetime
            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))

            # fix content string
            content = content[date_pos:]

        if published_date:
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content,
                            'publishedDate': published_date})
        else:
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})

    # return results
    return results
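

# minimal smoke-test sketch, not part of the searx engine API: searx
# normally builds the params dict and drives request()/response() itself.
# the 'requests' dependency and the hand-built params dict below are
# assumptions for manual testing only (this also assumes a Python where
# cgi.escape is still available).
if __name__ == '__main__':
    import requests

    params = {'pageno': 1, 'language': 'all'}
    params = request('searx', params)
    resp = requests.post(params['url'], data=params['data'])
    for r in response(resp):
        print('%s  %s' % (r['url'], r['title']))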