# Startpage (Web)
#
# @website     https://startpage.com
# @provide-api no (nothing found)
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, content
#
# @todo        paging

from lxml import html
from dateutil import parser
from datetime import datetime, timedelta
import re
from unicodedata import normalize, combining

from babel import Locale
from babel.localedata import locale_identifiers

from searx.utils import extract_text, eval_xpath, match_language

# engine dependent config
categories = ['general']
# there is a mechanism to block "bot" searches
# (probably the qid parameter), which requires
# storing qids between multiple search calls
paging = True
language_support = True

supported_languages_url = 'https://www.startpage.com/do/settings'

# search-url
base_url = 'https://startpage.com/'
search_url = base_url + 'do/search'

# specific xpath variables
# ads xpath: //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: the div[@class="result"] elements are the direct children of div[@id="results"]
results_xpath = '//div[@class="w-gl__result__main"]'
link_xpath = './/a[@class="w-gl__result-url result-link"]'
content_xpath = './/p[@class="w-gl__description"]'
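
# Illustrative sketch of the result markup the XPaths above are written
# against. This is an assumption reconstructed from the expressions
# themselves, not markup copied from a live Startpage page:
#
#   <div class="w-gl__result__main">
#     <a class="w-gl__result-url result-link" href="https://example.org/">Example title</a>
#     <p class="w-gl__description">2 Sep 2014 ... Example snippet text.</p>
#   </div>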


# do search-request
def request(query, params):
    params['url'] = search_url
    params['method'] = 'POST'
    params['data'] = {
        'query': query,
        'page': params['pageno'],
        'cat': 'web',
        'cmd': 'process_search',
        'engine0': 'v1all',
    }

    # set language if specified
    if params['language'] != 'all':
        lang_code = match_language(params['language'], supported_languages, fallback=None)
        if lang_code:
            language_name = supported_languages[lang_code]['alias']
            params['data']['language'] = language_name
            params['data']['lui'] = language_name

    return params
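
# Minimal usage sketch for request(), assuming a hand-built `params` dict.
# In searx the framework supplies and consumes this dict; the values below
# are hypothetical:
#
#   params = {'pageno': 1, 'language': 'all'}
#   params = request('free software', params)
#   assert params['method'] == 'POST'
#   assert params['data']['query'] == 'free software'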


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath(dom, results_xpath):
        links = eval_xpath(result, link_xpath)
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        # block Google ad URLs
        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
            continue

        # block Startpage search URLs
        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
            continue

        title = extract_text(link)

        if eval_xpath(result, content_xpath):
            content = extract_text(eval_xpath(result, content_xpath))
        else:
            content = ''

        published_date = None

        # check if the search result starts with something like: "2 Sep 2014 ... "
        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]
            # strip the date prefix from the content string
            content = content[date_pos:]

            try:
                published_date = parser.parse(date_string, dayfirst=True)
            except ValueError:
                pass

        # check if the search result starts with something like: "5 days ago ... "
        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]

            # calculate the datetime from the day offset
            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))

            # strip the date prefix from the content string
            content = content[date_pos:]

        if published_date:
            # append result with publication date
            results.append({'url': url,
                            'title': title,
                            'content': content,
                            'publishedDate': published_date})
        else:
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})

    # return results
    return results
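
# Worked example of the date-prefix handling above on a hypothetical
# snippet (pure string slicing, no network access involved):
#
#   content = '2 Sep 2014 ... Lorem ipsum dolor sit amet.'
#   date_pos = content.find('...') + 4        # 15, the index just past "... "
#   date_string = content[0:date_pos - 5]     # '2 Sep 2014'
#   content = content[date_pos:]              # 'Lorem ipsum dolor sit amet.'
#   parser.parse(date_string, dayfirst=True)  # datetime(2014, 9, 2, 0, 0)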


# get supported languages from their site
def _fetch_supported_languages(resp):
    # Startpage's language selector is a mess: each option has a displayed
    # name and a value, either of which may represent the language name in
    # the native script, the language name in English, an English
    # transliteration of the native name, the English name of the writing
    # script used by the language, or occasionally something else entirely.
    # These cases are so special that they need to be hardcoded; a couple
    # of them are misspellings.
    language_names = {
        'english_uk': 'en-GB',
        'fantizhengwen': ['zh-TW', 'zh-HK'],
        'hangul': 'ko',
        'malayam': 'ml',
        'norsk': 'nb',
        'sinhalese': 'si',
        'sudanese': 'su'
    }

    # get the English name of every language known by babel
    language_names.update({name.lower(): lang_code
                           for lang_code, name in Locale('en')._data['languages'].items()})

    # get the native name of every language known by babel
    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()):
        native_name = Locale(lang_code).get_language_name().lower()
        # add the native name exactly as it is
        language_names[native_name] = lang_code

        # add the "normalized" language name (i.e. français becomes francais and español becomes espanol)
        unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
        if len(unaccented_name) == len(unaccented_name.encode()):
            # add only if the result is pure ASCII (otherwise "normalization" didn't work)
            language_names[unaccented_name] = lang_code

    dom = html.fromstring(resp.text)
    sp_lang_names = []
    for option in dom.xpath('//form[@id="settings-form"]//select[@name="language"]/option'):
        sp_lang_names.append((option.get('value'), extract_text(option).lower()))

    supported_languages = {}
    for sp_option_value, sp_option_text in sp_lang_names:
        lang_code = language_names.get(sp_option_value) or language_names.get(sp_option_text)
        if isinstance(lang_code, str):
            supported_languages[lang_code] = {'alias': sp_option_value}
        elif isinstance(lang_code, list):
            for lc in lang_code:
                supported_languages[lc] = {'alias': sp_option_value}
        else:
            print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text))

    return supported_languages
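
# Illustrative shape of the mapping returned above. The 'english_uk' and
# 'hangul' aliases come from the hardcoded table; the exact set of keys
# depends on Startpage's settings page at fetch time:
#
#   {'en-GB': {'alias': 'english_uk'},
#    'ko': {'alias': 'hangul'},
#    ...}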