# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Google (Scholar)

For a detailed description of the *REST-full* API see: `Query Parameter
Definitions`_.

.. _Query Parameter Definitions:
   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
"""
# pylint: disable=invalid-name

from urllib.parse import urlencode
from datetime import datetime
from typing import Optional
from lxml import html

from searx.utils import (
    eval_xpath,
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
)

from searx.engines.google import (
    get_lang_info,
    time_range_dict,
    detect_google_sorry,
)

# pylint: disable=unused-import
from searx.engines.google import (
    supported_languages_url,
    _fetch_supported_languages,
)

# pylint: enable=unused-import

# about
about = {
    "website": 'https://scholar.google.com',
    "wikidata_id": 'Q494817',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['science', 'scientific publications']
paging = True
language_support = True
use_locale_domain = True
time_range_support = True
safesearch = False
send_accept_language_header = True


def time_range_url(params):
    """Returns a URL query component for a Google Scholar time range based on
    ``params['time_range']``.  Google Scholar only supports ranges in years.
    To have any effect, all the Searx ranges (*day*, *week*, *month*, *year*)
    are mapped to *year*.  If no range is set, only the bare ``&`` separator is
    returned.

    Example::

        &as_ylo=2019
    """
    # as_ylo=2016&as_yhi=2019
    ret_val = ''
    if params['time_range'] in time_range_dict:
        ret_val = urlencode({'as_ylo': datetime.now().year - 1})
    return '&' + ret_val
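

# Illustrative behaviour of time_range_url() -- the year value is an assumption
# for a request issued in 2023; every Searx range collapses to "since last year":
#
#   time_range_url({'time_range': 'month'})  # --> '&as_ylo=2022'
#   time_range_url({'time_range': None})     # --> '&'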


def request(query, params):
    """Google-Scholar search request"""

    offset = (params['pageno'] - 1) * 10
    lang_info = get_lang_info(params, supported_languages, language_aliases, False)

    # subdomain is: scholar.google.xy
    lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")

    query_url = (
        'https://'
        + lang_info['subdomain']
        + '/scholar'
        + "?"
        + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset})
    )

    query_url += time_range_url(params)
    params['url'] = query_url

    params['cookies']['CONSENT'] = "YES+"
    params['headers'].update(lang_info['headers'])
    params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

    # params['google_subdomain'] = subdomain
    return params
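

# Illustrative shape of the URL built by request() -- subdomain and year are
# assumptions, and the language parameters added by get_lang_info() are omitted:
#
#   query='searx', pageno=2, time_range='year'
#   --> 'https://scholar.google.com/scholar?q=searx&...&ie=utf8&oe=utf8&start=10&as_ylo=2022'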


def parse_gs_a(text: Optional[str]):
    """Parse the text written in green.

    Possible formats:
    * "{authors} - {journal}, {year} - {publisher}"
    * "{authors} - {year} - {publisher}"
    * "{authors} - {publisher}"
    """
    if text is None or text == "":
        return None, None, None, None

    s_text = text.split(' - ')
    authors = s_text[0].split(', ')
    publisher = s_text[-1]
    if len(s_text) != 3:
        return authors, None, publisher, None

    # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
    # get journal and year
    journal_year = s_text[1].split(', ')
    # journal is optional and may contain commas
    if len(journal_year) > 1:
        journal = ', '.join(journal_year[0:-1])
        if journal == '…':
            journal = None
    else:
        journal = None
    # year
    year = journal_year[-1]
    try:
        publishedDate = datetime.strptime(year.strip(), '%Y')
    except ValueError:
        publishedDate = None

    return authors, journal, publisher, publishedDate
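

# Illustrative parses -- the inputs below are made-up examples of the green
# "gs_a" line, not data taken from Google Scholar:
#
#   parse_gs_a('J Smith, A Doe - Nature, 2019 - nature.com')
#   --> (['J Smith', 'A Doe'], 'Nature', 'nature.com', datetime(2019, 1, 1))
#
#   parse_gs_a('J Smith - 2019 - example.org')
#   --> (['J Smith'], None, 'example.org', datetime(2019, 1, 1))
#
#   parse_gs_a('J Smith - example.org')
#   --> (['J Smith'], None, 'example.org', None)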


def response(resp):  # pylint: disable=too-many-locals
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, '//div[@data-cid]'):

        title = extract_text(eval_xpath(result, './/h3[1]//a'))
        if not title:
            # this is a [ZITATION] block
            continue

        pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]'))
        if pub_type:
            pub_type = pub_type[1:-1].lower()

        url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
        content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))

        authors, journal, publisher, publishedDate = parse_gs_a(
            extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
        )
        if publisher and publisher in url:
            publisher = None

        # cited by
        comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))

        # link to the html or pdf document
        html_url = None
        pdf_url = None
        doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
        doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
        if doc_type == "[PDF]":
            pdf_url = doc_url
        else:
            html_url = doc_url

        results.append(
            {
                'template': 'paper.html',
                'type': pub_type,
                'url': url,
                'title': title,
                'authors': authors,
                'publisher': publisher,
                'journal': journal,
                'publishedDate': publishedDate,
                'content': content,
                'comments': comments,
                'html_url': html_url,
                'pdf_url': pdf_url,
            }
        )

    # parse suggestion
    for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
        results.append({'correction': extract_text(correction)})

    return results