
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google Scholar engine.

Compared to other Google services the Scholar engine has a simple GET REST-API
and there is no `async` API.  Even though the API is slightly vintage, we can
make use of the :ref:`google API` to assemble the arguments of the GET request.
"""

from typing import TYPE_CHECKING
from typing import Optional
from urllib.parse import urlencode
from datetime import datetime
from lxml import html

from searx.utils import (
    eval_xpath,
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
)

from searx.exceptions import SearxEngineCaptchaException

from searx.engines.google import fetch_traits  # pylint: disable=unused-import
from searx.engines.google import (
    get_google_info,
    time_range_dict,
)
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# about
about = {
    "website": 'https://scholar.google.com',
    "wikidata_id": 'Q494817',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['science', 'scientific publications']
paging = True
max_page = 50
language_support = True
time_range_support = True
safesearch = False
send_accept_language_header = True


def time_range_args(params):
    """Returns a dictionary with time range arguments based on
    ``params['time_range']``.

    Google Scholar supports a detailed search by year.  Searching by *last
    month* or *last week* (as offered by SearXNG) is uncommon for scientific
    publications and is not supported by Google Scholar.

    To limit the result list when the user selects a range, all SearXNG
    ranges (*day*, *week*, *month*, *year*) are mapped to *year*.  If no range
    is set, an empty dictionary of arguments is returned.  Example: when the
    user selects a time range (current year minus one in 2022):

    .. code:: python

        { 'as_ylo' : 2021 }

    """
    ret_val = {}
    if params['time_range'] in time_range_dict:
        ret_val['as_ylo'] = datetime.now().year - 1
    return ret_val
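
# Illustrative only (not part of the engine's logic): in 2022, selecting any
# SearXNG time range is mapped to the previous year, while no selection adds
# no argument at all:
#
#   time_range_args({'time_range': 'month'})  # -> {'as_ylo': 2021}
#   time_range_args({'time_range': None})     # -> {}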


def detect_google_captcha(dom):
    """In case of a CAPTCHA, Google Scholar opens its own *not a Robot* dialog
    and is not redirected to ``sorry.google.com``.
    """
    if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
        raise SearxEngineCaptchaException()


def request(query, params):
    """Google-Scholar search request"""

    google_info = get_google_info(params, traits)
    # subdomain is: scholar.google.xy
    google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")

    args = {
        'q': query,
        **google_info['params'],
        'start': (params['pageno'] - 1) * 10,
        'as_sdt': '2007',  # include patents / to disable set '0,5'
        'as_vis': '0',  # include citations / to disable set '1'
    }
    args.update(time_range_args(params))

    params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
    params['cookies'] = google_info['cookies']
    params['headers'].update(google_info['headers'])
    return params
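
# Illustrative example (not part of the engine; the exact parameters depend on
# the locale settings resolved by get_google_info): a query for
# "machine learning" on page 2 with a time range selected in 2023 yields a URL
# roughly of the form
#
#   https://scholar.google.com/scholar?q=machine+learning&...&start=10&as_sdt=2007&as_vis=0&as_ylo=2022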


def parse_gs_a(text: Optional[str]):
    """Parse the text written in green.

    Possible formats:
    * "{authors} - {journal}, {year} - {publisher}"
    * "{authors} - {year} - {publisher}"
    * "{authors} - {publisher}"
    """
    if text is None or text == "":
        return None, None, None, None

    s_text = text.split(' - ')
    authors = s_text[0].split(', ')
    publisher = s_text[-1]
    if len(s_text) != 3:
        return authors, None, publisher, None

    # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
    # get journal and year
    journal_year = s_text[1].split(', ')
    # journal is optional and may contain some commas
    if len(journal_year) > 1:
        journal = ', '.join(journal_year[0:-1])
        if journal == '…':
            journal = None
    else:
        journal = None
    # year
    year = journal_year[-1]
    try:
        publishedDate = datetime.strptime(year.strip(), '%Y')
    except ValueError:
        publishedDate = None

    return authors, journal, publisher, publishedDate
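
# Illustrative examples (made-up values, not taken from a real result page) of
# how parse_gs_a() splits the green line:
#
#   parse_gs_a('J Doe, A Smith - Nature, 2021 - nature.com')
#   # -> (['J Doe', 'A Smith'], 'Nature', 'nature.com', datetime(2021, 1, 1))
#
#   parse_gs_a('J Doe - 2021 - nature.com')
#   # -> (['J Doe'], None, 'nature.com', datetime(2021, 1, 1))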


def response(resp):  # pylint: disable=too-many-locals
    """Parse response from Google Scholar"""
    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)
    detect_google_captcha(dom)

    # parse results
    for result in eval_xpath_list(dom, '//div[@data-rp]'):

        title = extract_text(eval_xpath(result, './/h3[1]//a'))
        if not title:
            # this is a [ZITATION] block
            continue

        pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
        if pub_type:
            pub_type = pub_type[1:-1].lower()

        url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
        content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
        authors, journal, publisher, publishedDate = parse_gs_a(
            extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
        )
        # guard against a missing green line, where publisher is None
        if publisher and publisher in url:
            publisher = None

        # cited by
        comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))

        # link to the html or pdf document
        html_url = None
        pdf_url = None
        doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
        doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
        if doc_type == "[PDF]":
            pdf_url = doc_url
        else:
            html_url = doc_url

        results.append(
            {
                'template': 'paper.html',
                'type': pub_type,
                'url': url,
                'title': title,
                'authors': authors,
                'publisher': publisher,
                'journal': journal,
                'publishedDate': publishedDate,
                'content': content,
                'comments': comments,
                'html_url': html_url,
                'pdf_url': pdf_url,
            }
        )

    # parse suggestion
    for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
        results.append({'correction': extract_text(correction)})

    return results
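
# Illustrative shape of the returned list (values are made up): paper entries
# rendered with the 'paper.html' template, optionally followed by suggestion
# and correction entries:
#
#   [
#       {'template': 'paper.html', 'title': 'Some paper title', 'url': '...', ...},
#       {'suggestion': 'some related query'},
#       {'correction': 'corrected query'},
#   ]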