# SPDX-License-Identifier: AGPL-3.0-or-later
"""CORE_ (COnnecting REpositories) provides a comprehensive bibliographic
database of the world’s scholarly literature, collecting and indexing
research from repositories and journals.

.. _CORE: https://core.ac.uk/about

.. _core engine config:

Configuration
=============

The engine has the following additional settings:

- :py:obj:`api_key`

.. code:: yaml

  - name: core.ac.uk
    engine: core
    categories: science
    shortcut: cor
    api_key: "..."
    timeout: 5

Implementations
===============

"""
# pylint: disable=too-many-branches

from datetime import datetime
from urllib.parse import urlencode

from searx.exceptions import SearxEngineAPIException

about = {
    "website": 'https://core.ac.uk',
    "wikidata_id": 'Q22661180',
    "official_api_documentation": 'https://api.core.ac.uk/docs/v3',
    "use_official_api": True,
    "require_api_key": True,
    "results": 'JSON',
}

api_key = 'unset'
"""For an API key register at https://core.ac.uk/services/api and insert
the API key in the engine :ref:`core engine config`."""

categories = ['science', 'scientific publications']
paging = True
nb_per_page = 10

base_url = 'https://api.core.ac.uk/v3/search/works/'


def request(query, params):
    if api_key == 'unset':
        raise SearxEngineAPIException('missing CORE API key')

    # CORE API v3 query parameters
    search_params = {
        'q': query,
        'offset': (params['pageno'] - 1) * nb_per_page,
        'limit': nb_per_page,
        'sort': 'relevance',
    }

    params['url'] = base_url + '?' + urlencode(search_params)
    params['headers'] = {'Authorization': f'Bearer {api_key}'}

    return params
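
# A hypothetical sketch (comments only, nothing executed) of what ``request``
# returns for ``query='coral reefs'`` and ``params={'pageno': 2}``, assuming
# ``api_key`` has been set in the engine config:
#
#   params['url']     -> 'https://api.core.ac.uk/v3/search/works/?q=coral+reefs&offset=10&limit=10&sort=relevance'
#   params['headers'] -> {'Authorization': 'Bearer <api_key>'}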


def response(resp):
    results = []
    json_data = resp.json()

    for result in json_data.get('results', []):
        # Skip entries without a title
        if not result.get('title'):
            continue

        # Pick a URL: prefer the DOI, then fall back to the CORE record page,
        # the download URL, or the source full-text URLs
        doi = result.get('doi')
        if doi:
            url = f'https://doi.org/{doi}'
        elif result.get('id'):
            url = 'https://core.ac.uk/works/' + str(result['id'])
        elif result.get('downloadUrl'):
            url = result['downloadUrl']
        elif result.get('sourceFulltextUrls'):
            url = result['sourceFulltextUrls']
        else:
            continue

        # Published date
        published_date = None
        raw_date = result.get('publishedDate') or result.get('depositedDate')
        if raw_date:
            try:
                published_date = datetime.fromisoformat(raw_date.replace('Z', '+00:00'))
            except (ValueError, AttributeError):
                pass

        # Handle journals
        journals = []
        if result.get('journals'):
            journals = [j.get('title') for j in result['journals'] if j.get('title')]

        # Handle publisher
        publisher = (result.get('publisher') or '').strip("'")

        # Handle authors
        authors = set()
        for author in result.get('authors', []):
            name = author.get('name')
            if name:
                authors.add(name)

        results.append(
            {
                'template': 'paper.html',
                'title': result.get('title'),
                'url': url,
                'content': result.get('fullText', '') or '',
                # 'comments': '',
                'tags': result.get('fieldOfStudy', []),
                'publishedDate': published_date,
                'type': result.get('documentType', '') or '',
                'authors': authors,
                'editor': ', '.join(result.get('contributors', [])),
                'publisher': publisher,
                'journal': ', '.join(journals),
                'doi': result.get('doi'),
                # 'issn' : ''
                # 'isbn' : ''
                'pdf_url': result.get('downloadUrl') or result.get('sourceFulltextUrls'),
            }
        )

    return results
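

if __name__ == '__main__':
    # Minimal smoke-test sketch, not part of the SearXNG engine interface.
    # It feeds ``response`` a small hand-written payload (the field values
    # below are assumptions modelled on the mapping above) through a stub
    # object that only provides the ``json()`` method the function uses.
    class _FakeResponse:  # stand-in for the HTTP response SearXNG passes in
        @staticmethod
        def json():
            return {
                'results': [
                    {
                        'title': 'An example work',
                        'doi': '10.1234/example',
                        'publishedDate': '2021-05-17T00:00:00Z',
                        'authors': [{'name': 'A. Author'}],
                        'journals': [{'title': 'Example Journal'}],
                        'publisher': "'Example Press'",
                    }
                ]
            }

    for item in response(_FakeResponse()):
        print(item['title'], item['url'], item['publishedDate'])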