# SPDX-License-Identifier: AGPL-3.0-or-later
"""CORE_ (COnnecting REpositories) provides a comprehensive bibliographic
database of the world's scholarly literature, collecting and indexing research
from repositories and journals.

.. _CORE: https://core.ac.uk/about
.. _core engine config:

Configuration
=============

The engine has the following additional settings:

- :py:obj:`api_key`

.. code:: yaml

  - name: core.ac.uk
    engine: core
    categories: science
    shortcut: cor
    api_key: "..."
    timeout: 5

Implementations
===============

"""
# pylint: disable=too-many-branches

from datetime import datetime
from urllib.parse import urlencode

from searx.exceptions import SearxEngineAPIException

about = {
    "website": 'https://core.ac.uk',
    "wikidata_id": 'Q22661180',
    "official_api_documentation": 'https://api.core.ac.uk/docs/v3',
    "use_official_api": True,
    "require_api_key": True,
    "results": 'JSON',
}

api_key = 'unset'
"""For an API key register at https://core.ac.uk/services/api and insert
the API key in the engine :ref:`core engine config`."""

categories = ['science', 'scientific publications']
paging = True
nb_per_page = 10
base_url = 'https://api.core.ac.uk/v3/search/works/'


def request(query, params):
    if api_key == 'unset':
        raise SearxEngineAPIException('missing CORE API key')

    # API v3 query parameters
    search_params = {
        'q': query,
        'offset': (params['pageno'] - 1) * nb_per_page,
        'limit': nb_per_page,
        'sort': 'relevance',
    }

    params['url'] = base_url + '?' + urlencode(search_params)
    params['headers'] = {'Authorization': f'Bearer {api_key}'}

    return params


def response(resp):
    results = []
    json_data = resp.json()

    for result in json_data.get('results', []):

        # skip results without a title
        if not result.get('title'):
            continue

        # pick a URL: prefer the DOI, then the CORE work page, then full-text links
        url = None
        doi = result.get('doi')
        if doi:
            url = 'https://doi.org/' + str(doi)
        elif result.get('id'):
            url = 'https://core.ac.uk/works/' + str(result['id'])
        elif result.get('downloadUrl'):
            url = result['downloadUrl']
        elif result.get('sourceFulltextUrls'):
            url = result['sourceFulltextUrls']
        else:
            continue

        # published date
        published_date = None
        raw_date = result.get('publishedDate') or result.get('depositedDate')
        if raw_date:
            try:
                published_date = datetime.fromisoformat(raw_date.replace('Z', '+00:00'))
            except (ValueError, AttributeError):
                pass

        # journals
        journals = []
        if result.get('journals'):
            journals = [j.get('title') for j in result['journals'] if j.get('title')]

        # publisher (CORE sometimes wraps the name in single quotes)
        publisher = (result.get('publisher') or '').strip("'")

        # authors
        authors = set()
        for author in result.get('authors', []):
            name = author.get('name')
            if name:
                authors.add(name)

        results.append(
            {
                'template': 'paper.html',
                'title': result.get('title'),
                'url': url,
                'content': result.get('fullText') or '',
                # 'comments': '',
                'tags': result.get('fieldOfStudy', []),
                'publishedDate': published_date,
                'type': result.get('documentType') or '',
                'authors': authors,
                'editor': ', '.join(result.get('contributors', [])),
                'publisher': publisher,
                'journal': ', '.join(journals),
                'doi': result.get('doi'),
                # 'issn' : ''
                # 'isbn' : ''
                'pdf_url': result.get('downloadUrl') or result.get('sourceFulltextUrls') or '',
            }
        )

    return results
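

# The block below is an illustrative, offline sketch of the payload shape that
# ``response()`` expects from the CORE v3 ``/search/works`` endpoint.  The field
# values are made up and the ``_FakeResponse`` helper is hypothetical (only the
# field names used by the parser above are assumed); running it requires a
# SearXNG checkout so that ``searx.exceptions`` is importable.  It is not part
# of the SearXNG engine interface.
if __name__ == '__main__':

    class _FakeResponse:  # stand-in for the HTTP response object
        def json(self):
            return {
                'results': [
                    {
                        'title': 'An example work',
                        'doi': '10.1000/example',
                        'publishedDate': '2020-01-01T00:00:00Z',
                        'authors': [{'name': 'Doe, Jane'}],
                        'journals': [{'title': 'Example Journal'}],
                        'publisher': "'Example Press'",
                    }
                ]
            }

    for item in response(_FakeResponse()):
        print(item['title'], item['url'], item['publishedDate'])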