# SPDX-License-Identifier: AGPL-3.0-or-later
"""Semantic Scholar (Science)"""

from json import dumps
from datetime import datetime

from lxml import html
from flask_babel import gettext

from searx.network import get
from searx.utils import eval_xpath_getindex, gen_useragent, html_to_text

about = {
    "website": 'https://www.semanticscholar.org/',
    "wikidata_id": 'Q22908627',
    "official_api_documentation": 'https://api.semanticscholar.org/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

categories = ['science', 'scientific publications']
paging = True
search_url = 'https://www.semanticscholar.org/api/1/search'
base_url = 'https://www.semanticscholar.org'
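

# The search endpoint above is the one the Semantic Scholar web app uses
# itself; it is assumed to require an X-S2-UI-Version header matching the
# currently deployed frontend build.  _get_ui_version() therefore scrapes
# that version from the homepage, where the XPath below expects a tag of
# the form <meta name="s2-ui-version" content="...">.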
def _get_ui_version():
    resp = get(base_url)
    if not resp.ok:
        raise RuntimeError("Can't determine Semantic Scholar UI version")
    doc = html.fromstring(resp.text)
    ui_version = eval_xpath_getindex(doc, "//meta[@name='s2-ui-version']/@content", 0)
    if not ui_version:
        raise RuntimeError("Can't determine Semantic Scholar UI version")
    return ui_version


def request(query, params):
    params['url'] = search_url
    params['method'] = 'POST'
    params['headers'] = {
        'Content-Type': 'application/json',
        'X-S2-UI-Version': _get_ui_version(),
        'X-S2-Client': "webapp-browser",
        'User-Agent': gen_useragent(),
    }
    # the endpoint expects the search parameters as a JSON document in the
    # POST body
    params['data'] = dumps(
        {
            "queryString": query,
            "page": params['pageno'],
            "pageSize": 10,
            "sort": "relevance",
            "getQuerySuggestions": False,
            "authors": [],
            "coAuthors": [],
            "venues": [],
            "performTitleMatch": True,
        }
    )
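    # For illustration (an assumed example, not captured from a live request):
    # with query "graph neural networks" and pageno 1, params['data'] is
    #   {"queryString": "graph neural networks", "page": 1, "pageSize": 10,
    #    "sort": "relevance", "getQuerySuggestions": false, "authors": [],
    #    "coAuthors": [], "venues": [], "performTitleMatch": true}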
    return params


def response(resp):
    res = resp.json()
    results = []

    for result in res['results']:
        # resolve the paper URL: primary link first, then the first generic
        # link, then the first alternate link, and finally the Semantic
        # Scholar paper page itself
        url = result.get('primaryPaperLink', {}).get('url')
        if not url and result.get('links'):
            url = result.get('links')[0]
        if not url:
            alternatePaperLinks = result.get('alternatePaperLinks')
            if alternatePaperLinks:
                url = alternatePaperLinks[0].get('url')
        if not url:
            url = base_url + '/paper/%s' % result['id']

        # publishedDate: guard against a missing or empty value, which would
        # otherwise crash strptime()
        if 'pubDate' in result and result['pubDate']:
            publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d")
        else:
            publishedDate = None

        # authors: each entry appears to be a list of name variants; take the
        # name of the first one
        authors = [author[0]['name'] for author in result.get('authors', [])]

        # PDF: pick the first alternate link that is neither the crawler copy
        # nor a DOI resolver
        pdf_url = None
        for doc in result.get('alternatePaperLinks', []):
            if doc['linkType'] not in ('crawler', 'doi'):
                pdf_url = doc['url']
                break

        # comments: a short citation summary attached to the result
        comments = None
        if 'citationStats' in result:
            comments = gettext(
                '{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}'
            ).format(
                numCitations=result['citationStats']['numCitations'],
                firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'],
                lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'],
            )
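        # renders as e.g. "42 citations from the year 2019 to 2024"
        # (illustrative figures, not real data)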

        results.append(
            {
                'template': 'paper.html',
                'url': url,
                'title': result['title']['text'],
                'content': html_to_text(result['paperAbstract']['text']),
                'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'),
                'doi': result.get('doiInfo', {}).get('doi'),
                'tags': result.get('fieldsOfStudy'),
                'authors': authors,
                'pdf_url': pdf_url,
                'publishedDate': publishedDate,
                'comments': comments,
            }
        )

    return results
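

# Minimal sketch of how SearXNG drives this engine (assumed flow, for
# illustration only; the framework performs the HTTP request itself between
# the two calls):
#
#   params = request('attention is all you need', {'pageno': 1})
#   resp = <POST params['data'] to params['url'] with params['headers']>
#   results = response(resp)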