semantic_scholar.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Semantic Scholar (Science)
"""

from json import dumps, loads
from datetime import datetime

from flask_babel import gettext

about = {
    "website": 'https://www.semanticscholar.org/',
    "wikidata_id": 'Q22908627',
    "official_api_documentation": 'https://api.semanticscholar.org/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

categories = ['science', 'scientific publications']
paging = True

search_url = 'https://www.semanticscholar.org/api/1/search'
paper_url = 'https://www.semanticscholar.org/paper'
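
# request() fills in the URL, method, header and JSON body of the POST that
# SearXNG will send; `pageno` is passed through as the site's own page number,
# with a fixed page size of 10 results.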
def request(query, params):
    params['url'] = search_url
    params['method'] = 'POST'
    params['headers']['content-type'] = 'application/json'
    params['data'] = dumps(
        {
            "queryString": query,
            "page": params['pageno'],
            "pageSize": 10,
            "sort": "relevance",
            "getQuerySuggestions": False,
            "authors": [],
            "coAuthors": [],
            "venues": [],
            "performTitleMatch": True,
        }
    )
    return params
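
# Shape of the JSON this engine parses (inferred from the code below;
# illustrative only, not an official schema, and fields the code treats as
# optional may be missing):
#
#   {"results": [{
#       "id": "...",
#       "title": {"text": "..."},
#       "paperAbstract": {"text": "..."},
#       "primaryPaperLink": {"url": "..."},
#       "links": ["..."],
#       "alternatePaperLinks": [{"url": "...", "linkType": "..."}],
#       "pubDate": "YYYY-MM-DD",
#       "authors": [[{"name": "..."}]],
#       "venue": {"text": "..."},
#       "journal": {"name": "..."},
#       "doiInfo": {"doi": "..."},
#       "fieldsOfStudy": ["..."],
#       "citationStats": {"numCitations": ...,
#                         "firstCitationVelocityYear": ...,
#                         "lastCitationVelocityYear": ...}
#   }]}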
def response(resp):
    res = loads(resp.text)
    results = []

    for result in res['results']:
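        # URL preference order: primaryPaperLink, then the first entry in
        # 'links', then the first alternate paper link, and finally the
        # canonical /paper/<id> page as a fallback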
        url = result.get('primaryPaperLink', {}).get('url')
        if not url and result.get('links'):
            url = result.get('links')[0]
        if not url:
            alternatePaperLinks = result.get('alternatePaperLinks')
            if alternatePaperLinks:
                url = alternatePaperLinks[0].get('url')
        if not url:
            url = paper_url + '/%s' % result['id']

        # publishedDate
        if 'pubDate' in result:
            publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d")
        else:
            publishedDate = None
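
        # authors: each entry appears to be a list of name variants for a
        # single author; the first variant is used (an assumption read off
        # the indexing below, not from documented behavior)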
        authors = [author[0]['name'] for author in result.get('authors', [])]
        # pick the first alternate link that comes neither from the crawler
        # nor from a DOI resolver
        pdf_url = None
        for doc in result.get('alternatePaperLinks', []):
            if doc['linkType'] not in ('crawler', 'doi'):
                pdf_url = doc['url']
                break

        # comments: citation count and the year span it covers
        comments = None
        if 'citationStats' in result:
            comments = gettext(
                '{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}'
            ).format(
                numCitations=result['citationStats']['numCitations'],
                firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'],
                lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'],
            )
        results.append(
            {
                'template': 'paper.html',
                'url': url,
                'title': result['title']['text'],
                'content': result['paperAbstract']['text'],
                'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'),
                'doi': result.get('doiInfo', {}).get('doi'),
                'tags': result.get('fieldsOfStudy'),
                'authors': authors,
                'pdf_url': pdf_url,
                'publishedDate': publishedDate,
                'comments': comments,
            }
        )

    return results
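
# Minimal offline sketch (illustrative, not part of the engine): response()
# only needs an object with a `.text` attribute carrying the JSON body, so it
# can be exercised without SearXNG's network layer. `_Stub` is hypothetical.
#
#   class _Stub:
#       text = '{"results": [{"id": "abc123", "title": {"text": "A Paper"}, "paperAbstract": {"text": "..."}}]}'
#
#   response(_Stub())  # one 'paper.html' result whose url falls back to paper_url + '/abc123'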