# File: internet_archive_scholar.py
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """Internet Archive scholar(science)
  3. """
  4. from datetime import datetime
  5. from urllib.parse import urlencode
  6. from searx.utils import html_to_text
  7. about = {
  8. "website": "https://scholar.archive.org/",
  9. "wikidata_id": "Q115667709",
  10. "official_api_documentation": "https://scholar.archive.org/api/redoc",
  11. "use_official_api": True,
  12. "require_api_key": False,
  13. "results": "JSON",
  14. }
  15. categories = ['science', 'scientific publications']
  16. paging = True
  17. base_url = "https://scholar.archive.org"
  18. results_per_page = 15
  19. def request(query, params):
  20. args = {
  21. "q": query,
  22. "limit": results_per_page,
  23. "offset": (params["pageno"] - 1) * results_per_page,
  24. }
  25. params["url"] = f"{base_url}/search?{urlencode(args)}"
  26. params["headers"]["Accept"] = "application/json"
  27. return params
  28. def response(resp):
  29. results = []
  30. json = resp.json()
  31. for result in json["results"]:
  32. publishedDate, content, doi = None, '', None
  33. if result['biblio'].get('release_date'):
  34. publishedDate = datetime.strptime(result['biblio']['release_date'], "%Y-%m-%d")
  35. if len(result['abstracts']) > 0:
  36. content = result['abstracts'][0].get('body')
  37. elif len(result['_highlights']) > 0:
  38. content = result['_highlights'][0]
  39. if len(result['releases']) > 0:
  40. doi = result['releases'][0].get('doi')
  41. results.append(
  42. {
  43. 'template': 'paper.html',
  44. 'url': result['fulltext']['access_url'],
  45. 'title': result['biblio'].get('title') or result['biblio'].get('container_name'),
  46. 'content': html_to_text(content),
  47. 'publisher': result['biblio'].get('publisher'),
  48. 'doi': doi,
  49. 'journal': result['biblio'].get('container_name'),
  50. 'authors': result['biblio'].get('contrib_names'),
  51. 'tags': result['tags'],
  52. 'publishedDate': publishedDate,
  53. 'issns': result['biblio'].get('issns'),
  54. 'pdf_url': result['fulltext'].get('access_url'),
  55. }
  56. )
  57. return results