internet_archive_scholar.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
"""Internet Archive Scholar (science)
"""
  5. from datetime import datetime
  6. from urllib.parse import urlencode
  7. from searx.utils import html_to_text
  8. about = {
  9. "website": "https://scholar.archive.org/",
  10. "wikidata_id": "Q115667709",
  11. "official_api_documentation": "https://scholar.archive.org/api/redoc",
  12. "use_official_api": True,
  13. "require_api_key": False,
  14. "results": "JSON",
  15. }
  16. categories = ['science', 'scientific publications']
  17. paging = True
  18. base_url = "https://scholar.archive.org"
  19. results_per_page = 15
  20. def request(query, params):
  21. args = {
  22. "q": query,
  23. "limit": results_per_page,
  24. "offset": (params["pageno"] - 1) * results_per_page,
  25. }
  26. params["url"] = f"{base_url}/search?{urlencode(args)}"
  27. params["headers"]["Accept"] = "application/json"
  28. return params
  29. def response(resp):
  30. results = []
  31. json = resp.json()
  32. for result in json["results"]:
  33. publishedDate, content, doi = None, '', None
  34. if result['biblio'].get('release_date'):
  35. publishedDate = datetime.strptime(result['biblio']['release_date'], "%Y-%m-%d")
  36. if len(result['abstracts']) > 0:
  37. content = result['abstracts'][0].get('body')
  38. elif len(result['_highlights']) > 0:
  39. content = result['_highlights'][0]
  40. if len(result['releases']) > 0:
  41. doi = result['releases'][0].get('doi')
  42. results.append(
  43. {
  44. 'template': 'paper.html',
  45. 'url': result['fulltext']['access_url'],
  46. 'title': result['biblio'].get('title') or result['biblio'].get('container_name'),
  47. 'content': html_to_text(content),
  48. 'publisher': result['biblio'].get('publisher'),
  49. 'doi': doi,
  50. 'journal': result['biblio'].get('container_name'),
  51. 'authors': result['biblio'].get('contrib_names'),
  52. 'tags': result['tags'],
  53. 'publishedDate': publishedDate,
  54. 'issns': result['biblio'].get('issns'),
  55. 'pdf_url': result['fulltext'].get('access_url'),
  56. }
  57. )
  58. return results