microsoft_academic.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. Microsoft Academic (Science)
  4. """
  5. from datetime import datetime
  6. from json import loads
  7. from uuid import uuid4
  8. from urllib.parse import urlencode
  9. from searx.utils import html_to_text
  10. # about
  11. about = {
  12. "website": 'https://academic.microsoft.com',
  13. "wikidata_id": 'Q28136779',
  14. "official_api_documentation": 'http://ma-graph.org/',
  15. "use_official_api": False,
  16. "require_api_key": False,
  17. "results": 'JSON',
  18. }
  19. categories = ['images']
  20. paging = True
  21. result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}'
  22. def request(query, params):
  23. correlation_id = uuid4()
  24. msacademic = uuid4()
  25. time_now = datetime.now()
  26. params['url'] = result_url.format(query=urlencode({'correlationId': correlation_id}))
  27. params['cookies']['msacademic'] = str(msacademic)
  28. params['cookies']['ai_user'] = 'vhd0H|{now}'.format(now=str(time_now))
  29. params['method'] = 'POST'
  30. params['data'] = {
  31. 'Query': '@{query}@'.format(query=query),
  32. 'Limit': 10,
  33. 'Offset': params['pageno'] - 1,
  34. 'Filters': '',
  35. 'OrderBy': '',
  36. 'SortAscending': False,
  37. }
  38. return params
  39. def response(resp):
  40. results = []
  41. response_data = loads(resp.text)
  42. if not response_data:
  43. return results
  44. for result in response_data['results']:
  45. url = _get_url(result)
  46. title = result['e']['dn']
  47. content = _get_content(result)
  48. results.append({
  49. 'url': url,
  50. 'title': html_to_text(title),
  51. 'content': html_to_text(content),
  52. })
  53. return results
  54. def _get_url(result):
  55. if 's' in result['e']:
  56. return result['e']['s'][0]['u']
  57. return 'https://academic.microsoft.com/#/detail/{pid}'.format(pid=result['id'])
  58. def _get_content(result):
  59. if 'd' in result['e']:
  60. content = result['e']['d']
  61. if len(content) > 300:
  62. return content[:300] + '...'
  63. return content
  64. return ''