# microsoft_academic.py
"""
Microsoft Academic (Science)

@website     https://academic.microsoft.com
@provide-api yes
@using-api   no
@results     JSON
@stable      no
@parse       url, title, content
"""
  10. from datetime import datetime
  11. from json import loads
  12. from uuid import uuid4
  13. from urllib.parse import urlencode
  14. from searx.utils import html_to_text
  15. categories = ['images']
  16. paging = True
  17. result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}'
  18. def request(query, params):
  19. correlation_id = uuid4()
  20. msacademic = uuid4()
  21. time_now = datetime.now()
  22. params['url'] = result_url.format(query=urlencode({'correlationId': correlation_id}))
  23. params['cookies']['msacademic'] = str(msacademic)
  24. params['cookies']['ai_user'] = 'vhd0H|{now}'.format(now=str(time_now))
  25. params['method'] = 'POST'
  26. params['data'] = {
  27. 'Query': '@{query}@'.format(query=query),
  28. 'Limit': 10,
  29. 'Offset': params['pageno'] - 1,
  30. 'Filters': '',
  31. 'OrderBy': '',
  32. 'SortAscending': False,
  33. }
  34. return params
  35. def response(resp):
  36. results = []
  37. response_data = loads(resp.text)
  38. if not response_data:
  39. return results
  40. for result in response_data['results']:
  41. url = _get_url(result)
  42. title = result['e']['dn']
  43. content = _get_content(result)
  44. results.append({
  45. 'url': url,
  46. 'title': html_to_text(title),
  47. 'content': html_to_text(content),
  48. })
  49. return results
  50. def _get_url(result):
  51. if 's' in result['e']:
  52. return result['e']['s'][0]['u']
  53. return 'https://academic.microsoft.com/#/detail/{pid}'.format(pid=result['id'])
  54. def _get_content(result):
  55. if 'd' in result['e']:
  56. content = result['e']['d']
  57. if len(content) > 300:
  58. return content[:300] + '...'
  59. return content
  60. return ''