pubmed.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. PubMed (Scholar publications)
  4. """
import logging
from datetime import datetime
from urllib.parse import urlencode

from lxml import etree

from searx.network import get
from searx.utils import (
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
)
  14. # about
  15. about = {
  16. "website": 'https://www.ncbi.nlm.nih.gov/pubmed/',
  17. "wikidata_id": 'Q1540899',
  18. "official_api_documentation": {
  19. 'url': 'https://www.ncbi.nlm.nih.gov/home/develop/api/',
  20. 'comment': 'More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/',
  21. },
  22. "use_official_api": True,
  23. "require_api_key": False,
  24. "results": 'XML',
  25. }
  26. categories = ['science', 'scientific publications']
  27. base_url = (
  28. 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
  29. )
  30. # engine dependent config
  31. number_of_results = 10
  32. pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
  33. def request(query, params):
  34. # basic search
  35. offset = (params['pageno'] - 1) * number_of_results
  36. string_args = dict(query=urlencode({'term': query}), offset=offset, hits=number_of_results)
  37. params['url'] = base_url.format(**string_args)
  38. return params
  39. def response(resp):
  40. results = []
  41. # First retrieve notice of each result
  42. pubmed_retrieve_api_url = (
  43. 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + 'db=pubmed&retmode=xml&id={pmids_string}'
  44. )
  45. pmids_results = etree.XML(resp.content)
  46. pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
  47. pmids_string = ''
  48. for item in pmids:
  49. pmids_string += item.text + ','
  50. retrieve_notice_args = dict(pmids_string=pmids_string)
  51. retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
  52. search_results_response = get(retrieve_url_encoded).content
  53. search_results = etree.XML(search_results_response)
  54. for entry in eval_xpath_list(search_results, '//PubmedArticle'):
  55. medline = eval_xpath_getindex(entry, './MedlineCitation', 0)
  56. title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text
  57. pmid = eval_xpath_getindex(medline, './/PMID', 0).text
  58. url = pubmed_url + pmid
  59. content = extract_text(
  60. eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True
  61. )
  62. doi = extract_text(
  63. eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True
  64. )
  65. journal = extract_text(
  66. eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True
  67. )
  68. issn = extract_text(
  69. eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True
  70. )
  71. authors = []
  72. for author in eval_xpath_list(medline, './Article/AuthorList/Author'):
  73. f = eval_xpath_getindex(author, './ForeName', 0, default=None)
  74. l = eval_xpath_getindex(author, './LastName', 0, default=None)
  75. f = '' if f is None else f.text
  76. l = '' if l is None else l.text
  77. authors.append((f + ' ' + l).strip())
  78. res_dict = {
  79. 'template': 'paper.html',
  80. 'url': url,
  81. 'title': title,
  82. 'content': content or "",
  83. 'journal': journal,
  84. 'issn': [issn],
  85. 'authors': authors,
  86. 'doi': doi,
  87. }
  88. accepted_date = eval_xpath_getindex(
  89. entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None
  90. )
  91. if accepted_date is not None:
  92. year = eval_xpath_getindex(accepted_date, './Year', 0)
  93. month = eval_xpath_getindex(accepted_date, './Month', 0)
  94. day = eval_xpath_getindex(accepted_date, './Day', 0)
  95. try:
  96. publishedDate = datetime.strptime(
  97. year.text + '-' + month.text + '-' + day.text,
  98. '%Y-%m-%d',
  99. )
  100. res_dict['publishedDate'] = publishedDate
  101. except Exception as e:
  102. print(e)
  103. results.append(res_dict)
  104. return results