json_engine.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """The JSON engine is a *generic* engine with which it is possible to configure
  3. engines in the settings.
  4. .. todo::
  5. - The JSON engine needs documentation!!
  6. - The parameters of the JSON engine should be adapted to those of the XPath
  7. engine.
  8. """
  9. from collections.abc import Iterable
  10. from json import loads
  11. from urllib.parse import urlencode
  12. from searx.utils import to_string, html_to_text
# Engine configuration: these module-level names are overridden per engine
# instance from settings.yml by searx's generic engine loader.

search_url = None  # request URL template; may contain {query} and {pageno}
url_query = None  # JSON query (slash-separated path) locating a result's URL
url_prefix = ""  # prepended to every extracted result URL
content_query = None  # JSON query locating a result's content/snippet
title_query = None  # JSON query locating a result's title
content_html_to_text = False  # if True, strip HTML from the extracted content
title_html_to_text = False  # if True, strip HTML from the extracted title
paging = False  # whether the remote API supports paging
suggestion_query = ''  # JSON query for suggestions; empty string disables them
results_query = ''  # JSON query for the result list; empty = response IS the list
cookies = {}
headers = {}
'''Some engines might offer different result based on cookies or headers.
Possible use-case: To set safesearch cookie or header to moderate.'''

# parameters for engines with paging support
#
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1
  34. def iterate(iterable):
  35. if isinstance(iterable, dict):
  36. items = iterable.items()
  37. else:
  38. items = enumerate(iterable)
  39. for index, value in items:
  40. yield str(index), value
  41. def is_iterable(obj):
  42. if isinstance(obj, str):
  43. return False
  44. return isinstance(obj, Iterable)
  45. def parse(query): # pylint: disable=redefined-outer-name
  46. q = [] # pylint: disable=invalid-name
  47. for part in query.split('/'):
  48. if part == '':
  49. continue
  50. q.append(part)
  51. return q
  52. def do_query(data, q): # pylint: disable=invalid-name
  53. ret = []
  54. if not q:
  55. return ret
  56. qkey = q[0]
  57. for key, value in iterate(data):
  58. if len(q) == 1:
  59. if key == qkey:
  60. ret.append(value)
  61. elif is_iterable(value):
  62. ret.extend(do_query(value, q))
  63. else:
  64. if not is_iterable(value):
  65. continue
  66. if key == qkey:
  67. ret.extend(do_query(value, q[1:]))
  68. else:
  69. ret.extend(do_query(value, q))
  70. return ret
  71. def query(data, query_string):
  72. q = parse(query_string)
  73. return do_query(data, q)
  74. def request(query, params): # pylint: disable=redefined-outer-name
  75. query = urlencode({'q': query})[2:]
  76. fp = {'query': query} # pylint: disable=invalid-name
  77. if paging and search_url.find('{pageno}') >= 0:
  78. fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
  79. params['cookies'].update(cookies)
  80. params['headers'].update(headers)
  81. params['url'] = search_url.format(**fp)
  82. params['query'] = query
  83. return params
  84. def identity(arg):
  85. return arg
  86. def response(resp):
  87. results = []
  88. json = loads(resp.text)
  89. title_filter = html_to_text if title_html_to_text else identity
  90. content_filter = html_to_text if content_html_to_text else identity
  91. if results_query:
  92. rs = query(json, results_query) # pylint: disable=invalid-name
  93. if not rs:
  94. return results
  95. for result in rs[0]:
  96. try:
  97. url = query(result, url_query)[0]
  98. title = query(result, title_query)[0]
  99. except: # pylint: disable=bare-except
  100. continue
  101. try:
  102. content = query(result, content_query)[0]
  103. except: # pylint: disable=bare-except
  104. content = ""
  105. results.append(
  106. {
  107. 'url': url_prefix + to_string(url),
  108. 'title': title_filter(to_string(title)),
  109. 'content': content_filter(to_string(content)),
  110. }
  111. )
  112. else:
  113. for result in json:
  114. url = query(result, url_query)[0]
  115. title = query(result, title_query)[0]
  116. content = query(result, content_query)[0]
  117. results.append(
  118. {
  119. 'url': url_prefix + to_string(url),
  120. 'title': title_filter(to_string(title)),
  121. 'content': content_filter(to_string(content)),
  122. }
  123. )
  124. if not suggestion_query:
  125. return results
  126. for suggestion in query(json, suggestion_query):
  127. results.append({'suggestion': suggestion})
  128. return results