json_engine.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """The JSON engine is a *generic* engine with which it is possible to configure
  3. engines in the settings.
  4. .. todo::
  5. - The JSON engine needs documentation!!
  6. - The parameters of the JSON engine should be adapted to those of the XPath
  7. engine.
  8. """
  9. from collections.abc import Iterable
  10. from json import loads
  11. from urllib.parse import urlencode
  12. from searx.utils import to_string, html_to_text
# parameters for generating a request
search_url = None
# HTTP method used for the request ('GET' or 'POST')
method = 'GET'
# template for the request body (only used with POST-style engines);
# '{query}' / '{pageno}' placeholders are substituted in request()
request_body = ''
cookies = {}
headers = {}
'''Some engines might offer different result based on cookies or headers.
Possible use-case: To set safesearch cookie or header to moderate.'''
paging = False
# parameters for engines with paging support
#
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1
# parameters for parsing the response
# slash-separated path to the list of result objects in the JSON response
results_query = ''
# slash-separated paths into each result object
url_query = None
url_prefix = ""
title_query = None
content_query = None
# slash-separated path to the list of suggestions in the JSON response
suggestion_query = ''
# run html_to_text over the extracted title / content
title_html_to_text = False
content_html_to_text = False
  38. def iterate(iterable):
  39. if isinstance(iterable, dict):
  40. items = iterable.items()
  41. else:
  42. items = enumerate(iterable)
  43. for index, value in items:
  44. yield str(index), value
  45. def is_iterable(obj):
  46. if isinstance(obj, str):
  47. return False
  48. return isinstance(obj, Iterable)
  49. def parse(query): # pylint: disable=redefined-outer-name
  50. q = [] # pylint: disable=invalid-name
  51. for part in query.split('/'):
  52. if part == '':
  53. continue
  54. q.append(part)
  55. return q
  56. def do_query(data, q): # pylint: disable=invalid-name
  57. ret = []
  58. if not q:
  59. return ret
  60. qkey = q[0]
  61. for key, value in iterate(data):
  62. if len(q) == 1:
  63. if key == qkey:
  64. ret.append(value)
  65. elif is_iterable(value):
  66. ret.extend(do_query(value, q))
  67. else:
  68. if not is_iterable(value):
  69. continue
  70. if key == qkey:
  71. ret.extend(do_query(value, q[1:]))
  72. else:
  73. ret.extend(do_query(value, q))
  74. return ret
  75. def query(data, query_string):
  76. q = parse(query_string)
  77. return do_query(data, q)
  78. def request(query, params): # pylint: disable=redefined-outer-name
  79. fp = {'query': urlencode({'q': query})[2:]} # pylint: disable=invalid-name
  80. if paging and search_url.find('{pageno}') >= 0:
  81. fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
  82. params['cookies'].update(cookies)
  83. params['headers'].update(headers)
  84. params['url'] = search_url.format(**fp)
  85. params['method'] = method
  86. if request_body:
  87. # don't url-encode the query if it's in the request body
  88. fp['query'] = query
  89. params['data'] = request_body.format(**fp)
  90. return params
  91. def identity(arg):
  92. return arg
  93. def response(resp):
  94. results = []
  95. json = loads(resp.text)
  96. title_filter = html_to_text if title_html_to_text else identity
  97. content_filter = html_to_text if content_html_to_text else identity
  98. if results_query:
  99. rs = query(json, results_query) # pylint: disable=invalid-name
  100. if not rs:
  101. return results
  102. for result in rs[0]:
  103. try:
  104. url = query(result, url_query)[0]
  105. title = query(result, title_query)[0]
  106. except: # pylint: disable=bare-except
  107. continue
  108. try:
  109. content = query(result, content_query)[0]
  110. except: # pylint: disable=bare-except
  111. content = ""
  112. results.append(
  113. {
  114. 'url': url_prefix + to_string(url),
  115. 'title': title_filter(to_string(title)),
  116. 'content': content_filter(to_string(content)),
  117. }
  118. )
  119. else:
  120. for result in json:
  121. url = query(result, url_query)[0]
  122. title = query(result, title_query)[0]
  123. content = query(result, content_query)[0]
  124. results.append(
  125. {
  126. 'url': url_prefix + to_string(url),
  127. 'title': title_filter(to_string(title)),
  128. 'content': content_filter(to_string(content)),
  129. }
  130. )
  131. if not suggestion_query:
  132. return results
  133. for suggestion in query(json, suggestion_query):
  134. results.append({'suggestion': suggestion})
  135. return results