# SPDX-License-Identifier: AGPL-3.0-or-later
"""ChinaSo_, a search engine for the Chinese language area.

.. attention::

   The ChinaSo engine does not return real URLs; the links from these search
   engines violate the privacy of the users!!

   We try to find a solution for this problem, please follow `issue #4694`_.
   As long as the problem has not been resolved, these engines are not active
   in a standard setup (``inactive: true``).

.. _ChinaSo: https://www.chinaso.com/
.. _issue #4694: https://github.com/searxng/searxng/issues/4694

Configuration
=============

The engine has the following additional settings:

- :py:obj:`chinaso_category` (:py:obj:`ChinasoCategoryType`)
- :py:obj:`chinaso_news_source` (:py:obj:`ChinasoNewsSourceType`)

In the example below, all three ChinaSo engines use the :ref:`network
<engine network>` from the ``chinaso news`` engine.

.. code:: yaml

  - name: chinaso news
    engine: chinaso
    shortcut: chinaso
    categories: [news]
    chinaso_category: news
    chinaso_news_source: all

  - name: chinaso images
    engine: chinaso
    network: chinaso news
    shortcut: chinasoi
    categories: [images]
    chinaso_category: images

  - name: chinaso videos
    engine: chinaso
    network: chinaso news
    shortcut: chinasov
    categories: [videos]
    chinaso_category: videos

Implementations
===============

"""

import typing
from urllib.parse import urlencode
from datetime import datetime

from searx.exceptions import SearxEngineAPIException
from searx.utils import html_to_text

about = {
    "website": "https://www.chinaso.com/",
    "wikidata_id": "Q10846064",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
    "language": "zh",
}

paging = True
time_range_support = True
results_per_page = 10
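
# categories is set per engine instance in settings.yml (see the YAML example
# in the module docstring above)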
categories = []

ChinasoCategoryType = typing.Literal['news', 'videos', 'images']
"""ChinaSo supports news, videos and images search.

- ``news``: search for news
- ``videos``: search for videos
- ``images``: search for images

In the category ``news`` you can additionally filter by option
:py:obj:`chinaso_news_source`.
"""

chinaso_category: ChinasoCategoryType = 'news'
"""Configure ChinaSo category (:py:obj:`ChinasoCategoryType`)."""

ChinasoNewsSourceType = typing.Literal['CENTRAL', 'LOCAL', 'BUSINESS', 'EPAPER', 'all']
"""Filtering ChinaSo-News results by source:

- ``CENTRAL``: central publication
- ``LOCAL``: local publication
- ``BUSINESS``: business publication
- ``EPAPER``: E-Paper
- ``all``: all sources
"""

chinaso_news_source: ChinasoNewsSourceType = 'all'
"""Configure ChinaSo-News type (:py:obj:`ChinasoNewsSourceType`)."""

time_range_dict = {'day': '24h', 'week': '1w', 'month': '1m', 'year': '1y'}

base_url = "https://www.chinaso.com"
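

# the init hook runs when SearXNG sets the engine up; validating the
# configured options here makes a misconfiguration fail at startup rather
# than at query time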
def init(_):
    if chinaso_category not in ('news', 'videos', 'images'):
        raise ValueError(f"Unsupported category: {chinaso_category}")
    if chinaso_category == 'news' and chinaso_news_source not in typing.get_args(ChinasoNewsSourceType):
        raise ValueError(f"Unsupported news source: {chinaso_news_source}")


def request(query, params):
    query_params = {"q": query}
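
    # map SearXNG's time ranges onto ChinaSo's search window: "stime" sets the
    # start of the window and "etime" pins its end to now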
    if time_range_dict.get(params['time_range']):
        query_params["stime"] = time_range_dict[params['time_range']]
        query_params["etime"] = 'now'
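
    # each category has its own endpoint and pagination scheme: news pages by
    # page number ("pn"/"ps"), while images and videos page by result offset
    # ("start_index"/"rn")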
    category_config = {
        'news': {
            'endpoint': '/v5/general/v1/web/search',
            'params': {'pn': params["pageno"], 'ps': results_per_page},
        },
        'images': {
            'endpoint': '/v5/general/v1/search/image',
            'params': {'start_index': (params["pageno"] - 1) * results_per_page, 'rn': results_per_page},
        },
        'videos': {
            'endpoint': '/v5/general/v1/search/video',
            'params': {'start_index': (params["pageno"] - 1) * results_per_page, 'rn': results_per_page},
        },
    }
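
    # optional news source filter: 'EPAPER' is selected via a dedicated "type"
    # parameter, the other sources via "cate"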
    if chinaso_news_source != 'all':
        if chinaso_news_source == 'EPAPER':
            category_config['news']['params']["type"] = 'EPAPER'
        else:
            category_config['news']['params']["cate"] = chinaso_news_source

    query_params.update(category_config[chinaso_category]['params'])
    params["url"] = f"{base_url}{category_config[chinaso_category]['endpoint']}?{urlencode(query_params)}"

    return params


def response(resp):
    try:
        data = resp.json()
    except Exception as e:
        raise SearxEngineAPIException(f"Invalid response: {e}") from e
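
    # dispatch to the parser that matches the configured engine category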
    parsers = {'news': parse_news, 'images': parse_images, 'videos': parse_videos}
    return parsers[chinaso_category](data)


def parse_news(data):
    results = []
    if not data.get("data", {}).get("data"):
        raise SearxEngineAPIException("Invalid response")

    for entry in data["data"]["data"]:
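        # the "timestamp" field appears to hold Unix epoch seconds; tolerate
        # missing or malformed values instead of dropping the result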
        published_date = None
        if entry.get("timestamp"):
            try:
                published_date = datetime.fromtimestamp(int(entry["timestamp"]))
            except (ValueError, TypeError):
                pass

        results.append(
            {
                'title': html_to_text(entry["title"]),
                'url': entry["url"],
                'content': html_to_text(entry["snippet"]),
                'publishedDate': published_date,
            }
        )

    return results


def parse_images(data):
    results = []
    if not data.get("data", {}).get("arrRes"):
        raise SearxEngineAPIException("Invalid response")

    for entry in data["data"]["arrRes"]:
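        # image URLs may be served over plain HTTP; upgrade them to HTTPS to
        # avoid mixed-content blocking in the browser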
        results.append(
            {
                'url': entry["web_url"],
                'title': html_to_text(entry["title"]),
                'content': html_to_text(entry["ImageInfo"]),
                'template': 'images.html',
                'img_src': entry["url"].replace("http://", "https://"),
                'thumbnail_src': entry["largeimage"].replace("http://", "https://"),
            }
        )

    return results


def parse_videos(data):
    results = []
    if not data.get("data", {}).get("arrRes"):
        raise SearxEngineAPIException("Invalid response")

    for entry in data["data"]["arrRes"]:
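        # "VideoPubDate" appears to hold Unix epoch seconds, like the news
        # timestamps above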
        published_date = None
        if entry.get("VideoPubDate"):
            try:
                published_date = datetime.fromtimestamp(int(entry["VideoPubDate"]))
            except (ValueError, TypeError):
                pass

        results.append(
            {
                'url': entry["url"],
                'title': html_to_text(entry["raw_title"]),
                'template': 'videos.html',
                'publishedDate': published_date,
                'thumbnail': entry["image_src"].replace("http://", "https://"),
            }
        )

    return results