naver.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=line-too-long
"""Naver for SearXNG"""

from urllib.parse import urlencode

from lxml import html

from searx.exceptions import SearxEngineAPIException, SearxEngineXPathException
from searx.result_types import EngineResults, MainResult
from searx.utils import (
    eval_xpath_getindex,
    eval_xpath_list,
    eval_xpath,
    extract_text,
    extr,
    html_to_text,
    parse_duration_string,
    js_variable_to_python,
)

# engine metadata
about = {
    "website": "https://search.naver.com",
    "wikidata_id": "Q485639",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
    "language": "ko",
}

categories = []
paging = True
time_range_support = True
time_range_dict = {"day": "1d", "week": "1w", "month": "1m", "year": "1y"}

base_url = "https://search.naver.com"

naver_category = "general"
  33. """Naver supports general, images, news, videos search.
  34. - ``general``: search for general
  35. - ``images``: search for images
  36. - ``news``: search for news
  37. - ``videos``: search for videos
  38. """

# Naver does not expose a page-size parameter; the fixed per-category page
# sizes below are used to compute the ``start`` offset when paging.
naver_category_dict = {
    "general": {
        "start": 15,
        "where": "web",
    },
    "images": {
        "start": 50,
        "where": "image",
    },
    "news": {
        "start": 10,
        "where": "news",
    },
    "videos": {
        "start": 48,
        "where": "video",
    },
}
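
# Worked example: with naver_category = "general" the page size is 15, so
# page 3 requests results from offset (3 - 1) * 15 + 1 = 31 (see request()).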


def init(_):
    if naver_category not in ('general', 'images', 'news', 'videos'):
        raise SearxEngineAPIException(f"Unsupported category: {naver_category}")


def request(query, params):
    query_params = {
        "query": query,
    }

    if naver_category in naver_category_dict:
        query_params["start"] = (params["pageno"] - 1) * naver_category_dict[naver_category]["start"] + 1
        query_params["where"] = naver_category_dict[naver_category]["where"]

    if params["time_range"] in time_range_dict:
        query_params["nso"] = f"p:{time_range_dict[params['time_range']]}"

    params["url"] = f"{base_url}/search.naver?{urlencode(query_params)}"
    return params
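
# For illustration only: with the defaults above, request("seoul", {"pageno": 1,
# "time_range": None}) builds
#   https://search.naver.com/search.naver?query=seoul&start=1&where=web
# (urlencode keeps the insertion order of query_params).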


def response(resp) -> EngineResults:
    parsers = {'general': parse_general, 'images': parse_images, 'news': parse_news, 'videos': parse_videos}
    return parsers[naver_category](resp.text)


def parse_general(data):
    results = EngineResults()

    dom = html.fromstring(data)

    for item in eval_xpath_list(dom, "//ul[contains(@class, 'lst_total')]/li[contains(@class, 'bx')]"):
        thumbnail = None
        try:
            thumbnail = eval_xpath_getindex(item, ".//div[contains(@class, 'thumb_single')]//img/@data-lazysrc", 0)
        except (ValueError, TypeError, SearxEngineXPathException):
            pass

        results.add(
            MainResult(
                title=extract_text(eval_xpath(item, ".//a[contains(@class, 'link_tit')]")),
                url=eval_xpath_getindex(item, ".//a[contains(@class, 'link_tit')]/@href", 0),
                content=extract_text(
                    eval_xpath(item, ".//div[contains(@class, 'total_dsc_wrap')]//a[contains(@class, 'api_txt_lines')]")
                ),
                thumbnail=thumbnail,
            )
        )

    return results
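
# The XPath expressions in parse_general() target markup of roughly this shape
# (reconstructed from the selectors, not an official schema; Naver may change
# its markup at any time):
#
#   <ul class="lst_total">
#     <li class="bx">
#       <a class="link_tit" href="...">title</a>
#       <div class="total_dsc_wrap"><a class="api_txt_lines">snippet</a></div>
#       <div class="thumb_single"><img data-lazysrc="..."></div>
#     </li>
#   </ul>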


def parse_images(data):
    results = []

    match = extr(data, '<script>var imageSearchTabData=', '</script>')
    if match:
        json_data = js_variable_to_python(match.strip())
        items = json_data.get('content', {}).get('items', [])
        for item in items:
            results.append(
                {
                    "template": "images.html",
                    "url": item.get('link'),
                    "thumbnail_src": item.get('thumb'),
                    "img_src": item.get('originalUrl'),
                    "title": html_to_text(item.get('title')),
                    "source": item.get('source'),
                    "resolution": f"{item.get('orgWidth')} x {item.get('orgHeight')}",
                }
            )

    return results
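
# For reference (shape inferred from the keys read above, hedged accordingly):
# the embedded payload parsed by parse_images() looks roughly like
#
#   <script>var imageSearchTabData={"content": {"items": [
#       {"link": ..., "thumb": ..., "originalUrl": ..., "title": ...,
#        "source": ..., "orgWidth": ..., "orgHeight": ...}]}}</script>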


def parse_news(data):
    results = EngineResults()
    dom = html.fromstring(data)

    for item in eval_xpath_list(
        dom, "//div[contains(@class, 'sds-comps-base-layout') and contains(@class, 'sds-comps-full-layout')]"
    ):
        title = extract_text(eval_xpath(item, ".//span[contains(@class, 'sds-comps-text-type-headline1')]/text()"))
        url = eval_xpath_getindex(item, ".//a[@href and @nocr='1']/@href", 0)
        content = extract_text(eval_xpath(item, ".//span[contains(@class, 'sds-comps-text-type-body1')]"))
        thumbnail = None

        try:
            thumbnail = eval_xpath_getindex(
                item,
                ".//div[contains(@class, 'sds-comps-image') and contains(@class, 'sds-rego-thumb-overlay')]//img[@src]/@src",
                0,
            )
        except (ValueError, TypeError, SearxEngineXPathException):
            pass

        if title and content and url:
            results.add(
                MainResult(
                    title=title,
                    url=url,
                    content=content,
                    thumbnail=thumbnail,
                )
            )

    return results


def parse_videos(data):
    results = []

    dom = html.fromstring(data)

    for item in eval_xpath_list(dom, "//li[contains(@class, 'video_item')]"):
        thumbnail = None
        try:
            thumbnail = eval_xpath_getindex(item, ".//img[contains(@class, 'thumb')]/@src", 0)
        except (ValueError, TypeError, SearxEngineXPathException):
            pass

        length = None
        try:
            # duration label such as "10:42" parsed into a timedelta
            length = parse_duration_string(extract_text(eval_xpath(item, ".//span[contains(@class, 'time')]")))
        except (ValueError, TypeError):
            pass

        results.append(
            {
                "template": "videos.html",
                "title": extract_text(eval_xpath(item, ".//a[contains(@class, 'info_title')]")),
                "url": eval_xpath_getindex(item, ".//a[contains(@class, 'info_title')]/@href", 0),
                "thumbnail": thumbnail,
                "length": length,
            }
        )

    return results
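

if __name__ == "__main__":
    # Ad-hoc smoke test, not part of the upstream engine (an illustrative
    # addition): build a request URL without sending it.
    _params = {"pageno": 1, "time_range": None}
    request("seoul", _params)
    print(_params["url"])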