# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Google Images engine using the internal
Google API used by the Google Go Android app.

This internal API offers results in

- JSON (``_fmt:json``)
- Protobuf_ (``_fmt:pb``)
- Protobuf_ compressed? (``_fmt:pc``)
- HTML (``_fmt:html``)
- Protobuf_ encoded in JSON (``_fmt:jspb``).

.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers
"""

from typing import TYPE_CHECKING

from urllib.parse import urlencode
from json import loads

from searx.engines.google import fetch_traits  # pylint: disable=unused-import
from searx.engines.google import (
    get_google_info,
    time_range_dict,
    detect_google_sorry,
)

if TYPE_CHECKING:
    import logging
    from searx.enginelib.traits import EngineTraits

    logger: logging.Logger
    traits: EngineTraits
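
# Note: ``logger`` and ``traits`` are only declared for type checkers here;
# at runtime SearXNG's engine loader is expected to inject both names into
# this module.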

# about
about = {
    "website": 'https://images.google.com',
    "wikidata_id": 'Q521550',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'JSON',
}

# engine dependent config
categories = ['images', 'web']
paging = True
max_page = 50
"""`Google max 50 pages`_

.. _Google max 50 pages: https://github.com/searxng/searxng/issues/2982
"""
time_range_support = True
safesearch = True
send_accept_language_header = True

filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
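# ``filter_mapping`` above translates SearXNG's safesearch levels (0 = off,
# 1 = moderate, 2 = strict) into values for Google's ``safe`` URL parameter,
# which request() appends below.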


def request(query, params):
    """Google-Image search request"""

    google_info = get_google_info(params, traits)

    query_url = (
        'https://'
        + google_info['subdomain']
        + '/search'
        + '?'
        + urlencode({'q': query, 'tbm': "isch", **google_info['params'], 'asearch': 'isch'})
        # don't urlencode this because wildly different AND bad results
        # pagination uses zero-based numbering
        + f'&async=_fmt:json,p:1,ijn:{params["pageno"] - 1}'
    )

    if params['time_range'] in time_range_dict:
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
    params['url'] = query_url

    params['cookies'] = google_info['cookies']
    params['headers'].update(google_info['headers'])

    # this UA will allow getting ~50 results instead of 10. #1641
    params['headers']['User-Agent'] = (
        'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12;'
        f' {google_info.get("country", "US")}) gzip'
    )
    return params
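
# Illustrative only: given SearXNG params like
#
#   params = {'pageno': 2, 'time_range': 'month', 'safesearch': 1,
#             'headers': {}, 'cookies': {}}
#   request('large cats', params)
#
# params['url'] should end in '&async=_fmt:json,p:1,ijn:1&tbs=qdr:m&safe=active'
# (assuming time_range_dict maps 'month' to 'm', as in the other Google engines).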


def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    json_start = resp.text.find('{"ischj":')
    json_data = loads(resp.text[json_start:])

    for item in json_data["ischj"].get("metadata", []):
        result_item = {
            'url': item["result"]["referrer_url"],
            'title': item["result"]["page_title"],
            'content': item["text_in_grid"]["snippet"],
            'source': item["result"]["site_title"],
            'resolution': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}',
            'img_src': item["original_image"]["url"],
            'thumbnail_src': item["thumbnail"]["url"],
            'template': 'images.html',
        }
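
        # Optional metadata: IPTC creator/copyright, a freshness date and the
        # file size are appended to the result's author/source fields when
        # Google delivers them.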
        author = item["result"].get('iptc', {}).get('creator')
        if author:
            result_item['author'] = ', '.join(author)

        copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
        if copyright_notice:
            result_item['source'] += ' | ' + copyright_notice

        freshness_date = item["result"].get("freshness_date")
        if freshness_date:
            result_item['source'] += ' | ' + freshness_date

        file_size = item.get('gsa', {}).get('file_size')
        if file_size:
            result_item['source'] += ' (%s)' % file_size

        results.append(result_item)

    return results
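
# Shape of the payload response() expects, inferred from the key accesses
# above (field list is not exhaustive and not guaranteed by Google):
#
#   {"ischj": {"metadata": [
#       {"result": {"referrer_url": ..., "page_title": ..., "site_title": ...,
#                   "iptc": {"creator": [...], "copyright_notice": ...},
#                   "freshness_date": ...},
#        "text_in_grid": {"snippet": ...},
#        "original_image": {"url": ..., "width": ..., "height": ...},
#        "thumbnail": {"url": ...},
#        "gsa": {"file_size": ...}}]}}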