archlinux.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. # -*- coding: utf-8 -*-
  2. """
  3. Arch Linux Wiki
  4. @website https://wiki.archlinux.org
  5. @provide-api no (Mediawiki provides API, but Arch Wiki blocks access to it)
  6. @using-api no
  7. @results HTML
  8. @stable no (HTML can change)
  9. @parse url, title, content
  10. """
  11. from urlparse import urljoin
  12. from cgi import escape
  13. from urllib import urlencode
  14. from lxml import html
  15. from searx.engines.xpath import extract_text
  16. # engine dependent config
  17. categories = ['it']
  18. language_support = True
  19. paging = True
  20. base_url = 'https://wiki.archlinux.org'
  21. # xpath queries
  22. xpath_results = '//ul[@class="mw-search-results"]/li'
  23. xpath_link = './/div[@class="mw-search-result-heading"]/a'
  24. xpath_content = './/div[@class="searchresult"]'
  25. # cut 'en' from 'en_US', 'de' from 'de_CH', and so on
  26. def locale_to_lang_code(locale):
  27. if locale.find('_') >= 0:
  28. locale = locale.split('_')[0]
  29. return locale
  30. # wikis for some languages were moved off from the main site, we need to make
  31. # requests to correct URLs to be able to get results in those languages
  32. lang_urls = {
  33. 'all': {
  34. 'base': 'https://wiki.archlinux.org',
  35. 'search': '/index.php?title=Special:Search&offset={offset}&{query}'
  36. },
  37. 'de': {
  38. 'base': 'https://wiki.archlinux.de',
  39. 'search': '/index.php?title=Spezial:Suche&offset={offset}&{query}'
  40. },
  41. 'fr': {
  42. 'base': 'https://wiki.archlinux.fr',
  43. 'search': '/index.php?title=Spécial:Recherche&offset={offset}&{query}'
  44. },
  45. 'ja': {
  46. 'base': 'https://wiki.archlinuxjp.org',
  47. 'search': '/index.php?title=特別:検索&offset={offset}&{query}'
  48. },
  49. 'ro': {
  50. 'base': 'http://wiki.archlinux.ro',
  51. 'search': '/index.php?title=Special:Căutare&offset={offset}&{query}'
  52. },
  53. 'tr': {
  54. 'base': 'http://archtr.org/wiki',
  55. 'search': '/index.php?title=Özel:Ara&offset={offset}&{query}'
  56. }
  57. }
  58. # get base & search URLs for selected language
  59. def get_lang_urls(language):
  60. if language in lang_urls:
  61. return lang_urls[language]
  62. return lang_urls['all']
  63. # Language names to build search requests for
  64. # those languages which are hosted on the main site.
  65. main_langs = {
  66. 'ar': 'العربية',
  67. 'bg': 'Български',
  68. 'cs': 'Česky',
  69. 'da': 'Dansk',
  70. 'el': 'Ελληνικά',
  71. 'es': 'Español',
  72. 'he': 'עברית',
  73. 'hr': 'Hrvatski',
  74. 'hu': 'Magyar',
  75. 'it': 'Italiano',
  76. 'ko': '한국어',
  77. 'lt': 'Lietuviškai',
  78. 'nl': 'Nederlands',
  79. 'pl': 'Polski',
  80. 'pt': 'Português',
  81. 'ru': 'Русский',
  82. 'sl': 'Slovenský',
  83. 'th': 'ไทย',
  84. 'uk': 'Українська',
  85. 'zh': '简体中文'
  86. }
  87. # do search-request
  88. def request(query, params):
  89. # translate the locale (e.g. 'en_US') to language code ('en')
  90. language = locale_to_lang_code(params['language'])
  91. # if our language is hosted on the main site, we need to add its name
  92. # to the query in order to narrow the results to that language
  93. if language in main_langs:
  94. query += '(' + main_langs[language] + ')'
  95. # prepare the request parameters
  96. query = urlencode({'search': query})
  97. offset = (params['pageno'] - 1) * 20
  98. # get request URLs for our language of choice
  99. urls = get_lang_urls(language)
  100. search_url = urls['base'] + urls['search']
  101. params['url'] = search_url.format(query=query, offset=offset)
  102. return params
  103. # get response from search-request
  104. def response(resp):
  105. # get the base URL for the language in which request was made
  106. language = locale_to_lang_code(resp.search_params['language'])
  107. base_url = get_lang_urls(language)['base']
  108. results = []
  109. dom = html.fromstring(resp.text)
  110. # parse results
  111. for result in dom.xpath(xpath_results):
  112. link = result.xpath(xpath_link)[0]
  113. href = urljoin(base_url, link.attrib.get('href'))
  114. title = escape(extract_text(link))
  115. content = escape(extract_text(result.xpath(xpath_content)))
  116. results.append({'url': href,
  117. 'title': title,
  118. 'content': content})
  119. return results