xpath.py

from lxml import html
from lxml.etree import _ElementStringResult, _ElementUnicodeResult
from searx.utils import html_to_text, eval_xpath
from searx.url_utils import unquote, urlencode, urljoin, urlparse

# engine configuration, set per engine
search_url = None
url_xpath = None
content_xpath = None
title_xpath = None
thumbnail_xpath = False
paging = False
suggestion_xpath = ''
results_xpath = ''

# parameters for engines with paging support
#
# number of results on each page
# (only needed if the site requires an offset rather than a page number)
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1
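# A hedged sketch (not part of the original module): how an engine definition might
# wire these module-level attributes before request()/response() are used. All
# values below are hypothetical placeholders, shown commented out.
#
#   import searx.engines.xpath as xpath_engine
#   xpath_engine.search_url = 'https://example.com/search?q={query}&start={pageno}'
#   xpath_engine.url_xpath = '//article/h3/a/@href'
#   xpath_engine.title_xpath = '//article/h3/a'
#   xpath_engine.content_xpath = '//article/p'
#   xpath_engine.paging = True
#   xpath_engine.page_size = 10      # offset-based paging: 10 results per page
#   xpath_engine.first_page_num = 0  # the first page starts at offset 0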
'''
if xpath_results is a list, extract the text from each result and concatenate it
if xpath_results is an xml element, extract all of its text nodes
   (equivalent to lxml's text_content() method)
if xpath_results is a string result, it is returned as-is
'''


def extract_text(xpath_results):
    if type(xpath_results) == list:
        # it's a list of results: concatenate everything using a recursive call
        result = ''
        for e in xpath_results:
            result = result + extract_text(e)
        return result.strip()
    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
        # it's a string
        return ''.join(xpath_results)
    else:
        # it's an element: serialize its text content and collapse whitespace
        text = html.tostring(
            xpath_results, encoding='unicode', method='text', with_tail=False
        )
        text = text.strip().replace('\n', ' ')
        return ' '.join(text.split())
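# Illustrative usage (assumption, not executed by this module): the three branches
# above applied to a small parsed fragment.
#
#   dom = html.fromstring('<p>foo <b>bar</b>\n baz</p>')
#   extract_text(dom.xpath('//p'))         # -> 'foo bar baz' (list of elements)
#   extract_text(dom.xpath('//b/text()'))  # -> 'bar' (string result)
#   extract_text(dom.xpath('//b')[0])      # -> 'bar' (single element)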
def extract_url(xpath_results, search_url):
    if xpath_results == []:
        raise Exception('Empty url resultset')
    url = extract_text(xpath_results)

    if url.startswith('//'):
        # add http or https to protocol-relative urls like //example.com/
        parsed_search_url = urlparse(search_url)
        url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
    elif url.startswith('/'):
        # resolve urls relative to the search engine
        url = urljoin(search_url, url)

    # fix relative urls that fall through the cracks
    if '://' not in url:
        url = urljoin(search_url, url)

    # normalize url
    url = normalize_url(url)

    return url
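# Illustrative behaviour (assumption: search_url is 'https://example.com/search'):
# protocol-relative urls ('//cdn.example.org/a.png') get the search url's scheme,
# absolute paths are joined to the search url, and urls without a path gain a '/'.
#
#   href = html.fromstring('<a href="/result/42">x</a>').xpath('//a/@href')
#   extract_url(href, 'https://example.com/search')  # -> 'https://example.com/result/42'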
def normalize_url(url):
    parsed_url = urlparse(url)

    # add a / at the end of the url if there is no path
    if not parsed_url.netloc:
        raise Exception('Cannot parse url')
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo redirect urls
    if parsed_url.hostname == 'search.yahoo.com'\
            and parsed_url.path.startswith('/r'):
        p = parsed_url.path
        mark = p.find('/**')
        if mark != -1:
            return unquote(p[mark + 3:]).decode('utf-8')

    return url
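# Illustrative example of the yahoo hack above (the redirect format is an assumption
# inferred from the code): a url such as
# 'https://search.yahoo.com/r/_ylt=Abc/**https%3a%2f%2fexample.org%2f'
# is cut at '/**' and percent-decoded, yielding 'https://example.org/'.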
def request(query, params):
    # urlencode({'q': query}) yields 'q=<encoded query>'; strip the leading 'q='
    query = urlencode({'q': query})[2:]

    fp = {'query': query}
    if paging and search_url.find('{pageno}') >= 0:
        # translate searx's 1-based page number into the engine's page/offset scheme
        fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num

    params['url'] = search_url.format(**fp)
    params['query'] = query

    return params
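# Illustrative page arithmetic (hypothetical values): with page_size = 10 and
# first_page_num = 0, searx page 1 maps to offset 0 and page 3 to offset 20;
# with the defaults (page_size = 1, first_page_num = 1) the page number is passed
# through unchanged.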
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    if results_xpath:
        # one xpath selects each result block; url/title/content are extracted relative to it
        for result in eval_xpath(dom, results_xpath):
            url = extract_url(eval_xpath(result, url_xpath), search_url)
            title = extract_text(eval_xpath(result, title_xpath))
            content = extract_text(eval_xpath(result, content_xpath))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)

            results.append(tmp_result)
    else:
        # no results_xpath: zip together the separately extracted url/title/content lists
        for url, title, content in zip(
            (extract_url(x, search_url) for
             x in eval_xpath(dom, url_xpath)),
            map(extract_text, eval_xpath(dom, title_xpath)),
            map(extract_text, eval_xpath(dom, content_xpath))
        ):
            results.append({'url': url, 'title': title, 'content': content})

    if not suggestion_xpath:
        return results
    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})
    return results
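# Illustrative result shape (assumption): response() returns dicts such as
# {'url': 'https://example.com/result/42', 'title': '...', 'content': '...'},
# optionally with an 'img_src' key when thumbnail_xpath matches, followed by
# {'suggestion': '...'} entries when suggestion_xpath is configured.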