# xpath.py — generic XPath-based search-engine scraper module
  1. from lxml import html
  2. from urllib import urlencode, unquote
  3. from urlparse import urlparse, urljoin
  4. from lxml.etree import _ElementStringResult
# Engine configuration. These module-level settings are left empty here and
# are presumably assigned externally by the engine loader (from a settings
# file) before request()/response() are called — TODO confirm against caller.
search_url = None        # URL template containing a {query} placeholder
url_xpath = None         # XPath selecting the result link node(s)
content_xpath = None     # XPath selecting the result snippet node(s)
title_xpath = None       # XPath selecting the result title node(s)
suggestion_xpath = ''    # optional XPath for query suggestions ('' disables)
results_xpath = ''       # optional XPath for whole result containers ('' disables)
  11. '''
  12. if xpath_results is list, extract the text from each result and concat the list
  13. if xpath_results is a xml element, extract all the text node from it
  14. ( text_content() method from lxml )
  15. if xpath_results is a string element, then it's already done
  16. '''
  17. def extract_text(xpath_results):
  18. if type(xpath_results) == list:
  19. # it's list of result : concat everything using recursive call
  20. if not len(xpath_results):
  21. raise Exception('Empty url resultset')
  22. result = ''
  23. for e in xpath_results:
  24. result = result + extract_text(e)
  25. return result
  26. elif type(xpath_results) == _ElementStringResult:
  27. # it's a string
  28. return ''.join(xpath_results)
  29. else:
  30. # it's a element
  31. return xpath_results.text_content()
  32. def extract_url(xpath_results):
  33. url = extract_text(xpath_results)
  34. if url.startswith('//'):
  35. # add http or https to this kind of url //example.com/
  36. parsed_search_url = urlparse(search_url)
  37. url = parsed_search_url.scheme+url
  38. elif url.startswith('/'):
  39. # fix relative url to the search engine
  40. url = urljoin(search_url, url)
  41. # normalize url
  42. url = normalize_url(url)
  43. return url
  44. def normalize_url(url):
  45. parsed_url = urlparse(url)
  46. # add a / at this end of the url if there is no path
  47. if not parsed_url.netloc:
  48. raise Exception('Cannot parse url')
  49. if not parsed_url.path:
  50. url += '/'
  51. # FIXME : hack for yahoo
  52. if parsed_url.hostname == 'search.yahoo.com'\
  53. and parsed_url.path.startswith('/r'):
  54. p = parsed_url.path
  55. mark = p.find('/**')
  56. if mark != -1:
  57. return unquote(p[mark+3:]).decode('utf-8')
  58. return url
  59. def request(query, params):
  60. query = urlencode({'q': query})[2:]
  61. params['url'] = search_url.format(query=query)
  62. params['query'] = query
  63. return params
  64. def response(resp):
  65. results = []
  66. dom = html.fromstring(resp.text)
  67. if results_xpath:
  68. for result in dom.xpath(results_xpath):
  69. url = extract_url(result.xpath(url_xpath))
  70. title = extract_text(result.xpath(title_xpath)[0])
  71. content = extract_text(result.xpath(content_xpath)[0])
  72. results.append({'url': url, 'title': title, 'content': content})
  73. else:
  74. for url, title, content in zip(
  75. map(extract_url, dom.xpath(url_xpath)),
  76. map(extract_text, dom.xpath(title_xpath)),
  77. map(extract_text, dom.xpath(content_xpath))
  78. ):
  79. results.append({'url': url, 'title': title, 'content': content})
  80. if not suggestion_xpath:
  81. return results
  82. for suggestion in dom.xpath(suggestion_xpath):
  83. results.append({'suggestion': extract_text(suggestion)})
  84. return results