stackoverflow.py

## Stackoverflow (It)
#
# @website     https://stackoverflow.com/
# @provide-api not clear (https://api.stackexchange.com/docs/advanced-search)
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, content

from urlparse import urljoin
from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['it']
paging = True

# search-url
url = 'https://stackoverflow.com/'
search_url = url + 'search?{query}&page={pageno}'

# specific xpath variables
results_xpath = '//div[contains(@class,"question-summary")]'
link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
content_xpath = './/div[@class="excerpt"]'
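
# Shape of a result block these xpaths target (an illustrative reconstruction
# from the expressions themselves, not captured from the live site; the link
# may also appear as div.summary > h3 > a, per the link_xpath alternate):
#
#   <div class="question-summary">
#     <div class="result-link"><a href="/questions/...">Title</a></div>
#     <div class="excerpt">Snippet ...</div>
#   </div>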


# do search-request
def request(query, params):
    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      pageno=params['pageno'])

    return params
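
# A sketch of the outcome (assuming only that searx hands in a params dict
# carrying at least 'pageno'):
#
#   params = {'pageno': 2}
#   request('xpath lxml', params)
#   params['url']  # 'https://stackoverflow.com/search?q=xpath+lxml&page=2'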


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(url, link.attrib.get('href'))
        title = escape(extract_text(link))
        content = escape(extract_text(result.xpath(content_xpath)))

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content})

    # return results
    return results
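

# Minimal smoke test, a sketch only: searx normally drives request() and
# response() itself and passes a real HTTP response. FakeResponse and its
# embedded HTML (mirroring the layout sketched above) are illustrative
# assumptions, and running this still requires searx to be importable.
if __name__ == '__main__':
    class FakeResponse(object):
        text = ('<div class="question-summary">'
                '<div class="result-link">'
                '<a href="/questions/1/example">Example question</a>'
                '</div>'
                '<div class="excerpt">Example snippet text.</div>'
                '</div>')

    params = {'pageno': 1}
    print request('test', params)['url']
    # -> https://stackoverflow.com/search?q=test&page=1
    print response(FakeResponse())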