stackoverflow.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. ## Stackoverflow (It)
  2. #
  3. # @website https://stackoverflow.com/
  4. # @provide-api not clear (https://api.stackexchange.com/docs/advanced-search)
  5. #
  6. # @using-api no
  7. # @results HTML
  8. # @stable no (HTML can change)
  9. # @parse url, title, content
  10. from urlparse import urljoin
  11. from cgi import escape
  12. from urllib import urlencode
  13. from lxml import html
  14. # engine dependent config
  15. categories = ['it']
  16. paging = True
  17. # search-url
  18. url = 'http://stackoverflow.com/'
  19. search_url = url+'search?{query}&page={pageno}'
  20. # specific xpath variables
  21. results_xpath = '//div[contains(@class,"question-summary")]'
  22. link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
  23. title_xpath = './/text()'
  24. content_xpath = './/div[@class="excerpt"]//text()'
  25. # do search-request
  26. def request(query, params):
  27. params['url'] = search_url.format(query=urlencode({'q': query}),
  28. pageno=params['pageno'])
  29. return params
  30. # get response from search-request
  31. def response(resp):
  32. results = []
  33. dom = html.fromstring(resp.text)
  34. # parse results
  35. for result in dom.xpath(results_xpath):
  36. link = result.xpath(link_xpath)[0]
  37. href = urljoin(url, link.attrib.get('href'))
  38. title = escape(' '.join(link.xpath(title_xpath)))
  39. content = escape(' '.join(result.xpath(content_xpath)))
  40. # append result
  41. results.append({'url': href,
  42. 'title': title,
  43. 'content': content})
  44. # return results
  45. return results