# tokyotoshokan.py
  1. """
  2. Tokyo Toshokan (A BitTorrent Library for Japanese Media)
  3. @website https://www.tokyotosho.info/
  4. @provide-api no
  5. @using-api no
  6. @results HTML
  7. @stable no (HTML can change)
  8. @parse url, title, publishedDate, seed, leech,
  9. filesize, magnetlink, content
  10. """
  11. import re
  12. from urllib.parse import urlencode
  13. from lxml import html
  14. from datetime import datetime
  15. from searx.utils import extract_text, get_torrent_size, int_or_zero
  16. # engine dependent config
  17. categories = ['files', 'videos', 'music']
  18. paging = True
  19. # search-url
  20. base_url = 'https://www.tokyotosho.info/'
  21. search_url = base_url + 'search.php?{query}'
  22. # do search-request
  23. def request(query, params):
  24. query = urlencode({'page': params['pageno'], 'terms': query})
  25. params['url'] = search_url.format(query=query)
  26. return params
  27. # get response from search-request
  28. def response(resp):
  29. results = []
  30. dom = html.fromstring(resp.text)
  31. rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]')
  32. # check if there are no results or page layout was changed so we cannot parse it
  33. # currently there are two rows for each result, so total count must be even
  34. if len(rows) == 0 or len(rows) % 2 != 0:
  35. return []
  36. # regular expression for parsing torrent size strings
  37. size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE)
  38. # processing the results, two rows at a time
  39. for i in range(0, len(rows), 2):
  40. # parse the first row
  41. name_row = rows[i]
  42. links = name_row.xpath('./td[@class="desc-top"]/a')
  43. params = {
  44. 'template': 'torrent.html',
  45. 'url': links[-1].attrib.get('href'),
  46. 'title': extract_text(links[-1])
  47. }
  48. # I have not yet seen any torrents without magnet links, but
  49. # it's better to be prepared to stumble upon one some day
  50. if len(links) == 2:
  51. magnet = links[0].attrib.get('href')
  52. if magnet.startswith('magnet'):
  53. # okay, we have a valid magnet link, let's add it to the result
  54. params['magnetlink'] = magnet
  55. # no more info in the first row, start parsing the second one
  56. info_row = rows[i + 1]
  57. desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0])
  58. for item in desc.split('|'):
  59. item = item.strip()
  60. if item.startswith('Size:'):
  61. try:
  62. # ('1.228', 'GB')
  63. groups = size_re.match(item).groups()
  64. params['filesize'] = get_torrent_size(groups[0], groups[1])
  65. except:
  66. pass
  67. elif item.startswith('Date:'):
  68. try:
  69. # Date: 2016-02-21 21:44 UTC
  70. date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
  71. params['publishedDate'] = date
  72. except:
  73. pass
  74. elif item.startswith('Comment:'):
  75. params['content'] = item
  76. stats = info_row.xpath('./td[@class="stats"]/span')
  77. # has the layout not changed yet?
  78. if len(stats) == 3:
  79. params['seed'] = int_or_zero(extract_text(stats[0]))
  80. params['leech'] = int_or_zero(extract_text(stats[1]))
  81. results.append(params)
  82. return results