/piratebaybua/parser.py

https://github.com/priendeau/PirateBay-BUA · Python

# Copyright (c) Alexander Borgerth 2010. Copyright (c) 2005-2010, re-edited by
# Maxiste Deams, Patrick Riendeau and Rheault Etccy for an upcoming security
# audit in Canada.
# See LICENSE for details.
from urlparse import urljoin

from piratebaybua import exceptions

# XPath expressions locating the search-result table and the pager div.
_xpath_table = ".//div[@id='content']/div[@id='main-content']/table[@id='searchResult']"
_xpath_amount_of_pages = ".//div[@align='center']"
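
# The two expressions above imply a result-page layout roughly like the
# following. This is an illustrative sketch inferred from the XPaths, not a
# verified copy of the site's markup:
#
#   <div id="content">
#     <div id="main-content">
#       <table id="searchResult"> ...one <tr> per result... </table>
#     </div>
#   </div>
#   <div align="center"> ...numbered page links... </div>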


def find_table(doc):
    """Search for the result table on any page on piratebay.

    Raise TableNotFound to indicate error.
    """
    table = doc.xpath(_xpath_table)
    if len(table) <= 0:
        raise exceptions.TableNotFound("Unable to find table")
    return table[0]


def iterate_over_rows(table):
    """Iterate over the rows of the table.

    Raise InvalidTable to indicate error.
    """
    if table.tag != 'table':
        raise exceptions.InvalidTable("Invalid table")
    for row in table.iterchildren():
        if row.tag != 'tr':
            continue
        yield row


def process_row(row):
    """Process one row of data.

    Return the results in a dictionary; possible keys are:
        - name = Name of the torrent.
        - torrent-info-url = (Full) url to the torrent info page.
        - torrent-url = (Full) url to the torrent file.
        - magnet-url = (Full) magnet url for the torrent.
        - user = (Full) url to the user page (shows all releases by the user).
        - seeders = Number of people seeding the torrent.
        - leechers = Number of people leeching the torrent.
    """
    data = {}
    # Skip the first column; the remaining three should hold the torrent
    # description, the seeder count and the leecher count.
    columns = row.getchildren()[1:]
    if len(columns) != 3:
        raise exceptions.InvalidRow("Row isn't valid or it doesn't contain the columns it should.")
    for ele in columns[0].iterchildren():
        if ele.tag == 'div' and ele.get('class') == 'detName':
            a = ele.find('a')
            data["torrent-info-url"] = urljoin(ele.base, a.get('href'))
            data["name"] = a.text_content()
        elif ele.tag == 'a':
            if ele.get('title') == "Download this torrent":
                data["torrent-url"] = ele.get("href")
            elif ele.get('title') == "Download this torrent using magnet":
                data["magnet-url"] = ele.get("href")
        elif ele.tag == 'font':
            # The uploader sits in a <font> element; anonymous uploads carry
            # no link to a user page.
            a = ele.find('a')
            if a is None:
                data['user'] = "Anonymous"
            else:
                data['user'] = urljoin(ele.base, a.get('href'))
    data['seeders'] = int(columns[1].text_content().strip())
    data['leechers'] = int(columns[2].text_content().strip())
    return data


def process_all_rows(table):
    """Process all rows.

    Generator that processes all the rows in a table.
    """
    for row in iterate_over_rows(table):
        yield process_row(row)


def find_number_of_pages(doc):
    """Find the number of pages, and the current page.

    Indexing starts at 0, so current_page-1 = the page number on the site.

    *Returns*
        A tuple of (current_page, num_pages).
    """
    div = doc.xpath(_xpath_amount_of_pages)
    if len(div) != 1:
        raise exceptions.ElementNotFound("Div element not found on page")
    div = div[0]
    a_eles = div.findall('a')
    if len(a_eles) <= 0:
        raise exceptions.ElementNotFound("Incorrect div element found")
    # Keep only plain numbered page links (anchors without child elements).
    a_eles = [a for a in a_eles if len(a.getchildren()) == 0]
    current_page = None
    num_pages = 1
    for index, element in enumerate(a_eles):
        # The current page is presumably rendered as plain text rather than
        # a link, so the first link whose number doesn't match its 1-based
        # position reveals which page we are on.
        if index+1 != int(element.text_content()):
            if current_page is None:
                current_page = int(element.text_content())-1
            num_pages += 1
    return (current_page, num_pages)
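

# ---------------------------------------------------------------------------
# Usage sketch, not part of the original module: one plausible way to drive
# the parser above with lxml and urllib2. The search URL and variable names
# below are illustrative assumptions; only find_table(), process_all_rows()
# and find_number_of_pages() come from this file.
if __name__ == '__main__':
    import urllib2
    import lxml.html

    # Hypothetical search URL; the real query/page/ordering layout may differ.
    url = "http://thepiratebay.org/search/ubuntu/0/7/0"
    doc = lxml.html.fromstring(urllib2.urlopen(url).read())
    # Make hrefs absolute so urljoin(ele.base, ...) yields full urls.
    doc.make_links_absolute(url)

    table = find_table(doc)
    for torrent in process_all_rows(table):
        print torrent['name'], torrent['seeders'], torrent['leechers']

    current_page, num_pages = find_number_of_pages(doc)
    print "page index %s of %s pages" % (current_page, num_pages)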