/scripts/downloadRadioArkivo.py

https://gitlab.com/capiscuas/bitarkivo · Python · 126 lines · 70 code · 26 blank · 30 comment · 11 complexity · dbc4cfb018ba5b7dc7fe80593e4a47ef MD5 · raw file

  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #Archive.org
  4. from internetarchive import get_item
  5. from os import walk
  6. from os import path
  7. import re
  8. import json,codecs
  9. #urls = {'http://anno.onb.ac.at/cgi-content/anno-plus?aid=e0i&datum=1925&size=45'}
  10. import BeautifulSoup
  11. from urllib2 import urlopen
  12. prefix = 'wget "http://radioarkivo.com/arkivujo/rai-'
  13. #20040516
  14. import datetime
  15. date_1 = datetime.date(2004,06,19)
  16. #e(start_date, "%m/%d/%y")
  17. for i in range(500):
  18. date_1 = date_1 - datetime.timedelta(days=1)
  19. month = date_1.strftime('%m')
  20. year = date_1.strftime('%Y')
  21. day = date_1.strftime('%d')
  22. print prefix+year+month+day+".mp3"+'"'
  23. cc
  24. output = codecs.open("radioarkivo.txt", "w", "utf-8")
  25. for i in range(0,56):
  26. url = 'http://www.ameriko.org/eo/rhc/feed?page='+str(i)
  27. print url
  28. html = urlopen(url).read()
  29. soup = BeautifulSoup.BeautifulSoup(html)
  30. #print soup
  31. items = soup.findAll('item')
  32. for item in items:
  33. try:
  34. mp3 = item.enclosure['url']
  35. print mp3
  36. output.write('wget '+mp3)
  37. except:
  38. print 'No ENCLOSURE'
  39. print item
  40. output.close()
  41. b
  42. #print soup
  43. #boccat = soup.find("dl", "boccat")
  44. def getText(url):
  45. #print url
  46. url = 'https://web.archive.org' + url
  47. #html = urlopen(url).read()
  48. print url
  49. #soup = BeautifulSoup.BeautifulSoup(html)
  50. #print soup
  51. #reg = re.compile(r'Temo:')
  52. #elements = [e for e in soup.find_all('a') if reg.match(e.text)]
  53. #print elements
  54. return url
  55. for i in range(1,2):
  56. output.write('\n')
  57. output.write('\n#-----------------------------')
  58. url = 'https://web.archive.org/web/20070224064915/http://www.radioarkivo.org/plenlisto.php/'+str(1)
  59. print url
  60. html = urlopen(url).read()
  61. soup = BeautifulSoup.BeautifulSoup(html)
  62. span = soup.find("span","forta")
  63. titles = span.parent
  64. table = span.parent.parent.parent
  65. _alltr = table.findAll("tr")
  66. for tr in _alltr:
  67. _alltd = tr.findAll("td")
  68. if len(_alltd) == 4:
  69. date, link, lenght, priskribo = _alltd
  70. #print _alltd
  71. #print priskribo.text
  72. if '(plu)' in priskribo.text:
  73. s = str(priskribo)
  74. match = re.search(r'href=[\'"]?([^\'" >]+)', s)
  75. if match:
  76. legu_link = match.group(1)
  77. #print legu_link
  78. priskribo = getText(legu_link)
  79. else:
  80. priskribo = priskribo.text
  81. print date.text,'|', link.text,'|', lenght.text,'|', priskribo
  82. output.write('#'+date.text)
  83. output.write('\n')
  84. output.write('#'+priskribo)
  85. output.write('\n')
  86. output.write('#'+lenght.text)
  87. output.write('\n')
  88. output.write('wget '+link.text)
  89. output.close()
  90. #print 'results', td
  91. #albumyears = []
  92. #for result in td:
  93. #print result
  94. #fields = result.th.findAll("td")
  95. #print fields
  96. #
  97. #albumyears.append(result.a.text)
  98. #print "years = ["+",".join(albumyears) + "]"
  99. #return albumyears
  100. #label = soup.find("label",page_name)