# scripts/downloadRadioArkivo.py
# Source: https://gitlab.com/capiscuas/bitarkivo (scraped copy; Python 2 script)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Download helper for Radio Arkivo / archive.org audio archives.
# NOTE(review): stripped the "- " scrape-residue prefix from every line and
# grouped the imports (stdlib first, then third-party); no import removed.

# stdlib
import codecs
import datetime
import json
import re
from os import path
from os import walk

# third-party (Python 2 era: BeautifulSoup 3, urllib2)
import BeautifulSoup
from internetarchive import get_item
from urllib2 import urlopen

# Base of each daily Radio Arkivo MP3 URL, pre-quoted as a wget command line.
prefix = 'wget "http://radioarkivo.com/arkivujo/rai-'
# Emit one wget command per day, walking backwards from 2004-06-19 for
# 500 days (filenames are rai-YYYYMMDD.mp3).
# NOTE(review): removed a stray `cc` token left in the original, which
# would raise a NameError at runtime; `06` in the date literal was a
# Python-2-only octal-style literal, now written as plain 6.
date_1 = datetime.date(2004, 6, 19)
for _ in range(500):
    date_1 = date_1 - datetime.timedelta(days=1)
    month = date_1.strftime('%m')
    year = date_1.strftime('%Y')
    day = date_1.strftime('%d')
    print(prefix + year + month + day + ".mp3" + '"')
# Scrape the ameriko.org RHC RSS feed (pages 0..55) and write one wget
# command per <enclosure> URL to radioarkivo.txt.
# NOTE(review): removed a stray `b` token after output.close() that would
# raise a NameError; narrowed the bare `except:` to the exceptions an
# item without an enclosure actually raises; added the missing newline
# after each wget command (without it every command landed on one line).
output = codecs.open("radioarkivo.txt", "w", "utf-8")
for page in range(0, 56):
    url = 'http://www.ameriko.org/eo/rhc/feed?page=' + str(page)
    print(url)
    html = urlopen(url).read()
    soup = BeautifulSoup.BeautifulSoup(html)
    for item in soup.findAll('item'):
        try:
            mp3 = item.enclosure['url']
        except (AttributeError, KeyError, TypeError):
            # <item> without an <enclosure> tag (or without a url attribute).
            print('No ENCLOSURE')
            print(item)
        else:
            print(mp3)
            output.write('wget ' + mp3 + '\n')

output.close()
- #print soup
- #boccat = soup.find("dl", "boccat")
def getText(url):
    """Resolve a relative Wayback Machine link to a full URL.

    Prepends the web.archive.org host to *url*, echoes the resulting
    address, and returns it.

    NOTE(review): the original carried a large body of commented-out
    scraping logic (fetching the page and extracting the 'Temo:' text)
    that was never executed; only the URL resolution remains, so the
    dead code has been removed and the Py2-only print statement replaced
    with the parenthesized form (identical output for a single argument).
    """
    full_url = 'https://web.archive.org' + url
    print(full_url)
    return full_url
- for i in range(1,2):
- output.write('\n')
- output.write('\n#-----------------------------')
- url = 'https://web.archive.org/web/20070224064915/http://www.radioarkivo.org/plenlisto.php/'+str(1)
- print url
- html = urlopen(url).read()
- soup = BeautifulSoup.BeautifulSoup(html)
- span = soup.find("span","forta")
- titles = span.parent
- table = span.parent.parent.parent
- _alltr = table.findAll("tr")
-
- for tr in _alltr:
- _alltd = tr.findAll("td")
- if len(_alltd) == 4:
- date, link, lenght, priskribo = _alltd
- #print _alltd
- #print priskribo.text
- if '(plu)' in priskribo.text:
- s = str(priskribo)
- match = re.search(r'href=[\'"]?([^\'" >]+)', s)
- if match:
- legu_link = match.group(1)
- #print legu_link
- priskribo = getText(legu_link)
- else:
- priskribo = priskribo.text
- print date.text,'|', link.text,'|', lenght.text,'|', priskribo
- output.write('#'+date.text)
- output.write('\n')
- output.write('#'+priskribo)
- output.write('\n')
- output.write('#'+lenght.text)
- output.write('\n')
- output.write('wget '+link.text)
-
- output.close()
- #print 'results', td
- #albumyears = []
- #for result in td:
- #print result
- #fields = result.th.findAll("td")
- #print fields
- #
- #albumyears.append(result.a.text)
- #print "years = ["+",".join(albumyears) + "]"
- #return albumyears
- #label = soup.find("label",page_name)