PageRenderTime 51ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/yatvgrabber/grabtv.py

http://yatvgrabber.googlecode.com/
Python | 780 lines | 748 code | 17 blank | 15 comment | 10 complexity | e88af56a65a3b444bb3c0795ee45952b MD5 | raw file
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # standart libraries
  4. import os
  5. import re
  6. import sys
  7. import codecs
  8. import signal
  9. import string
  10. import urllib
  11. #import logging
  12. import datetime
  13. import subprocess
  14. from random import choice
  15. from multiprocessing import Pool
  16. # third party libraries
  17. import argparse
  18. from configobj import ConfigObj
  19. def main():
  20. # argument parsing
  21. ArgumentParser.parseArguments()
  22. # override the urllib useragent - to get the custom user agent
  23. urllib._urlopener = AppOpener()
  24. # run the pre grab step
  25. preGrabCleanUp()
  26. # read / set the grabber configuration file
  27. tmpConfigFile = ArgumentParser.args.configfile
  28. grabConf = ConfigObj(tmpConfigFile, raise_errors=True)
  29. if not os.path.isfile(tmpConfigFile):
  30. # write the default configuration
  31. grabConf['page'] = ['http://www.tvtv.ch',
  32. 'http://www.tvtv.de',
  33. 'http://www.tvtv.at',
  34. 'http://www.tvtv.co.uk',
  35. 'http://www.tvtv.fr',
  36. 'http://www.tvtv.it']
  37. try:
  38. grabConf.write()
  39. except:
  40. print 'unable the write config file: %s' % \
  41. ArgumentParser.args.configfile
  42. sys.exit(-1)
  43. # execute the configure mode
  44. tmpChannelFile = ArgumentParser.args.channelfile
  45. if ArgumentParser.args.configure:
  46. tmpChannelList = {}
  47. [tmpChannelList.update(parseChannelList(page))
  48. for page in reversed(grabConf['page'])]
  49. try:
  50. channelfile = codecs.open(tmpChannelFile, 'w', 'utf-8')
  51. tmpList = []
  52. [tmpList.append('%s#%s\n' % (channelid, tmpChannelList[channelid]))
  53. for channelid in sorted(tmpChannelList.keys())]
  54. channelfile.write(string.joinfields(tmpList, ''))
  55. channelfile.close()
  56. except:
  57. print 'error the writing channel file: %s' % tmpChannelFile
  58. sys.exit(-1)
  59. print 'channel file successfully written, file: %s' % tmpChannelFile
  60. sys.exit(0)
  61. # normal grabbing workflow
  62. # fill the channel list
  63. tmpChanList = {}
  64. lstrip = string.strip
  65. lsplit = string.split
  66. for line in open(tmpChannelFile, 'r'):
  67. if lstrip(line) == '':
  68. continue
  69. try:
  70. (chanid, name) = lsplit(line, '#')
  71. tmpChanList[chanid] = lstrip(name)
  72. except:
  73. print 'error reading channel configuration, line: %s' % line
  74. DataStorage.channelList = tmpChanList
  75. # get the program data
  76. parseChannelData(grabConf['page'][0], ArgumentParser.args.days)
  77. # post grab cleanup - do not cleanup after process locally
  78. if not ArgumentParser.args.local:
  79. postGrabCleanUp()
class DataStorage():
    """Process-wide shared state for the grabbing workflow."""
    # channel id -> display name, filled from the channel file in main()
    channelList = {}
    # open XMLTV output file handle, set in parseChannelData()
    xmlDataFile = None
class ArgumentParser():
    """Command line handling; the parsed options are stored on the class
    attribute ``ArgumentParser.args`` for global access."""
    @staticmethod
    def parseArguments():
        # build the command line interface
        parser = argparse.ArgumentParser(
            description='YaTvGrabber, XMLTV grabbing script',
            epilog='Copyright (C) [2012] [keller.eric, lars.schmohl]',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument('--days', type=int, choices=range(1, 22),
                            default=21,
                            help='days to grab')
        parser.add_argument('--outputfile', type=str,
                            default='tvtv.xml',
                            help='output file with the xmltv data')
        parser.add_argument('--configure', action='store_true',
                            default=False,
                            help='get all channels and create \
                            the channel file (normal grabbing is disabled)')
        parser.add_argument('--configfile', type=str,
                            default='/etc/yatvgrabber/grab.conf',
                            help='configuration file for the grabber')
        parser.add_argument('--channelfile', type=str,
                            default='/etc/yatvgrabber/channel.grab',
                            help='channel file for the grabber')
        parser.add_argument('--cachedir', type=str,
                            default='/var/cache/yatvgrabber',
                            help='cache directory for the grabber')
        parser.add_argument('--local', action='store_true',
                            default=False,
                            help='process only the local stored cache files')
        # parse the arguments
        ArgumentParser.args = parser.parse_args()
        # normalize all configured paths to absolute ones
        ArgumentParser.args.outputfile = \
            os.path.realpath(ArgumentParser.args.outputfile)
        ArgumentParser.args.configfile = \
            os.path.realpath(ArgumentParser.args.configfile)
        ArgumentParser.args.channelfile = \
            os.path.realpath(ArgumentParser.args.channelfile)
        ArgumentParser.args.cachedir = \
            os.path.realpath(ArgumentParser.args.cachedir)
  122. def preGrabCleanUp():
  123. # create the config dir if needed
  124. tmpConfigFile = ArgumentParser.args.configfile
  125. if not os.path.isfile(tmpConfigFile):
  126. configdir = os.path.dirname(tmpConfigFile)
  127. if not os.path.isdir(configdir) and configdir != '':
  128. os.makedirs(configdir)
  129. # create the channel dir if needed
  130. tmpChannelFile = ArgumentParser.args.channelfile
  131. if not os.path.isfile(tmpChannelFile):
  132. channeldir = os.path.dirname(tmpChannelFile)
  133. if not os.path.isdir(channeldir) and channeldir != '':
  134. os.makedirs(channeldir)
  135. tmpCacheDir = ArgumentParser.args.cachedir
  136. if not os.path.isdir(tmpCacheDir) and tmpCacheDir != '':
  137. # create the cache dir if needed
  138. os.makedirs(tmpCacheDir)
  139. else:
  140. # cleanup the grabbed files - just the empty files
  141. subprocess.call('find %s -type f -empty -not -name ".*" \
  142. -exec rm -f \'{}\' +' % tmpCacheDir, shell=True)
  143. def postGrabCleanUp():
  144. tmpCacheDir = ArgumentParser.args.cachedir
  145. # cleanup the grabbed files - just the empty files
  146. subprocess.call('find %s -type f -empty -not -name ".*" \
  147. -exec rm -f \'{}\' +' % tmpCacheDir, shell=True)
  148. # cleanup the grabbed files - files which are not used anymore
  149. subprocess.call('find %s -type f -atime +1 -not -name ".*" \
  150. -exec rm -f \'{}\' +' % tmpCacheDir, shell=True)
class AppOpener(urllib.FancyURLopener):
    """Custom URL opener that reports a regular browser User-Agent.

    urllib sends the class attribute ``version`` as the User-Agent
    header; one of the agents below is picked randomly once at class
    creation time (i.e. per process, not per request).
    """
    user_agents = ['Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)',
                   'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
                   'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:11.0) Gecko/20100101 Firefox/11.0']
    version = choice(user_agents)
  157. def getOverviewPage(base_url):
  158. filename = '%s/%s.html' % (ArgumentParser.args.cachedir,
  159. (base_url.split('/')[-1]).strip())
  160. try:
  161. if not ArgumentParser.args.local:
  162. # always retrieve the overview page in none local mode
  163. urllib.urlretrieve(base_url, filename)
  164. if not os.path.isfile(filename):
  165. raise Warning(filename)
  166. except:
  167. print 'error retrieve / open file: %s' % filename
  168. return ''
  169. return open(filename, 'r').read().decode('utf-8')
  170. def getAdditionalPage(base_url):
  171. filename = '%s/%s.additional.html' % (ArgumentParser.args.cachedir,
  172. (base_url.split('/')[-1]).strip())
  173. try:
  174. if not ArgumentParser.args.local:
  175. # always retrieve the additional page in none local mode
  176. urllib.urlretrieve('%s/tvtv/index.vm?mainTemplate=web%%2FadditionalChannelsSelection.vm' % base_url, filename)
  177. if not os.path.isfile(filename):
  178. raise Warning(filename)
  179. except:
  180. print 'error retrieve / open file: %s' % filename
  181. return ''
  182. return open(filename, 'r').read().decode('utf-8')
  183. def getWeekDayPage(base_url, week, day, channelId):
  184. print 'grabbing %s week %s day %s channelid %s ' % \
  185. (base_url, week, day, channelId)
  186. filename = '%s/week=%s-day=%s-channel=%s.html' % \
  187. (ArgumentParser.args.cachedir, week, day, channelId)
  188. if day > -1:
  189. # always retrieve the day page in none local mode
  190. grabUrl = '%s/tvtv/index.vm?weekId=%s&dayId=%s&chnl=%s' % \
  191. (base_url, week, day, channelId)
  192. else:
  193. # use channelWeek to get the hole week for one channel
  194. grabUrl = '%s/tvtv/index.vm?weekId=%s&dayId=0&weekChannel=%s' % \
  195. (base_url, week, channelId)
  196. try:
  197. if not ArgumentParser.args.local:
  198. urllib.urlretrieve(grabUrl, filename)
  199. if not os.path.isfile(filename):
  200. raise Warning(filename)
  201. except:
  202. print 'error retrieve / open file: %s' % filename
  203. return ''
  204. return filename
  205. def getProgramPage(base_url, programId):
  206. # use the page from cache if available
  207. filename = '%s/%s.html' % (ArgumentParser.args.cachedir, programId)
  208. try:
  209. # always cached the program page if available
  210. if not ArgumentParser.args.local and not os.path.isfile(filename):
  211. urllib.urlretrieve('%s/tvtv/web/programdetails.vm?programmeId=%s' % \
  212. (base_url, programId), filename)
  213. if not os.path.isfile(filename):
  214. raise Warning(filename)
  215. except:
  216. print 'error retrieve / open file: %s' % filename
  217. return ''
  218. return filename
  219. def parseChannelList(pagename):
  220. channellist = {}
  221. # parse the main page
  222. for line in getOverviewPage(pagename).split('\n'):
  223. for foundId in RegExStorage.regExChannelId1.findall(line):
  224. for foundName in RegExStorage.regExChannelName.findall(line):
  225. channellist[foundId] = '%s (%s)' % (foundName, pagename)
  226. # additional page page
  227. for line in getAdditionalPage(pagename).split('\n'):
  228. for foundId in RegExStorage.regExChannelId2.findall(line):
  229. channellist[foundId] = '%s (%s)' % ((line.split('>')[-1]).strip(),
  230. pagename)
  231. return channellist
def parseChannelData(pagename, days):
    """Grab *days* worth of schedule data for all configured channels
    from *pagename* and write the XMLTV output file.

    Downloads happen in the main process; parsing of the fetched pages
    is offloaded to a multiprocessing pool.
    """
    # build the grab plan: full weeks first, then the leftover days
    grabPlan = []
    weeksToGrab = days // 7
    leftoverDaysToGrab = days % 7
    if weeksToGrab > 0:
        for weekno in range(0, weeksToGrab):
            # day -1 requests the whole week in one page
            grabPlan.append([weekno, -1])
    if leftoverDaysToGrab > 0:
        for dayno in range(0, leftoverDaysToGrab):
            grabPlan.append([weeksToGrab, dayno])
    # multiprocessing - workers ignore SIGINT (see initializeProcess)
    pool = Pool(processes=None, initializer=initializeProcess)
    resultsList = []
    for entry in grabPlan:
        for channelId in DataStorage.channelList.keys():
            # fetch in the main process, parse asynchronously in the pool
            pageFileName = getWeekDayPage(pagename, entry[0], entry[1],
                                          channelId)
            resultsList.append(pool.apply_async(processChannelPage,
                                                (pageFileName,)))
    # open the output file
    try:
        DataStorage.xmlDataFile = codecs.open(
            ArgumentParser.args.outputfile, 'w', 'utf-8')
    except:
        print 'error open outputfile, file: ' + ArgumentParser.args.outputfile
        sys.exit(-1)
    # write header
    tmpData = []
    tmpData.append('<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n')
    tmpData.append('<!DOCTYPE tv SYSTEM "xmltv.dtd">\n')
    tmpData.append('<tv source-info-url="%s" source-info-name="tvtv" generator-info-url="http://code.google.com/p/yatvgrabber/" generator-info-name="yatvgrabber">\n' % pagename)
    # list the channels
    for channelid in DataStorage.channelList.keys():
        tmpData.append(' <channel id="%s">\n' % channelid)
        tmpData.append(' <display-name>%s</display-name>\n' % \
            DataStorage.channelList[channelid].decode('latin1'))
        tmpData.append(' </channel>\n')
    DataStorage.xmlDataFile.write(string.joinfields(tmpData, ''))
    # collect the results (programme id lists) from all workers
    programIdList = []
    [programIdList.extend(tmpResults.get(timeout=10))
     for tmpResults in resultsList]
    # program page getting loop
    totalProgrammeIds = len(programIdList)
    stepProgrammeIds = 0
    # start far in the past so the first progress line prints immediately
    tmpTime1 = datetime.datetime(2012, 1, 1)
    for programId in programIdList:
        # debug output - at most one progress line every 5 minutes
        tmpTime2 = datetime.datetime.today()
        if (tmpTime2 > (tmpTime1 + datetime.timedelta(minutes=5))):
            print "[%s] progress: %s of %s program pages" % \
                (tmpTime2.strftime('%Y-%m-%d %H:%M:%S'),
                 stepProgrammeIds,
                 totalProgrammeIds)
            tmpTime1 = tmpTime2
        # get the program page
        programFileName = getProgramPage(pagename, programId)
        # pass the filename to the process for parsing; the callback
        # appends the serialized programme to the output file
        if programFileName != '':
            pool.apply_async(processProgramPage,
                             (programId, programFileName,),
                             callback=contentInjectCallback)
            #retValue = processProgramPage(programId, programFileName)
            #contentInjectCallback(retValue)
        stepProgrammeIds += 1
    pool.close()
    pool.join()
    DataStorage.xmlDataFile.write('</tv>\n')
    DataStorage.xmlDataFile.close()
    print 'xmltv file successfully written, file: %s' % \
        ArgumentParser.args.outputfile
  303. def processChannelPage(filename):
  304. tmpData = ''
  305. try:
  306. tmpData = open(filename, 'r').read().decode('utf-8')
  307. except:
  308. return []
  309. return RegExStorage.regExProgramId.findall(tmpData)
  310. def contentInjectCallback(programEntry):
  311. llen = len
  312. for programid in programEntry.keys():
  313. if 0 == llen(programEntry[programid]):
  314. print 'parsing error of programid %s' % programid
  315. continue
  316. # get the programme data
  317. pdata = programEntry[programid]
  318. # check min requirements
  319. if 'start' not in pdata or \
  320. 'channel' not in pdata or \
  321. 'title' not in pdata:
  322. print 'minimal required data not available of programid %s' % \
  323. programid
  324. continue
  325. if pdata['start'] == '' or \
  326. pdata['channel'] == '' or \
  327. pdata['title'] == '':
  328. print 'minimal required data not available of programid %s' % \
  329. programid
  330. continue
  331. tmpData = []
  332. # concat the programme tag
  333. tmpData.append(' <programme start="%s" ' % pdata['start'])
  334. if 'finish' in pdata and pdata['finish'] != '':
  335. tmpData.append('finish="%s" ' % pdata['finish'])
  336. tmpData.append('channel="%s">\n' % pdata['channel'])
  337. # write the title
  338. if 'title' in pdata:
  339. for tmpLang in pdata['title'].keys():
  340. if pdata['title'][tmpLang] != '':
  341. tmpData.append(' <title lang="%s">%s</title>\n' % \
  342. (tmpLang, pdata['title'][tmpLang]))
  343. # write the sub-title
  344. if 'sub-title' in pdata:
  345. for tmpLang in pdata['sub-title'].keys():
  346. if pdata['sub-title'][tmpLang] != '':
  347. tmpData.append(' <sub-title lang="%s">%s</sub-title>\n' % \
  348. (tmpLang, pdata['sub-title'][tmpLang]))
  349. # write the description
  350. if 'desc' in pdata:
  351. for tmpLang in pdata['desc'].keys():
  352. if pdata['desc'][tmpLang] != '':
  353. tmpData.append(' <desc lang="%s">%s</desc>\n' % \
  354. (tmpLang, pdata['desc'][tmpLang]))
  355. tmpCredits = []
  356. # director
  357. if 'director' in pdata:
  358. for tmpDirector in pdata['director']:
  359. if tmpDirector != '':
  360. tmpCredits.append(' <director>%s</director>\n' % \
  361. tmpDirector)
  362. # actors
  363. if 'actor' in pdata:
  364. for tmpActorData in pdata['actor']:
  365. for tmpActor in tmpActorData.keys():
  366. if tmpActor != '':
  367. if tmpActorData[tmpActor] == '':
  368. tmpCredits.append(' <actor>%s</actor>\n' % \
  369. tmpActor)
  370. else:
  371. tmpCredits.append(' <actor role="%s">%s</actor>\n' % \
  372. (tmpActorData[tmpActor],
  373. tmpActor))
  374. # writer
  375. if 'writer' in pdata:
  376. for tmpWriter in pdata['writer']:
  377. if tmpWriter != '':
  378. tmpCredits.append(' <writer>%s</writer>\n' % \
  379. tmpWriter)
  380. # adapter
  381. if 'adapter' in pdata:
  382. for tmpAdapter in pdata['adapter']:
  383. if tmpAdapter != '':
  384. tmpCredits.append(' <adapter>%s</adapter>\n' % \
  385. tmpAdapter)
  386. # producer
  387. if 'producer' in pdata:
  388. for tmpProducer in pdata['producer']:
  389. if tmpProducer != '':
  390. tmpCredits.append(' <producer>%s</producer>\n' % \
  391. tmpProducer)
  392. # composer
  393. if 'composer' in pdata:
  394. for tmpComposer in pdata['composer']:
  395. if tmpComposer != '':
  396. tmpCredits.append(' <composer>%s</composer>\n' % \
  397. tmpComposer)
  398. # editor
  399. if 'editor' in pdata:
  400. for tmpEditor in pdata['editor']:
  401. if tmpEditor != '':
  402. tmpCredits.append(' <editor>%s</editor>\n' % \
  403. tmpEditor)
  404. # presenter
  405. if 'presenter' in pdata:
  406. for tmpPresenter in pdata['presenter']:
  407. if tmpPresenter != '':
  408. tmpCredits.append(' <presenter>%s</presenter>\n' % \
  409. tmpPresenter)
  410. # commentator
  411. if 'commentator' in pdata:
  412. for tmpCommentator in pdata['commentator']:
  413. if tmpCommentator != '':
  414. tmpCredits.append(' <commentator>%s</commentator>\n' % \
  415. tmpCommentator)
  416. # guest
  417. if 'guest' in pdata:
  418. for tmpGuest in pdata['guest']:
  419. if tmpGuest != '':
  420. tmpCredits.append(' <guest>%s</guest>\n' % tmpGuest)
  421. # write the credits
  422. if llen(tmpCredits) > 0:
  423. tmpData.append(' <credits>\n')
  424. tmpData.extend(tmpCredits)
  425. tmpData.append(' </credits>\n')
  426. # production date
  427. if 'date' in pdata and pdata['date'] != '':
  428. tmpData.append(' <date>%s</date>\n' % pdata['date'])
  429. # category
  430. if 'category' in pdata:
  431. for tmpLang in pdata['category']:
  432. for tmpCategory in pdata['category'][tmpLang]:
  433. if tmpCategory != '':
  434. tmpData.append(' <category lang="%s">%s</category>\n' % \
  435. (tmpLang, tmpCategory))
  436. # language
  437. if 'language' in pdata:
  438. for tmpLang in pdata['language']:
  439. for tmpLanguage in pdata['language'][tmpLang]:
  440. if tmpLanguage != '':
  441. tmpData.append(' <language lang="%s">%s</language>\n' % \
  442. (tmpLang, tmpLanguage))
  443. # orig-language
  444. if 'orig-language' in pdata:
  445. for tmpLang in pdata['orig-language']:
  446. for tmpOrigLanguage in pdata['orig-language'][tmpLang]:
  447. if tmpOrigLanguage != '':
  448. tmpData.append(' <orig-language lang="%s">%s</orig-language>\n' % \
  449. (tmpLang, tmpOrigLanguage))
  450. # length
  451. if 'length' in pdata:
  452. for tmpUnits in pdata['length']:
  453. for tmpValue in pdata['length'][tmpUnits]:
  454. if tmpValue != '':
  455. tmpData.append(' <length units="%s">%s</length>\n' % \
  456. (tmpUnits, tmpValue))
  457. # icon
  458. if 'icon' in pdata and llen(pdata['icon']) > 0:
  459. tmpIcon = []
  460. if 'src' in pdata['icon'] and pdata['icon']['src'] != '':
  461. tmpIcon = [' <icon src="%s"' % pdata['icon']['src']]
  462. if 'width' in pdata['icon'] and pdata['icon']['width'] != '':
  463. tmpIcon.append(' width="%s"' % pdata['icon']['width'])
  464. if 'height' in pdata['icon'] and pdata['icon']['height'] != '':
  465. tmpIcon.append(' height="%s"' % pdata['icon']['height'])
  466. tmpIcon.append(' />\n')
  467. if llen(tmpIcon) > 0:
  468. tmpData.extend(tmpIcon)
  469. # country
  470. if 'country' in pdata:
  471. for tmpLang in pdata['country']:
  472. for tmpCountry in pdata['country'][tmpLang]:
  473. if tmpCountry != '':
  474. tmpData.append(' <country lang="%s">%s</country>\n' % \
  475. (tmpLang, tmpCountry))
  476. # episode numbers
  477. if 'episode-num' in pdata:
  478. for tmpSystem in pdata['episode-num']:
  479. if pdata['episode-num'][tmpSystem] != '':
  480. tmpData.append(' <episode-num system="%s">%s</episode-num>\n' % \
  481. (tmpSystem, pdata['episode-num'][tmpSystem]))
  482. # video quality
  483. if "HD" in DataStorage.channelList[pdata['channel']]:
  484. tmpData.append(' <video>\n')
  485. tmpData.append(' <quality>HDTV</quality>\n')
  486. tmpData.append(' </video>\n')
  487. # regExRating
  488. if 'rating' in pdata:
  489. for tmpSystem in pdata['rating']:
  490. if pdata['rating'][tmpSystem] != '':
  491. tmpData.append(' <regExRating system="%s">\n' % tmpSystem)
  492. tmpData.append(' <value>%s</value>\n' % \
  493. pdata['rating'][tmpSystem])
  494. tmpData.append(' </regExRating>\n')
  495. # end programme tag
  496. tmpData.append(' </programme>\n')
  497. DataStorage.xmlDataFile.write(string.joinfields(tmpData, ''))
  498. def initializeProcess():
  499. # ignore sig int so the main process can be interrupted
  500. signal.signal(signal.SIGINT, signal.SIG_IGN)
def processProgramPage(programId, filename):
    """Worker: parse one cached programme detail page.

    Returns {programId: data-dict}; the data dict is empty when the
    page could not be parsed (the broken cache file is removed so it
    gets re-downloaded on the next run).
    """
    os.utime(filename, None) # "touch" the file
    programPage = open(filename, 'r').read().decode('utf-8')
    programData = {programId: {}}
    # min data found?
    try:
        if RegExStorage.regExChannelId3.search(programPage) == None:
            raise Warning(programId)
        if RegExStorage.regExDate.search(programPage) == None:
            raise Warning(programId)
        if RegExStorage.regExStart.search(programPage) == None:
            raise Warning(programId)
    except:
        # mandatory data missing - drop the cached page and give up
        os.remove(filename)
        return programData
    # get the channel id from the page
    for foundStr in RegExStorage.regExChannelId3.findall(programPage):
        tempStr = CleanFromTags(foundStr)
        if tempStr != '':
            programData[programId]['channel'] = tempStr
    # get the title from the page
    for foundStr in RegExStorage.regExTitle.findall(programPage):
        tempStr = CleanFromTags(foundStr)
        if tempStr != '':
            programData[programId]['title'] = {'de': tempStr}
    # production year
    for foundStr in RegExStorage.regExProductionYear.findall(programPage):
        programData[programId]['date'] = CleanFromTags(foundStr)
    # cut down the content to the detail section of the page
    try:
        programPage = programPage.split(r'class="program-content"')[1]
        programPage = programPage.split(r'class="list_detail"')[0]
    except:
        os.remove(filename)
        return {programId: {}}
    # date (dd.mm.yyyy, last match wins)
    date = ''
    for foundStr in RegExStorage.regExDate.findall(programPage):
        date = foundStr.strip()
    # start date
    startdate = datetime.datetime(2012, 1, 1)
    for foundStr in RegExStorage.regExStart.findall(programPage):
        try:
            (day, month, year) = date.split('.')
            (starthour, startminute) = foundStr.strip().split(':')
            startdate = datetime.datetime(int(year), int(month), int(day),
                                          int(starthour), int(startminute))
            programData[programId]['start'] = startdate.strftime('%Y%m%d%H%M')
        except:
            os.remove(filename)
            return {programId: {}}
    # stop date
    # NOTE: day/month/year deliberately leak out of the start-date loop
    # above; the regExStart check at the top guarantees they are set
    for foundStr in RegExStorage.regExStop.findall(programPage):
        try:
            (endhour, endminute) = foundStr.strip().split(':')
            enddate = datetime.datetime(int(year), int(month), int(day),
                                        int(endhour), int(endminute))
            if startdate > enddate:
                # program ends next day
                enddate = enddate + datetime.timedelta(days=1)
            programData[programId]['finish'] = enddate.strftime('%Y%m%d%H%M')
        except:
            pass # optional data
    # original title
    for foundStr in RegExStorage.regExOrgTitle.findall(programPage):
        tmpTitle = {'de': CleanFromTags(foundStr)}
        if 'title' in programData[programId] and \
           programData[programId]['title']['de'] != '':
            if tmpTitle['de'] in programData[programId]['title']['de']:
                # org title found, just use the title
                tmpTitle['de'] = programData[programId]['title']['de']
            else:
                # org title in title not found - concat the titles
                tmpTitle['de'] = "%s - %s" % \
                    (tmpTitle['de'], programData[programId]['title']['de'])
        programData[programId]['title'] = tmpTitle
    if 'title' not in programData[programId] or \
       programData[programId]['title']['de'] == '':
        os.remove(filename)
        return {programId: {}}
    # sub-title
    for foundStr in RegExStorage.regExSubtitle.findall(programPage):
        programData[programId]['sub-title'] = {'de': CleanFromTags(foundStr)}
    # description
    for foundStr in RegExStorage.regExDescription.findall(programPage):
        programData[programId]['desc'] = {'de': CleanFromTags(foundStr)}
    # director (comma separated list)
    for foundStr in RegExStorage.regExDirector.findall(programPage):
        programData[programId]['director'] = []
        for tmpDirector in CleanFromTags(foundStr).split(','):
            programData[programId]['director'].append(tmpDirector.strip())
    # actors: "Name (Role)" -> {name: role}, role may be empty
    for foundStr in RegExStorage.regExActors.findall(programPage):
        programData[programId]['actor'] = []
        for tmpActor in CleanFromTags(foundStr).split(','):
            try:
                (fActor, fRole) = tmpActor.strip().split('(', 1)
                programData[programId]['actor'].append({fActor.strip(): fRole.strip(') ')})
            except:
                # no "(Role)" part present
                programData[programId]['actor'].append({tmpActor.strip(): ''})
    # producer
    for foundStr in RegExStorage.regExProducer.findall(programPage):
        programData[programId]['producer'] = []
        for tmpProducer in CleanFromTags(foundStr).split(','):
            programData[programId]['producer'].append(tmpProducer.strip())
    # writer
    for foundStr in RegExStorage.regExWriter.findall(programPage):
        programData[programId]['writer'] = []
        for tmpAuthor in CleanFromTags(foundStr).split(','):
            programData[programId]['writer'].append(tmpAuthor.strip())
    # presenter
    for foundStr in RegExStorage.regExPresenter.findall(programPage):
        programData[programId]['presenter'] = []
        for tmpPresenter in CleanFromTags(foundStr).split(','):
            programData[programId]['presenter'].append(tmpPresenter.strip())
    # category
    for foundStr in RegExStorage.regExCategory.findall(programPage):
        programData[programId]['category'] = {'de': []}
        for tmpCategory in CleanFromTags(foundStr).split(','):
            programData[programId]['category']['de'].append(tmpCategory.strip())
    # country
    for foundStr in RegExStorage.regExCountry.findall(programPage):
        programData[programId]['country'] = {'de': []}
        for tmpCountry in CleanFromTags(foundStr).split(','):
            programData[programId]['country']['de'].append(tmpCountry.strip())
    # episode in format xmltv_ns ("season.episode/total." zero-based)
    episodeString = ''
    for foundStr in RegExStorage.regExEpisode.findall(programPage):
        episodeString = '%s %s' % (episodeString, foundStr.strip())
    episodeString = episodeString.strip()
    if episodeString != '':
        tempstr = ''
        for tmpSeason in RegExStorage.regExSeason.findall(episodeString):
            # xmltv_ns counts from 0, the page counts from 1
            tempstr = str(int(tmpSeason) - 1)
        tempstr = '%s.' % tempstr
        for tmpEpisode in RegExStorage.regExEpisodeNum.findall(episodeString):
            tempstr = '%s%s' % (tempstr, int(tmpEpisode) - 1)
        for tmpEpisodeTotal in RegExStorage.regExEpisodeTotal.findall(episodeString):
            tempstr = '%s/%s' % (tempstr, tmpEpisodeTotal)
        if tempstr != ".":
            programData[programId]['episode-num'] = {'xmltv_ns': '%s.' % tempstr}
    # kid protection (German FSK rating)
    for foundStr in RegExStorage.regExRating.findall(programPage):
        programData[programId]['rating'] = {'FSK': CleanFromTags(foundStr)}
    return programData
class RegExStorage():
    """Pre-compiled regular expressions shared by all worker processes."""
    # for the configuration workflow
    regExChannelId1 = re.compile(r'weekChannel=([0-9]+)"')
    regExChannelName = re.compile(r'class="">(.*)<')
    regExChannelId2 = re.compile(r'channelLogo=([0-9]+)"')
    # for the grab workflow
    regExProgramId = re.compile(r'programmeId=([0-9]+)')
    # channel id / title are embedded in tracking variables s.prop16 / s.prop5
    regExChannelId3 = re.compile(r's.prop16="[^\(]+\(([0-9]+)\)"')
    regExTitle = re.compile(r's.prop5="(.*)\[[0-9]+\]"')
    regExSubtitle = re.compile(r'<span class="fb-b9">(.*?)</span>')
    regExEpisode = re.compile(r'<span class="fn-b9">(.*?)</span>')
    regExProductionYear = re.compile(r'<td class="fb-b9 trailing">.*?([0-9]{4}).*?</td>', re.DOTALL)
    regExDescription = re.compile(r'<span class="fn-b10">(.*?)</span>', re.DOTALL)
    # dd.mm.yyyy date inside tag text
    regExDate = re.compile(r'>[^<]+([0-9]{2}\.[0-9]{2}\.[0-9]{4})<')
    # German page labels: "Beginn" = start, "Ende" = end
    regExStart = re.compile(r'>Beginn: ([0-9]{2}:[0-9]{2}) Uhr<')
    regExStop = re.compile(r'>Ende: ([0-9]{2}:[0-9]{2}) Uhr<')
    # German table row labels on the detail page
    regExActors = re.compile(r'>Darsteller:</td>(.+?)</tr>', re.DOTALL)
    regExProducer = re.compile(r'>Produktion:</td>(.+?)</tr>', re.DOTALL)
    regExDirector = re.compile(r'>Regie:</td>(.+?)</tr>', re.DOTALL)
    regExWriter = re.compile(r'>Autor:</td>(.+?)</tr>', re.DOTALL)
    regExRating = re.compile(r'>FSK:</td>.*: ([0-9]+).*</tr>', re.DOTALL)
    regExCategory = re.compile(r'>Kategorie:</td>(.+?)</tr>', re.DOTALL)
    regExCountry = re.compile(r'>Land:</td>(.+?)</tr>', re.DOTALL)
    # "Staffel" = season, "Folge" = episode
    regExSeason = re.compile(r'Staffel ([0-9]+)')
    regExEpisodeNum = re.compile(r'Folge ([0-9]+)')
    regExEpisodeTotal = re.compile(r'Folge [0-9]+/([0-9]+)')
    regExOrgTitle = re.compile(r'>Orginaltitel:</td>(.+?)</tr>', re.DOTALL)
    regExPresenter = re.compile(r'sentiert von:</td>(.+?)</tr>', re.DOTALL)
    # treat special chars and words; the numeric keys define the
    # substitution order (applied via sorted() in CleanFromTags)
    charSpecial = {1: [re.compile(r'<[^>]*>'), r' '],
                   2: [re.compile(r'&nbsp;'), r' '],
                   3: [re.compile(r'\(Wiederholung\)'), r''],
                   4: [re.compile(r'^Reihe: .+'), r''],
                   5: [re.compile(r'&'), r'&amp;'],
                   6: [re.compile(r'"'), r'&quot;'],
                   97: [re.compile(r'c\<t'), r'c\'t'],
                   98: [re.compile(r'[\n\t ]+', re.DOTALL), r' '],
                   99: [re.compile(r',$'), r'']}
  684. def CleanFromTags(inputStr):
  685. retStr = inputStr
  686. for key in sorted(RegExStorage.charSpecial.keys()):
  687. # clean all special characters
  688. retStr = RegExStorage.charSpecial[key][0].sub(
  689. RegExStorage.charSpecial[key][1], retStr)
  690. return retStr.strip()
# script entry point (command line tool, no GUI)
if __name__ == "__main__":
    main()