PageRenderTime 45ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/search_url/lib/html.py

https://bitbucket.org/charlisim/search_url_web
Python | 101 lines | 62 code | 11 blank | 28 comment | 10 complexity | b26f74082b16f19cfcf91b465c4b4a51 MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. '''
  3. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation, either version 3 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. Created on 09/12/2012
  14. @author: Carlos Simón <jcarlosimonv@gmail.com>
  15. '''
  16. import re
  17. from bs4 import BeautifulSoup
  18. from url import *
  19. class html:
  20. def __init__(self, content):
  21. self.content = BeautifulSoup(content)
  22. ''' return valid links '''
  23. def findLinks(self):
  24. allLinks = self.content.findAll('a')
  25. validLinks = []
  26. for link in allLinks:
  27. try:
  28. validLinks.append(link['href'])
  29. except KeyError:
  30. pass
  31. return validLinks
  32. ''' return links that contain extensions'''
  33. def findLinksExtension(self, links, extensions):
  34. linksExtension = []
  35. for link in links:
  36. for ext in extensions:
  37. if ext in link:
  38. linksExtension.insert(link)
  39. return linksExtension
  40. class listOfExceptions:
  41. def __init__(self):
  42. ''' Not move order, in check method use this order '''
  43. self.WEBS = ["cs50.tv", "/descargar.php"]
  44. def fix_CS50(self, url_toFix):
  45. ''' Search links of this page and look out for valid url
  46. Return only one URL. '''
  47. validURL = ''
  48. if '2012' in url_toFix:
  49. validURL = re.sub('http://cs50.tv', 'http://downloads.cs50.net', url_toFix)
  50. else:
  51. DOWNLOADS_VALUES = ['#download', '?download']
  52. try:
  53. ''' Call to url class and return HTML code of page '''
  54. cs50 = url(url_toFix).returnUrlContent()
  55. ''' Store all links '''
  56. links = html(cs50).findLinks()
  57. ''' Search all valid links '''
  58. for link in links:
  59. for value in DOWNLOADS_VALUES:
  60. if value in link:
  61. validURL = link
  62. except:
  63. pass
  64. return validURL
  65. def fixMejorEnVo(self, url_toFix, origin):
  66. fixUrl = ''
  67. if 'mejorenvo.com' in origin:
  68. fixUrl = 'http://www.mejorenvo.com' + url_toFix
  69. return fixUrl
  70. def check(self, urls, origin):
  71. self.url = []
  72. for url in urls:
  73. if self.WEBS[0] in url:
  74. self.url.append(self.fix_CS50(url))
  75. if self.WEBS[1] in url:
  76. if 'descargar' in url:
  77. self.url.append(self.fixMejorEnVo(url, origin))
  78. else:
  79. self.url.append(url)
  80. return self.url