/diveintopython.ru/scrape.py

https://github.com/pcsforeducation/diveintopython · Python

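# Overview (inferred from the code below; the file itself carries no module
# docstring): scrape() downloads every absolute link found in a saved copy of
# the table of contents (dip.html), and purify() strips Wayback Machine chrome
# from a mirrored page, rewrites its links to point at ru.diveintopython.net,
# and appends a Google Analytics tracking snippet.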
from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup, Comment
import urllib
import os
import shutil
import string
import re

URL = 'http://ru.diveintopython.net/'
GOOGLE_ANALYTICS_KEY = 'UA-9740779-18'
def scrape():
    # Read the saved table of contents and mirror every absolute link it contains.
    try:
        with open('dip.html', 'r') as p:
            soup = BeautifulSoup(p.read())
    except IOError, e:
        # IOError carries errno/strerror, not returncode/message
        print "io error code: %d msg: %s" % (e.errno, e.strerror)
        return None
    for i in soup.findAll('a'):
        if i.has_key('href'):
            if i['href'][0:4] == 'http' and '#' not in i['href']:
                try:
                    # Save each page as <last-dir>/<basename>, creating the
                    # directory on demand.
                    filename = i['href'].split('/')[-2] + '/' + i['href'].split('/')[-1]
                    print "saving %s into %s" % (i['href'], filename, )
                    if not os.path.exists(i['href'].split('/')[-2]):
                        os.mkdir(i['href'].split('/')[-2])
                    with open(filename, 'w') as out:
                        out.write(urllib.urlopen(i['href']).read())
                except IOError, e:
                    pass
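
# Every archive rewrite in purify() below uses the same index arithmetic: a
# Wayback Machine URL such as
# http://web.archive.org/web/20090228084952/http://diveintopython.org/toc/index.html
# splits on '/' so that the first eight components ('http:', '', 'web.archive.org',
# 'web', '<timestamp>', 'http:', '', '<host>') are the archive wrapper and
# everything from index 8 on is the original path ('toc', 'index.html'). A small
# illustrative helper capturing this (an addition for clarity; purify() inlines
# the expression instead of calling it):
def unwrap_archive_url(archived):
    # e.g. returns 'http://ru.diveintopython.net/toc/index.html' for the URL above
    return URL + '/'.join(archived.split('/')[8:])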
def purify(filename):
    with open(filename, 'r') as f:
        soup = BeautifulSoup(f)
    print "working on %s" % (filename, )
    # Drop the Wayback Machine toolbar (the div with id "wm-ipp").
    for div in soup.findAll('div'):
        if div.has_key('id'):
            if div['id'] == 'wm-ipp':
                div.extract()
    # Strip all scripts and HTML comments.
    for script in soup.findAll('script'):
        script.extract()
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # Point <link> elements at the mirror instead of the archive.
    for link in soup.findAll('link'):
        if link.has_key('rev'):
            if link['rev'] == 'made':
                link['href'] = 'josh@servercobra.com'
        if link.has_key('rel'):
            if link['rel'] == "home":
                link['href'] = URL
            if link['rel'] == "stylesheet":
                link['href'] = "/css/diveintopython.css"
            if link['rel'] == "next" or link['rel'] == "up" or link['rel'] == "previous":
                link['href'] = URL + '/'.join(link['href'].split('/')[8:])
    # Unwrap archived anchors back to the mirror's own URLs.
    for a in soup.findAll('a'):
        if a.has_key('href'):
            if 'http://web.archive.org/' in a['href']:
                print "cleaning up link: %s" % (a['href'], )
                a['href'] = URL + '/'.join(a['href'].split('/')[8:])
            if 'mailto:' in a['href']:
                a['href'] = 'mailto:josh@servercobra.com'
    # Archived search forms get pointed at Google instead.
    for form in soup.findAll('form'):
        if form.has_key('action'):
            if 'http://web.archive.org/' in form['action']:
                form['action'] = 'http://www.google.com/' + '/'.join(form['action'].split('/')[8:])
    for img in soup.findAll('img'):
        if img.has_key('src'):
            if 'http://web.archive.org/' in img['src']:
                img['src'] = URL + '/'.join(img['src'].split('/')[8:])
    # Insert the Google Analytics async tracking code at the end of <head>.
    code = '''<script type="text/javascript">
  var _gaq = _gaq || [];
  _gaq.push(['_setAccount', '%s']);
  _gaq.push(['_trackPageview']);
  (function() {
    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
  })();
</script>''' % (GOOGLE_ANALYTICS_KEY, )
    # Check the rendered head rather than head.contents (a list of parse-tree
    # nodes, which a plain string never matches), so the snippet is not
    # re-inserted every time purify() runs on the same file.
    if GOOGLE_ANALYTICS_KEY not in soup.head.renderContents():
        soup.head.insert(len(soup.head.contents), code)
    new_soup = BeautifulSoup(soup.renderContents())
    # Debug leftover: walk the cleaned document and print any remaining
    # absolute links.
    #for i in new_soup.findAll('a'):
    #    if i.has_key('href'):
    #        if i['href'][0:4] == 'http':
    #            print i['href']
    with open(filename, 'w') as out:
        out.write(new_soup.renderContents())
# Disabled helper: rewrite absolute diveintopython.net links in every saved
# page. NB: opening with 'w+' truncates the file before read(), so as written
# the body would always see an empty string; 'r+' plus a seek(0) is needed.
#def replace_url(old, new):
#    for file in os.listdir('/home/josh/programming/diveintopython'):
#        if os.path.isdir(file):
#            directory = file
#            for f in os.listdir(file):
#                if 'html' in f:
#                    with open(directory + '/' + f, 'w+') as f2:
#                        text = f2.read()
#                        f2.write(re.sub('http://diveintopython.net', 'http://www.diveintopython.net', text))
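
# A runnable variant of the disabled helper above (a sketch added here, not
# part of the original script): open with 'r+', rewrite in place, and truncate
# any leftover tail when the replacement is shorter than the original text.
def replace_url_fixed(basedir, old, new):
    for name in os.listdir(basedir):
        path = os.path.join(basedir, name)
        if os.path.isdir(path):
            for f in os.listdir(path):
                if 'html' in f:
                    with open(os.path.join(path, f), 'r+') as f2:
                        text = f2.read()
                        f2.seek(0)
                        f2.write(re.sub(old, new, text))
                        f2.truncate()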
if __name__ == '__main__':
    #scrape()  # run once first to mirror the pages listed in dip.html
    # Clean every saved HTML file; join against the base directory so this
    # works regardless of the current working directory.
    basedir = '/home/josh/programming/diveintopython.ru'
    for name in os.listdir(basedir):
        if 'html' in name:
            purify(os.path.join(basedir, name))
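
# Typical usage (an assumption from the code above; the repo documents none):
#   $ python scrape.py      # purifies every saved *.html file under basedir
# Uncomment scrape() in the main block to (re)download the pages first.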