/parseDelscrHTM-1.py

https://github.com/gtani7/pyrb--python-scrape-spider · Python · 61 lines · 51 code · 5 blank · 5 comment · 2 complexity · 53a011922bc9b3dac237d87116b89de1 MD5 · raw file

  1. #!/usr/bin/env python
  2. import sys
  3. class Mag_BSoup(object):
  4. urlsExtracted=[]
  5. allh3tags=[];
  6. # import re
  7. # url_re=re.compile(r'<a href="(.*)" onmousedown=')
  8. def __init__(self,mag_file): # mag_file is string, not file obj
  9. from BeautifulSoup import BeautifulSoup
  10. soup = BeautifulSoup(mag_file)
  11. self.__class__.allh3tags=soup('h3') # shd be 13, 1st 10 are bookmark tags, last 3 are spurious
  12. self.__class__.allh3tags=self.__class__.allh3tags[0:10]
  13. def selecth3tag(self):
  14. for thish3tag in self.__class__.allh3tags:
  15. # urlextracted=self.__class__.url_re.match(string(h3tag) ) #TypeErr: tag is not str, can't be fed to regex
  16. print thish3tag.aTag['href']
  17. self.__class__.urlsExtracted.append(thish3tag.aTag.href)
  18. ####### Reddit ###########
  19. class Red_BSoup(object):
  20. allcolspan3=[]
  21. urlsExtracted=[]
  22. def __init__(self,red_file):
  23. from BeautifulSoup import BeautifulSoup
  24. soup = BeautifulSoup(red_file)
  25. self.__class__.allcolspan3=soup.findAll(name='td',attrs={colspan:"3"})
  26. def build_dict(filename):
  27. ''' it's just too annoying trying to get Python SYCK or other YAML parsers to work'''
  28. import re
  29. blank_line=r'--- ' ## shd only be 1st line
  30. blankline_re=re.compile(blank_line)
  31. dict2return={}
  32. for inp_line in open(filename, "r"):
  33. if blankline_re.match(inp_line): continue
  34. (url, numoccur)=inp_line.split(r': ')
  35. dict2return[url]=int(numoccur)
  36. return dict2return
  37. if __name__ == "__main__":
  38. import os,fileinput, glob
  39. [pyurls,rburls,railsurls]=map(build_dict, ["pyurls.yml","rburls.yml","railsurls.yml"])
  40. sys.stdout.write("Len of py, rb URLs dicts: " +repr(len(pyurls.keys()))+", "+repr(len(rburls.keys()))+"\n")
  41. sys.stdout.write("Len of rails urls dict: " +repr(len(railsurls.keys()))+"\n")
  42. # pyurls=pyurls.keys(); rburls=rburls.keys(); railsurls=railsurls.keys();
  43. allurls=pyurls.keys() + rburls.keys() + railsurls.keys()
  44. ## EDIT "cwd" as neces
  45. # os.chdir("C:\\kacc\\pyrb\\magnol_pages")
  46. os.chdir("C:\\kacc\\pyrb\\reddit_pages")
  47. for magfilename in glob.glob("rub*.htm*"): ## EDIT filename glob as neces
  48. magfilehand=open(magfilename)
  49. ent_magfile=magfilehand.read()
  50. magfileparsed=Mag_BSoup(ent_magfile)
  51. magfileparsed.selecth3tag()