
/crawl/httputil.py

https://github.com/epigos/py-crawl
import re
import urllib2

# Patterns shared by every util instance (compiled once at import time).
hrefregex = re.compile(r'<a\shref=[\'"](.*?)[\'"]')
metaregex = re.compile(r'<meta\s(.*?)=[\'"](.*?)[\'"]\s(.*?)=[\'"](.*?)[\'"]')
contentregex = re.compile(r'content=[\'"](.*?)[\'"]')
# Match an http(s) URL, skip an optional "www.", and capture the domain
# name up to a trailing ".com".
httpdomainregex = re.compile(r'https?://(?:www\.)?(.*?)\.com')
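# Illustration, using a hypothetical href not taken from any real page:
# httpdomainregex.findall('<a href="http://www.foo.com/page">') yields
# ['foo'], which pruneduplicatelinks() below rebuilds as 'http://foo.com'.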

class util:
    def __init__(self):
        # Reuse the module-level patterns instead of recompiling them.
        self.hrefregex = hrefregex
        self.metaregex = metaregex
        self.contentregex = contentregex
        self.httpdomainregex = httpdomainregex

    def getmeta(self, url):
        """Return a dict of <meta> attribute pairs found at url."""
        keyvalue = {}
        try:
            response = urllib2.urlopen(url)
            for line in response:
                # Each match is (attr1, value1, attr2, value2); key the
                # second attribute's value by the first's, e.g.
                # <meta name="description" content="..."> becomes
                # keyvalue['description'] = '...'
                for match in self.metaregex.findall(line.rstrip()):
                    keyvalue[match[1]] = match[3]
        except Exception as e:
            print e
        return keyvalue

    def getlinks(self, url):
        """Fetch url and return the unique domains it links to."""
        links = []
        try:
            response = urllib2.urlopen(url)
            for line in response:
                links.extend(self.hrefregex.findall(line.rstrip()))
        except Exception as e:
            print e
        return self.pruneduplicatelinks(links)

    def pruneduplicatelinks(self, links):
        """Collapse raw hrefs into a set of unique domain URLs."""
        uniquedomains = set()
        for link in links:
            for domain in self.httpdomainregex.findall(link):
                # temp-fix: prepend and append http and com for now
                uniquedomains.add("http://" + domain + ".com")
        return uniquedomains
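
For reference, a minimal usage sketch. It assumes the file above is importable as httputil and runs under Python 2 (urllib2 is Python-2-only); http://example.com is a placeholder URL, not one the project itself crawls.

import httputil

u = httputil.util()

# Unique domains linked from the page, as "http://<name>.com" strings.
for domain in u.getlinks("http://example.com"):
    print domain

# <meta> attribute pairs, e.g. {'description': '...'}.
for name, value in u.getmeta("http://example.com").items():
    print name, value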