/6.py

https://github.com/KenjiTakahashi/kurs-Python
Python | 141 lines | 133 code | 7 blank | 1 comment | 40 complexity | 3068cf2fcab57f963ae3c6456ace6500 MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. import html.parser
  3. import os
  4. import urllib.request
  5. from urllib.error import URLError
  6. from threading import Thread, RLock
  7. from timeit import Timer
  8. class ActivityBackend(html.parser.HTMLParser):
  9. def __init__(self, directory, rlock, results):
  10. self.directory = directory
  11. self.results = results
  12. self.rlock = rlock
  13. super(type(self), self).__init__(self)
  14. def handle_starttag(self, tag, attrs):
  15. if tag == 'a' or tag == 'img':
  16. for (attr, value) in attrs:
  17. if attr == 'href' or attr == 'src':
  18. if '://' in value:
  19. try:
  20. urllib.request.urlopen(value)
  21. except URLError:
  22. with self.rlock:
  23. self.results.add((value, False))
  24. else:
  25. with self.rlock:
  26. self.results.add((value, True))
  27. else:
  28. try:
  29. open(self.directory + value)
  30. except IOError as e:
  31. if e.errno == 21:
  32. with self.rlock:
  33. self.results.add((value, True))
  34. else:
  35. with self.rlock:
  36. self.results.add((value, False))
  37. else:
  38. with self.rlock:
  39. self.results.add((value, True))
  40. class ActivityChecker(object):
  41. def __init__(self, directory):
  42. self.directory = directory
  43. self.rlock = RLock()
  44. self.results = set()
  45. def check(self):
  46. threads = list()
  47. for root, _, filenames in os.walk(self.directory):
  48. for filename in filenames:
  49. if os.path.splitext(filename)[1] == '.html':
  50. with open(os.path.join(root, filename)) as data:
  51. thread = Thread(
  52. target = ActivityBackend(
  53. self.directory,
  54. self.rlock,
  55. self.results
  56. ).feed,
  57. args = (data.read(),)
  58. )
  59. threads.append(thread)
  60. thread.start()
  61. for t in threads:
  62. t.join()
  63. return self.results
  64. def pretty_print(self):
  65. const = 0
  66. for (lnk, _) in self.results:
  67. if len(lnk) > const:
  68. const = len(lnk)
  69. const += 10
  70. print("<link>{0}".format("<status>".rjust(const + 2)))
  71. for (lnk, status) in self.results:
  72. print("{0}{1}".format(
  73. lnk, str(status).rjust(const - len(lnk) + len(str(status)))
  74. ))
# Script section: check the links of the generated site under the user's
# home directory, time the check, then print the collected statuses.
ac = ActivityChecker(os.path.expanduser('~/kenjitakahashi.github.com/_site'))
t = Timer(ac.check)
# timeit(4) calls ac.check four times and returns the total elapsed seconds.
print(t.timeit(4))
ac.pretty_print()
  79. class ReferenceBackend(html.parser.HTMLParser):
  80. def __init__(self, rlock, results, curfile):
  81. self.rlock = rlock
  82. self.results = results
  83. self.curfile = curfile
  84. super(type(self), self).__init__(self)
  85. def handle_starttag(self, tag, attrs):
  86. if tag == 'a':
  87. for (attr, value) in attrs:
  88. if attr == 'href':
  89. with self.rlock:
  90. self.results.setdefault(
  91. value, {self.curfile}
  92. ).add(self.curfile)
  93. class ReferenceChecker(object):
  94. def __init__(self):
  95. self.rlock = RLock()
  96. self.results = dict()
  97. def check(self, directory):
  98. threads = list()
  99. for root, _, filenames in os.walk(directory):
  100. for filename in filenames:
  101. curfile = os.path.join(
  102. os.path.relpath(root, directory), filename
  103. )
  104. with open(os.path.join(root, filename)) as data:
  105. thread = Thread(
  106. target = ReferenceBackend(
  107. self.rlock,
  108. self.results,
  109. curfile
  110. ).feed,
  111. args = (data.read(),)
  112. )
  113. threads.append(thread)
  114. thread.start()
  115. for t in threads:
  116. t.join()
  117. return self.results
  118. def pretty_print(self):
  119. const = 0
  120. for k in self.results.keys():
  121. if len(k) > const:
  122. const = len(k)
  123. const += 10
  124. print("<link>{0}".format("<references>".rjust(const + 6)))
  125. for k, v in self.results.items():
  126. item = v.pop()
  127. print("{0}{1}".format(k, item.rjust(const - len(k) + len(item))))
  128. while v:
  129. item = v.pop()
  130. print(item.rjust(const + len(item)))
# Script section: collect the link references of the generated site under
# the user's home directory, time the collection, then print the table.
rc = ReferenceChecker()
# The lambda binds the directory argument so Timer gets a zero-argument callable.
t = Timer(lambda : rc.check(os.path.expanduser('~/kenjitakahashi.github.com/_site')))
# timeit(4) calls rc.check four times and returns the total elapsed seconds.
print(t.timeit(4))
rc.pretty_print()