/tsing_spider/util/pyurllib.py

https://github.com/TsingJyujing/DataSpider · Python · 196 lines · 96 code · 28 blank · 72 comment · 13 complexity · e87031c2cd0a0f321af4e3ffb6127a30 MD5 · raw file

  1. #!/bin/python
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on 2017-2-3
  5. @author: Yuan Yifan
  6. Some http client utility functions & classes
  7. """
  8. import logging
  9. import os
  10. import threading
  11. from bs4 import BeautifulSoup
  12. from tsing_spider.config import (
  13. get_request_timeout,
  14. get_xml_decoder,
  15. get_request_header,
  16. get_request_session
  17. )
# Module-level logger shared by the helpers below; it is named after this
# file's path (__file__) rather than the dotted module name.
log = logging.getLogger(__file__)
  19. def http_get(url: str, headers: dict = None):
  20. """
  21. Get raw data by URL
  22. :param headers: external headers
  23. :param url:
  24. :return:
  25. """
  26. log.debug("Trying to get url: {}".format(url))
  27. response = get_request_session().get(
  28. url,
  29. timeout=get_request_timeout(),
  30. headers=get_request_header(url, headers),
  31. verify=False,
  32. )
  33. response.raise_for_status()
  34. return response.content
  35. def http_get_soup(url: str):
  36. """
  37. Get soup
  38. :param url:
  39. :return:
  40. """
  41. return BeautifulSoup(http_get(url), get_xml_decoder()) # html.parser
  42. def __download_callback(block_download_count: int, block_size: int, file_size: int, display_name: str = "FILE"):
  43. """
  44. Display download progress for debugging
  45. :param block_download_count:
  46. :param block_size:
  47. :param file_size:
  48. :param display_name:
  49. :return:
  50. """
  51. process_percent = block_download_count * block_size * 100.0 / file_size
  52. print("%f%% of %s" % (process_percent, display_name))
  53. class LiteFileDownloader(threading.Thread):
  54. """
  55. Little data download to file
  56. """
  57. def __init__(self, image_url, filename):
  58. threading.Thread.__init__(self)
  59. self.image_url = image_url
  60. self.filename = filename
  61. self.done = 0
  62. def run(self):
  63. if not os.path.exists(self.filename): # Already downloaded
  64. data = http_get(url=self.image_url)
  65. if data is not None:
  66. with open(self.filename, 'wb') as fid:
  67. fid.write(data)
  68. class LiteDataDownloader(threading.Thread):
  69. """
  70. Little data download to RAM buffer
  71. """
  72. def __init__(self, image_url, tag):
  73. threading.Thread.__init__(self)
  74. self.image_url = image_url
  75. self.data = None
  76. self.tag = tag
  77. def run(self):
  78. self.data = http_get(url=self.image_url)
  79. def write_file(self, filename):
  80. if self.data is not None:
  81. with open(filename, 'wb') as fid:
  82. fid.write(self.data)
  83. class DownloadTask(threading.Thread):
  84. """
  85. Large file download to file
  86. """
  87. def __init__(self, url: str, filepath: str, chuck_size: int = 81920, headers: dict = None):
  88. threading.Thread.__init__(self)
  89. self.url = url
  90. self.filepath = filepath
  91. self.chuck_size = chuck_size
  92. self.downloaded_size = 0
  93. self.done = False
  94. self.__headers = get_request_header(self.url, headers)
  95. def run(self):
  96. with open(self.filepath, "wb") as fp:
  97. with get_request_session().get(
  98. self.url,
  99. stream=True,
  100. timeout=get_request_timeout(),
  101. headers=self.__headers,
  102. verify=False,
  103. ) as response:
  104. response.raise_for_status()
  105. chucks = (chuck for chuck in response.iter_content(chunk_size=self.chuck_size) if chuck)
  106. for chuck in chucks:
  107. self.downloaded_size += len(chuck)
  108. fp.write(chuck)
  109. self.done = True
  110. class LazyContent:
  111. """
  112. Lazy-loaded URL resource
  113. """
  114. def __init__(self, url: str, headers: dict = None):
  115. self._url = url
  116. self.__data = None
  117. self.__headers = get_request_header(self._url, headers)
  118. @property
  119. def content(self):
  120. """
  121. Get the content of the request
  122. :return:
  123. """
  124. if not self.is_initialized:
  125. self.set_content(http_get(self._url, self.__headers))
  126. return self.__data
  127. @property
  128. def is_initialized(self) -> bool:
  129. return self.__data is not None
  130. @property
  131. def url(self) -> str:
  132. """
  133. The only safe way to get (can't modify) url
  134. :return:
  135. """
  136. return self._url
  137. def reset_content(self):
  138. """
  139. Reset content for load again
  140. :return:
  141. """
  142. self.__data = None
  143. def set_content(self, value):
  144. """
  145. Set content manually (but not recommended call outside)
  146. :return:
  147. """
  148. self.__data = value
  149. class LazySoup(LazyContent):
  150. """
  151. Lazy-loaded URL resource, and parse by BeautifulSoup
  152. """
  153. def __init__(self, url: str, parser: str = None, headers: dict = None):
  154. self.__parser = parser if parser is not None else get_xml_decoder()
  155. self.__soup = None
  156. LazyContent.__init__(self, url, headers)
  157. @property
  158. def soup(self):
  159. if self.__soup is None:
  160. self.__soup = BeautifulSoup(self.content, self.__parser)
  161. return self.__soup