PageRenderTime 52ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/crunchy-xml-decoder/wget.py

https://gitlab.com/eientei95/crunchy-xml-decoder
Python | 399 lines | 358 code | 10 blank | 31 comment | 16 complexity | 2cdbfbb5cba667dd6dd8f018c1b9fd23 MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. Download utility as an easy way to get file from the net
  4. python -m wget <URL>
  5. python wget.py <URL>
  6. Downloads: http://pypi.python.org/pypi/wget/
  7. Development: http://bitbucket.org/techtonik/python-wget/
  8. wget.py is not option compatible with Unix wget utility,
  9. to make command line interface intuitive for new people.
  10. Public domain by anatoly techtonik <techtonik@gmail.com>
  11. Also available under the terms of MIT license
  12. Copyright (c) 2010-2014 anatoly techtonik
  13. """
  14. import sys, shutil, os
  15. import tempfile
  16. import math
  17. PY3K = sys.version_info >= (3, 0)
  18. if PY3K:
  19. import urllib.request as urllib
  20. import urllib.parse as urlparse
  21. else:
  22. import urllib
  23. import urlparse
  24. __version__ = "2.2"
  25. def filename_from_url(url):
  26. """:return: detected filename or None"""
  27. fname = os.path.basename(urlparse.urlparse(url).path)
  28. if len(fname.strip(" \n\t.")) == 0:
  29. return None
  30. return fname
  31. def filename_from_headers(headers):
  32. """Detect filename from Content-Disposition headers if present.
  33. http://greenbytes.de/tech/tc2231/
  34. :param: headers as dict, list or string
  35. :return: filename from content-disposition header or None
  36. """
  37. if type(headers) == str:
  38. headers = headers.splitlines()
  39. if type(headers) == list:
  40. headers = dict([x.split(':', 1) for x in headers])
  41. cdisp = headers.get("Content-Disposition")
  42. if not cdisp:
  43. return None
  44. cdtype = cdisp.split(';')
  45. if len(cdtype) == 1:
  46. return None
  47. if cdtype[0].strip().lower() not in ('inline', 'attachment'):
  48. return None
  49. # several filename params is illegal, but just in case
  50. fnames = [x for x in cdtype[1:] if x.strip().startswith('filename=')]
  51. if len(fnames) > 1:
  52. return None
  53. name = fnames[0].split('=')[1].strip(' \t"')
  54. name = os.path.basename(name)
  55. if not name:
  56. return None
  57. return name
  58. def filename_fix_existing(filename):
  59. """Expands name portion of filename with numeric ' (x)' suffix to
  60. return filename that doesn't exist already.
  61. """
  62. dirname = '.'
  63. name, ext = filename.rsplit('.', 1)
  64. names = [x for x in os.listdir(dirname) if x.startswith(name)]
  65. names = [x.rsplit('.', 1)[0] for x in names]
  66. suffixes = [x.replace(name, '') for x in names]
  67. # filter suffixes that match ' (x)' pattern
  68. suffixes = [x[2:-1] for x in suffixes
  69. if x.startswith(' (') and x.endswith(')')]
  70. indexes = [int(x) for x in suffixes
  71. if set(x) <= set('0123456789')]
  72. idx = 1
  73. if indexes:
  74. idx += sorted(indexes)[-1]
  75. return '%s (%d).%s' % (name, idx, ext)
  76. # --- terminal/console output helpers ---
  77. def get_console_width():
  78. """Return width of available window area. Autodetection works for
  79. Windows and POSIX platforms. Returns 80 for others
  80. Code from http://bitbucket.org/techtonik/python-pager
  81. """
  82. if os.name == 'nt':
  83. STD_INPUT_HANDLE = -10
  84. STD_OUTPUT_HANDLE = -11
  85. STD_ERROR_HANDLE = -12
  86. # get console handle
  87. from ctypes import windll, Structure, byref
  88. try:
  89. from ctypes.wintypes import SHORT, WORD, DWORD
  90. except ImportError:
  91. # workaround for missing types in Python 2.5
  92. from ctypes import (
  93. c_short as SHORT, c_ushort as WORD, c_ulong as DWORD)
  94. console_handle = windll.kernel32.GetStdHandle(STD_OUTPUT_HANDLE)
  95. # CONSOLE_SCREEN_BUFFER_INFO Structure
  96. class COORD(Structure):
  97. _fields_ = [("X", SHORT), ("Y", SHORT)]
  98. class SMALL_RECT(Structure):
  99. _fields_ = [("Left", SHORT), ("Top", SHORT),
  100. ("Right", SHORT), ("Bottom", SHORT)]
  101. class CONSOLE_SCREEN_BUFFER_INFO(Structure):
  102. _fields_ = [("dwSize", COORD),
  103. ("dwCursorPosition", COORD),
  104. ("wAttributes", WORD),
  105. ("srWindow", SMALL_RECT),
  106. ("dwMaximumWindowSize", DWORD)]
  107. sbi = CONSOLE_SCREEN_BUFFER_INFO()
  108. ret = windll.kernel32.GetConsoleScreenBufferInfo(console_handle, byref(sbi))
  109. if ret == 0:
  110. return 0
  111. return sbi.srWindow.Right+1
  112. elif os.name == 'posix':
  113. from fcntl import ioctl
  114. from termios import TIOCGWINSZ
  115. from array import array
  116. winsize = array("H", [0] * 4)
  117. try:
  118. ioctl(sys.stdout.fileno(), TIOCGWINSZ, winsize)
  119. except IOError:
  120. pass
  121. return (winsize[1], winsize[0])[0]
  122. return 80
  123. def bar_thermometer(current, total, width=80):
  124. """Return thermometer style progress bar string. `total` argument
  125. can not be zero. The minimum size of bar returned is 3. Example:
  126. [.......... ]
  127. Control and trailing symbols (\r and spaces) are not included.
  128. See `bar_adaptive` for more information.
  129. """
  130. # number of dots on thermometer scale
  131. avail_dots = width-2
  132. shaded_dots = int(math.floor(float(current) / total * avail_dots))
  133. return '[' + '.'*shaded_dots + ' '*(avail_dots-shaded_dots) + ']'
  134. def bar_adaptive(current, total, width=80):
  135. """Return progress bar string for given values in one of three
  136. styles depending on available width:
  137. [.. ] downloaded / total
  138. downloaded / total
  139. [.. ]
  140. if total value is unknown or <= 0, show bytes counter using two
  141. adaptive styles:
  142. %s / unknown
  143. %s
  144. if there is not enough space on the screen, do not display anything
  145. returned string doesn't include control characters like \r used to
  146. place cursor at the beginning of the line to erase previous content.
  147. this function leaves one free character at the end of string to
  148. avoid automatic linefeed on Windows.
  149. """
  150. # process special case when total size is unknown and return immediately
  151. if not total or total < 0:
  152. msg = "%s / unknown" % current
  153. if len(msg) < width: # leaves one character to avoid linefeed
  154. return msg
  155. if len("%s" % current) < width:
  156. return "%s" % current
  157. # --- adaptive layout algorithm ---
  158. #
  159. # [x] describe the format of the progress bar
  160. # [x] describe min width for each data field
  161. # [x] set priorities for each element
  162. # [x] select elements to be shown
  163. # [x] choose top priority element min_width < avail_width
  164. # [x] lessen avail_width by value if min_width
  165. # [x] exclude element from priority list and repeat
  166. # 10% [.. ] 10/100
  167. # pppp bbbbb sssssss
  168. min_width = {
  169. 'percent': 4, # 100%
  170. 'bar': 3, # [.]
  171. 'size': len("%s" % total)*2 + 3, # 'xxxx / yyyy'
  172. }
  173. priority = ['percent', 'bar', 'size']
  174. # select elements to show
  175. selected = []
  176. avail = width
  177. for field in priority:
  178. if min_width[field] < avail:
  179. selected.append(field)
  180. avail -= min_width[field]+1 # +1 is for separator or for reserved space at
  181. # the end of line to avoid linefeed on Windows
  182. # render
  183. output = ''
  184. for field in selected:
  185. if field == 'percent':
  186. # fixed size width for percentage
  187. output += ('%s%%' % (100 * current // total)).rjust(min_width['percent'])
  188. elif field == 'bar': # [. ]
  189. # bar takes its min width + all available space
  190. output += bar_thermometer(current, total, min_width['bar']+avail)
  191. elif field == 'size':
  192. # size field has a constant width (min == max)
  193. output += ("%s / %s" % (current, total)).rjust(min_width['size'])
  194. selected = selected[1:]
  195. if selected:
  196. output += ' ' # add field separator
  197. return output
  198. # --/ console helpers
  199. __current_size = 0 # global state variable, which exists solely as a
  200. # workaround against Python 3.3.0 regression
  201. # http://bugs.python.org/issue16409
  202. # fixed in Python 3.3.1
  203. def callback_progress(blocks, block_size, total_size, bar_function):
  204. """callback function for urlretrieve that is called when connection is
  205. created and when once for each block
  206. draws adaptive progress bar in terminal/console
  207. use sys.stdout.write() instead of "print,", because it allows one more
  208. symbol at the line end without linefeed on Windows
  209. :param blocks: number of blocks transferred so far
  210. :param block_size: in bytes
  211. :param total_size: in bytes, can be -1 if server doesn't return it
  212. :param bar_function: another callback function to visualize progress
  213. """
  214. global __current_size
  215. width = min(100, get_console_width())
  216. if sys.version_info[:3] == (3, 3, 0): # regression workaround
  217. if blocks == 0: # first call
  218. __current_size = 0
  219. else:
  220. __current_size += block_size
  221. current_size = __current_size
  222. else:
  223. current_size = min(blocks*block_size, total_size)
  224. progress = bar_function(current_size, total_size, width)
  225. if progress:
  226. sys.stdout.write("\r" + progress)
  227. def download(url, out=None, bar=bar_adaptive):
  228. """High level function, which downloads URL into tmp file in current
  229. directory and then renames it to filename autodetected from either URL
  230. or HTTP headers.
  231. :param bar: function to track download progress (visualize etc.)
  232. :param out: output filename or directory
  233. :return: filename where URL is downloaded to
  234. """
  235. names = dict()
  236. names["out"] = out or ''
  237. names["url"] = filename_from_url(url)
  238. # get filename for temp file in current directory
  239. prefix = (names["url"] or names["out"] or ".") + "."
  240. (fd, tmpfile) = tempfile.mkstemp(".tmp", prefix=prefix, dir=".")
  241. os.close(fd)
  242. os.unlink(tmpfile)
  243. # set progress monitoring callback
  244. def callback_charged(blocks, block_size, total_size):
  245. # 'closure' to set bar drawing function in callback
  246. callback_progress(blocks, block_size, total_size, bar_function=bar)
  247. if bar:
  248. callback = callback_charged
  249. else:
  250. callback = None
  251. (tmpfile, headers) = urllib.urlretrieve(url, tmpfile, callback)
  252. names["header"] = filename_from_headers(headers)
  253. if os.path.isdir(names["out"]):
  254. filename = names["header"] or names["url"]
  255. filename = names["out"] + "/" + filename
  256. else:
  257. filename = names["out"] or names["header"] or names["url"]
  258. # add numeric ' (x)' suffix if filename already exists
  259. if os.path.exists(filename):
  260. filename = filename_fix_existing(filename)
  261. shutil.move(tmpfile, filename)
  262. #print headers
  263. return filename
  264. usage = """\
  265. usage: wget.py [options] URL
  266. options:
  267. -o --output FILE|DIR output filename or directory
  268. -h --help
  269. --version
  270. """
  271. if __name__ == "__main__":
  272. if len(sys.argv) < 2 or "-h" in sys.argv or "--help" in sys.argv:
  273. sys.exit(usage)
  274. if "--version" in sys.argv:
  275. sys.exit("wget.py " + __version__)
  276. from optparse import OptionParser
  277. parser = OptionParser()
  278. parser.add_option("-o", "--output", dest="output")
  279. (options, args) = parser.parse_args()
  280. url = sys.argv[1]
  281. filename = download(args[0], out=options.output)
  282. print("")
  283. print("Saved under %s" % filename)
  284. r"""
  285. features that require more tuits for urlretrieve API
  286. http://www.python.org/doc/2.6/library/urllib.html#urllib.urlretrieve
  287. [x] autodetect filename from URL
  288. [x] autodetect filename from headers - Content-Disposition
  289. http://greenbytes.de/tech/tc2231/
  290. [ ] make HEAD request to detect temp filename from Content-Disposition
  291. [ ] process HTTP status codes (i.e. 404 error)
  292. http://ftp.de.debian.org/debian/pool/iso-codes_3.24.2.orig.tar.bz2
  293. [ ] catch KeyboardInterrupt
  294. [ ] optionally preserve incomplete file
  295. [x] create temp file in current directory
  296. [ ] resume download (broken connection)
  297. [ ] resume download (incomplete file)
  298. [x] show progress indicator
  299. http://mail.python.org/pipermail/tutor/2005-May/038797.html
  300. [x] do not overwrite downloaded file
  301. [x] rename file automatically if exists
  302. [x] optionally specify path for downloaded file
  303. [ ] options plan
  304. [x] -h, --help, --version (CHAOS speccy)
  305. [ ] clpbar progress bar style
  306. _ 30.0Mb at 3.0 Mbps eta: 0:00:20 30% [===== ]
  307. [ ] test "bar \r" print with \r at the end of line on Windows
  308. [ ] process Python 2.x urllib.ContentTooShortError exception gracefully
  309. (ideally retry and continue download)
  310. (tmpfile, headers) = urllib.urlretrieve(url, tmpfile, callback_progress)
  311. File "C:\Python27\lib\urllib.py", line 93, in urlretrieve
  312. return _urlopener.retrieve(url, filename, reporthook, data)
  313. File "C:\Python27\lib\urllib.py", line 283, in retrieve
  314. "of %i bytes" % (read, size), result)
  315. urllib.ContentTooShortError: retrieval incomplete: got only 15239952 out of 24807571 bytes
  316. [ ] find out if urlretrieve may return unicode headers
  317. [ ] test suite for unsafe filenames from url and from headers
  318. [ ] security checks
  319. [ ] filename_from_url
  320. [ ] filename_from_headers
  321. [ ] MITM redirect from https URL
  322. [ ] https certificate check
  323. [ ] size+hash check helpers
  324. [ ] fail if size is known and mismatch
  325. [ ] fail if hash mismatch
  326. """