PageRenderTime 53ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/lib-python/2.7/csv.py

https://bitbucket.org/quangquach/pypy
Python | 451 lines | 431 code | 6 blank | 14 comment | 4 complexity | 200315fd373c47e96562264ce04fb905 MD5 | raw file
  1. """
  2. csv.py - read/write/investigate CSV files
  3. """
  4. import re
  5. from functools import reduce
  6. from _csv import Error, __version__, writer, reader, register_dialect, \
  7. unregister_dialect, get_dialect, list_dialects, \
  8. field_size_limit, \
  9. QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
  10. __doc__
  11. from _csv import Dialect as _Dialect
  12. try:
  13. from cStringIO import StringIO
  14. except ImportError:
  15. from StringIO import StringIO
  16. __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
  17. "Error", "Dialect", "__doc__", "excel", "excel_tab",
  18. "field_size_limit", "reader", "writer",
  19. "register_dialect", "get_dialect", "list_dialects", "Sniffer",
  20. "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
  21. class Dialect:
  22. """Describe an Excel dialect.
  23. This must be subclassed (see csv.excel). Valid attributes are:
  24. delimiter, quotechar, escapechar, doublequote, skipinitialspace,
  25. lineterminator, quoting.
  26. """
  27. _name = ""
  28. _valid = False
  29. # placeholders
  30. delimiter = None
  31. quotechar = None
  32. escapechar = None
  33. doublequote = None
  34. skipinitialspace = None
  35. lineterminator = None
  36. quoting = None
  37. def __init__(self):
  38. if self.__class__ != Dialect:
  39. self._valid = True
  40. self._validate()
  41. def _validate(self):
  42. try:
  43. _Dialect(self)
  44. except TypeError, e:
  45. # We do this for compatibility with py2.3
  46. raise Error(str(e))
  47. class excel(Dialect):
  48. """Describe the usual properties of Excel-generated CSV files."""
  49. delimiter = ','
  50. quotechar = '"'
  51. doublequote = True
  52. skipinitialspace = False
  53. lineterminator = '\r\n'
  54. quoting = QUOTE_MINIMAL
  55. register_dialect("excel", excel)
  56. class excel_tab(excel):
  57. """Describe the usual properties of Excel-generated TAB-delimited files."""
  58. delimiter = '\t'
  59. register_dialect("excel-tab", excel_tab)
  60. class DictReader:
  61. def __init__(self, f, fieldnames=None, restkey=None, restval=None,
  62. dialect="excel", *args, **kwds):
  63. self._fieldnames = fieldnames # list of keys for the dict
  64. self.restkey = restkey # key to catch long rows
  65. self.restval = restval # default value for short rows
  66. self.reader = reader(f, dialect, *args, **kwds)
  67. self.dialect = dialect
  68. self.line_num = 0
  69. def __iter__(self):
  70. return self
  71. @property
  72. def fieldnames(self):
  73. if self._fieldnames is None:
  74. try:
  75. self._fieldnames = self.reader.next()
  76. except StopIteration:
  77. pass
  78. self.line_num = self.reader.line_num
  79. return self._fieldnames
  80. @fieldnames.setter
  81. def fieldnames(self, value):
  82. self._fieldnames = value
  83. def next(self):
  84. if self.line_num == 0:
  85. # Used only for its side effect.
  86. self.fieldnames
  87. row = self.reader.next()
  88. self.line_num = self.reader.line_num
  89. # unlike the basic reader, we prefer not to return blanks,
  90. # because we will typically wind up with a dict full of None
  91. # values
  92. while row == []:
  93. row = self.reader.next()
  94. d = dict(zip(self.fieldnames, row))
  95. lf = len(self.fieldnames)
  96. lr = len(row)
  97. if lf < lr:
  98. d[self.restkey] = row[lf:]
  99. elif lf > lr:
  100. for key in self.fieldnames[lr:]:
  101. d[key] = self.restval
  102. return d
  103. class DictWriter:
  104. def __init__(self, f, fieldnames, restval="", extrasaction="raise",
  105. dialect="excel", *args, **kwds):
  106. self.fieldnames = fieldnames # list of keys for the dict
  107. self.restval = restval # for writing short dicts
  108. if extrasaction.lower() not in ("raise", "ignore"):
  109. raise ValueError, \
  110. ("extrasaction (%s) must be 'raise' or 'ignore'" %
  111. extrasaction)
  112. self.extrasaction = extrasaction
  113. self.writer = writer(f, dialect, *args, **kwds)
  114. def writeheader(self):
  115. header = dict(zip(self.fieldnames, self.fieldnames))
  116. self.writerow(header)
  117. def _dict_to_list(self, rowdict):
  118. if self.extrasaction == "raise":
  119. wrong_fields = [k for k in rowdict if k not in self.fieldnames]
  120. if wrong_fields:
  121. raise ValueError("dict contains fields not in fieldnames: " +
  122. ", ".join(wrong_fields))
  123. return [rowdict.get(key, self.restval) for key in self.fieldnames]
  124. def writerow(self, rowdict):
  125. return self.writer.writerow(self._dict_to_list(rowdict))
  126. def writerows(self, rowdicts):
  127. rows = []
  128. for rowdict in rowdicts:
  129. rows.append(self._dict_to_list(rowdict))
  130. return self.writer.writerows(rows)
  131. # Guard Sniffer's type checking against builds that exclude complex()
  132. try:
  133. complex
  134. except NameError:
  135. complex = float
  136. class Sniffer:
  137. '''
  138. "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
  139. Returns a Dialect object.
  140. '''
  141. def __init__(self):
  142. # in case there is more than one possible delimiter
  143. self.preferred = [',', '\t', ';', ' ', ':']
  144. def sniff(self, sample, delimiters=None):
  145. """
  146. Returns a dialect (or None) corresponding to the sample
  147. """
  148. quotechar, doublequote, delimiter, skipinitialspace = \
  149. self._guess_quote_and_delimiter(sample, delimiters)
  150. if not delimiter:
  151. delimiter, skipinitialspace = self._guess_delimiter(sample,
  152. delimiters)
  153. if not delimiter:
  154. raise Error, "Could not determine delimiter"
  155. class dialect(Dialect):
  156. _name = "sniffed"
  157. lineterminator = '\r\n'
  158. quoting = QUOTE_MINIMAL
  159. # escapechar = ''
  160. dialect.doublequote = doublequote
  161. dialect.delimiter = delimiter
  162. # _csv.reader won't accept a quotechar of ''
  163. dialect.quotechar = quotechar or '"'
  164. dialect.skipinitialspace = skipinitialspace
  165. return dialect
  166. def _guess_quote_and_delimiter(self, data, delimiters):
  167. """
  168. Looks for text enclosed between two identical quotes
  169. (the probable quotechar) which are preceded and followed
  170. by the same character (the probable delimiter).
  171. For example:
  172. ,'some text',
  173. The quote with the most wins, same with the delimiter.
  174. If there is no quotechar the delimiter can't be determined
  175. this way.
  176. """
  177. matches = []
  178. for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
  179. '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
  180. '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
  181. '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
  182. regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
  183. matches = regexp.findall(data)
  184. if matches:
  185. break
  186. if not matches:
  187. # (quotechar, doublequote, delimiter, skipinitialspace)
  188. return ('', False, None, 0)
  189. quotes = {}
  190. delims = {}
  191. spaces = 0
  192. for m in matches:
  193. n = regexp.groupindex['quote'] - 1
  194. key = m[n]
  195. if key:
  196. quotes[key] = quotes.get(key, 0) + 1
  197. try:
  198. n = regexp.groupindex['delim'] - 1
  199. key = m[n]
  200. except KeyError:
  201. continue
  202. if key and (delimiters is None or key in delimiters):
  203. delims[key] = delims.get(key, 0) + 1
  204. try:
  205. n = regexp.groupindex['space'] - 1
  206. except KeyError:
  207. continue
  208. if m[n]:
  209. spaces += 1
  210. quotechar = reduce(lambda a, b, quotes = quotes:
  211. (quotes[a] > quotes[b]) and a or b, quotes.keys())
  212. if delims:
  213. delim = reduce(lambda a, b, delims = delims:
  214. (delims[a] > delims[b]) and a or b, delims.keys())
  215. skipinitialspace = delims[delim] == spaces
  216. if delim == '\n': # most likely a file with a single column
  217. delim = ''
  218. else:
  219. # there is *no* delimiter, it's a single column of quoted data
  220. delim = ''
  221. skipinitialspace = 0
  222. # if we see an extra quote between delimiters, we've got a
  223. # double quoted format
  224. dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
  225. {'delim':delim, 'quote':quotechar}, re.MULTILINE)
  226. if dq_regexp.search(data):
  227. doublequote = True
  228. else:
  229. doublequote = False
  230. return (quotechar, doublequote, delim, skipinitialspace)
  231. def _guess_delimiter(self, data, delimiters):
  232. """
  233. The delimiter /should/ occur the same number of times on
  234. each row. However, due to malformed data, it may not. We don't want
  235. an all or nothing approach, so we allow for small variations in this
  236. number.
  237. 1) build a table of the frequency of each character on every line.
  238. 2) build a table of frequencies of this frequency (meta-frequency?),
  239. e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
  240. 7 times in 2 rows'
  241. 3) use the mode of the meta-frequency to determine the /expected/
  242. frequency for that character
  243. 4) find out how often the character actually meets that goal
  244. 5) the character that best meets its goal is the delimiter
  245. For performance reasons, the data is evaluated in chunks, so it can
  246. try and evaluate the smallest portion of the data possible, evaluating
  247. additional chunks as necessary.
  248. """
  249. data = filter(None, data.split('\n'))
  250. ascii = [chr(c) for c in range(127)] # 7-bit ASCII
  251. # build frequency tables
  252. chunkLength = min(10, len(data))
  253. iteration = 0
  254. charFrequency = {}
  255. modes = {}
  256. delims = {}
  257. start, end = 0, min(chunkLength, len(data))
  258. while start < len(data):
  259. iteration += 1
  260. for line in data[start:end]:
  261. for char in ascii:
  262. metaFrequency = charFrequency.get(char, {})
  263. # must count even if frequency is 0
  264. freq = line.count(char)
  265. # value is the mode
  266. metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
  267. charFrequency[char] = metaFrequency
  268. for char in charFrequency.keys():
  269. items = charFrequency[char].items()
  270. if len(items) == 1 and items[0][0] == 0:
  271. continue
  272. # get the mode of the frequencies
  273. if len(items) > 1:
  274. modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
  275. items)
  276. # adjust the mode - subtract the sum of all
  277. # other frequencies
  278. items.remove(modes[char])
  279. modes[char] = (modes[char][0], modes[char][1]
  280. - reduce(lambda a, b: (0, a[1] + b[1]),
  281. items)[1])
  282. else:
  283. modes[char] = items[0]
  284. # build a list of possible delimiters
  285. modeList = modes.items()
  286. total = float(chunkLength * iteration)
  287. # (rows of consistent data) / (number of rows) = 100%
  288. consistency = 1.0
  289. # minimum consistency threshold
  290. threshold = 0.9
  291. while len(delims) == 0 and consistency >= threshold:
  292. for k, v in modeList:
  293. if v[0] > 0 and v[1] > 0:
  294. if ((v[1]/total) >= consistency and
  295. (delimiters is None or k in delimiters)):
  296. delims[k] = v
  297. consistency -= 0.01
  298. if len(delims) == 1:
  299. delim = delims.keys()[0]
  300. skipinitialspace = (data[0].count(delim) ==
  301. data[0].count("%c " % delim))
  302. return (delim, skipinitialspace)
  303. # analyze another chunkLength lines
  304. start = end
  305. end += chunkLength
  306. if not delims:
  307. return ('', 0)
  308. # if there's more than one, fall back to a 'preferred' list
  309. if len(delims) > 1:
  310. for d in self.preferred:
  311. if d in delims.keys():
  312. skipinitialspace = (data[0].count(d) ==
  313. data[0].count("%c " % d))
  314. return (d, skipinitialspace)
  315. # nothing else indicates a preference, pick the character that
  316. # dominates(?)
  317. items = [(v,k) for (k,v) in delims.items()]
  318. items.sort()
  319. delim = items[-1][1]
  320. skipinitialspace = (data[0].count(delim) ==
  321. data[0].count("%c " % delim))
  322. return (delim, skipinitialspace)
  323. def has_header(self, sample):
  324. # Creates a dictionary of types of data in each column. If any
  325. # column is of a single type (say, integers), *except* for the first
  326. # row, then the first row is presumed to be labels. If the type
  327. # can't be determined, it is assumed to be a string in which case
  328. # the length of the string is the determining factor: if all of the
  329. # rows except for the first are the same length, it's a header.
  330. # Finally, a 'vote' is taken at the end for each column, adding or
  331. # subtracting from the likelihood of the first row being a header.
  332. rdr = reader(StringIO(sample), self.sniff(sample))
  333. header = rdr.next() # assume first row is header
  334. columns = len(header)
  335. columnTypes = {}
  336. for i in range(columns): columnTypes[i] = None
  337. checked = 0
  338. for row in rdr:
  339. # arbitrary number of rows to check, to keep it sane
  340. if checked > 20:
  341. break
  342. checked += 1
  343. if len(row) != columns:
  344. continue # skip rows that have irregular number of columns
  345. for col in columnTypes.keys():
  346. for thisType in [int, long, float, complex]:
  347. try:
  348. thisType(row[col])
  349. break
  350. except (ValueError, OverflowError):
  351. pass
  352. else:
  353. # fallback to length of string
  354. thisType = len(row[col])
  355. # treat longs as ints
  356. if thisType == long:
  357. thisType = int
  358. if thisType != columnTypes[col]:
  359. if columnTypes[col] is None: # add new column type
  360. columnTypes[col] = thisType
  361. else:
  362. # type is inconsistent, remove column from
  363. # consideration
  364. del columnTypes[col]
  365. # finally, compare results against first row and "vote"
  366. # on whether it's a header
  367. hasHeader = 0
  368. for col, colType in columnTypes.items():
  369. if type(colType) == type(0): # it's a length
  370. if len(header[col]) != colType:
  371. hasHeader += 1
  372. else:
  373. hasHeader -= 1
  374. else: # attempt typecast
  375. try:
  376. colType(header[col])
  377. except (ValueError, TypeError):
  378. hasHeader += 1
  379. else:
  380. hasHeader -= 1
  381. return hasHeader > 0