PageRenderTime 59ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/scipy/io/arff/arffread.py

https://github.com/jm2201/scipy
Python | 712 lines | 639 code | 22 blank | 51 comment | 15 complexity | 70f423875954d1d56610424faf68afdf MD5 | raw file
Possible License(s): BSD-3-Clause, JSON
  1. #! /usr/bin/env python
  2. # Last Change: Mon Aug 20 08:00 PM 2007 J
  3. from __future__ import division, print_function, absolute_import
  4. import re
  5. import itertools
  6. import numpy as np
  7. from scipy.io.arff.utils import partial
  8. from scipy.lib.six import next
  9. """A module to read arff files."""
  10. __all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']
  11. # An Arff file is basically two parts:
  12. # - header
  13. # - data
  14. #
  15. # A header has each of its components starting by @META where META is one of
  16. # the keyword (attribute of relation, for now).
  17. # TODO:
  18. # - both integer and reals are treated as numeric -> the integer info is lost !
  19. # - Replace ValueError by ParseError or something
  20. # We know can handle the following:
  21. # - numeric and nominal attributes
  22. # - missing values for numeric attributes
  23. r_meta = re.compile('^\s*@')
  24. # Match a comment
  25. r_comment = re.compile(r'^%')
  26. # Match an empty line
  27. r_empty = re.compile(r'^\s+$')
  28. # Match a header line, that is a line which starts by @ + a word
  29. r_headerline = re.compile(r'^@\S*')
  30. r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
  31. r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
  32. r_attribute = re.compile(r'^@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')
  33. # To get attributes name enclosed with ''
  34. r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
  35. # To get attributes name enclosed with '', possibly spread across multilines
  36. r_mcomattrval = re.compile(r"'([..\n]+)'\s+(..+$)")
  37. # To get normal attributes
  38. r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
  39. #-------------------------
  40. # Module defined exception
  41. #-------------------------
class ArffError(IOError):
    """Base exception for all errors raised while reading an ARFF file."""
    pass
class ParseArffError(ArffError):
    """Raised when the content of an ARFF file cannot be parsed."""
    pass
  46. #------------------
  47. # Various utilities
  48. #------------------
  49. # An attribute is defined as @attribute name value
  50. def parse_type(attrtype):
  51. """Given an arff attribute value (meta data), returns its type.
  52. Expect the value to be a name."""
  53. uattribute = attrtype.lower().strip()
  54. if uattribute[0] == '{':
  55. return 'nominal'
  56. elif uattribute[:len('real')] == 'real':
  57. return 'numeric'
  58. elif uattribute[:len('integer')] == 'integer':
  59. return 'numeric'
  60. elif uattribute[:len('numeric')] == 'numeric':
  61. return 'numeric'
  62. elif uattribute[:len('string')] == 'string':
  63. return 'string'
  64. elif uattribute[:len('relational')] == 'relational':
  65. return 'relational'
  66. else:
  67. raise ParseArffError("unknown attribute %s" % uattribute)
  68. def get_nominal(attribute):
  69. """If attribute is nominal, returns a list of the values"""
  70. return attribute.split(',')
  71. def read_data_list(ofile):
  72. """Read each line of the iterable and put it in a list."""
  73. data = [next(ofile)]
  74. if data[0].strip()[0] == '{':
  75. raise ValueError("This looks like a sparse ARFF: not supported yet")
  76. data.extend([i for i in ofile])
  77. return data
  78. def get_ndata(ofile):
  79. """Read the whole file to get number of data attributes."""
  80. data = [next(ofile)]
  81. loc = 1
  82. if data[0].strip()[0] == '{':
  83. raise ValueError("This looks like a sparse ARFF: not supported yet")
  84. for i in ofile:
  85. loc += 1
  86. return loc
  87. def maxnomlen(atrv):
  88. """Given a string containing a nominal type definition, returns the
  89. string len of the biggest component.
  90. A nominal type is defined as seomthing framed between brace ({}).
  91. Parameters
  92. ----------
  93. atrv : str
  94. Nominal type definition
  95. Returns
  96. -------
  97. slen : int
  98. length of longest component
  99. Examples
  100. --------
  101. maxnomlen("{floup, bouga, fl, ratata}") returns 6 (the size of
  102. ratata, the longest nominal value).
  103. >>> maxnomlen("{floup, bouga, fl, ratata}")
  104. 6
  105. """
  106. nomtp = get_nom_val(atrv)
  107. return max(len(i) for i in nomtp)
  108. def get_nom_val(atrv):
  109. """Given a string containing a nominal type, returns a tuple of the
  110. possible values.
  111. A nominal type is defined as something framed between braces ({}).
  112. Parameters
  113. ----------
  114. atrv : str
  115. Nominal type definition
  116. Returns
  117. -------
  118. poss_vals : tuple
  119. possible values
  120. Examples
  121. --------
  122. >>> get_nom_val("{floup, bouga, fl, ratata}")
  123. ('floup', 'bouga', 'fl', 'ratata')
  124. """
  125. r_nominal = re.compile('{(..+)}')
  126. m = r_nominal.match(atrv)
  127. if m:
  128. return tuple(i.strip() for i in m.group(1).split(','))
  129. else:
  130. raise ValueError("This does not look like a nominal string")
  131. def go_data(ofile):
  132. """Skip header.
  133. the first next() call of the returned iterator will be the @data line"""
  134. return itertools.dropwhile(lambda x : not r_datameta.match(x), ofile)
  135. #----------------
  136. # Parsing header
  137. #----------------
  138. def tokenize_attribute(iterable, attribute):
  139. """Parse a raw string in header (eg starts by @attribute).
  140. Given a raw string attribute, try to get the name and type of the
  141. attribute. Constraints:
  142. * The first line must start with @attribute (case insensitive, and
  143. space like characters before @attribute are allowed)
  144. * Works also if the attribute is spread on multilines.
  145. * Works if empty lines or comments are in between
  146. Parameters
  147. ----------
  148. attribute : str
  149. the attribute string.
  150. Returns
  151. -------
  152. name : str
  153. name of the attribute
  154. value : str
  155. value of the attribute
  156. next : str
  157. next line to be parsed
  158. Examples
  159. --------
  160. If attribute is a string defined in python as r"floupi real", will
  161. return floupi as name, and real as value.
  162. >>> iterable = iter([0] * 10) # dummy iterator
  163. >>> tokenize_attribute(iterable, r"@attribute floupi real")
  164. ('floupi', 'real', 0)
  165. If attribute is r"'floupi 2' real", will return 'floupi 2' as name,
  166. and real as value.
  167. >>> tokenize_attribute(iterable, r" @attribute 'floupi 2' real ")
  168. ('floupi 2', 'real', 0)
  169. """
  170. sattr = attribute.strip()
  171. mattr = r_attribute.match(sattr)
  172. if mattr:
  173. # atrv is everything after @attribute
  174. atrv = mattr.group(1)
  175. if r_comattrval.match(atrv):
  176. name, type = tokenize_single_comma(atrv)
  177. next_item = next(iterable)
  178. elif r_wcomattrval.match(atrv):
  179. name, type = tokenize_single_wcomma(atrv)
  180. next_item = next(iterable)
  181. else:
  182. # Not sure we should support this, as it does not seem supported by
  183. # weka.
  184. raise ValueError("multi line not supported yet")
  185. #name, type, next_item = tokenize_multilines(iterable, atrv)
  186. else:
  187. raise ValueError("First line unparsable: %s" % sattr)
  188. if type == 'relational':
  189. raise ValueError("relational attributes not supported yet")
  190. return name, type, next_item
def tokenize_multilines(iterable, val):
    """Can tokenize an attribute spread over several lines."""
    # If one line does not match, read all the following lines up to next
    # line with meta character, and try to parse everything up to there.
    if not r_mcomattrval.match(val):
        all = [val]
        i = next(iterable)
        while not r_meta.match(i):
            all.append(i)
            i = next(iterable)
        # NOTE(review): r_mend is not defined anywhere in this module, so this
        # branch raises NameError if ever reached.  This function appears to
        # be dead code: its only call site in tokenize_attribute is commented
        # out.
        if r_mend.search(i):
            raise ValueError("relational attribute not supported yet")
        print("".join(all[:-1]))
        m = r_comattrval.match("".join(all[:-1]))
        return m.group(1), m.group(2), i
    else:
        raise ValueError("Cannot parse attribute names spread over multi "\
                "lines yet")
  209. def tokenize_single_comma(val):
  210. # XXX we match twice the same string (here and at the caller level). It is
  211. # stupid, but it is easier for now...
  212. m = r_comattrval.match(val)
  213. if m:
  214. try:
  215. name = m.group(1).strip()
  216. type = m.group(2).strip()
  217. except IndexError:
  218. raise ValueError("Error while tokenizing attribute")
  219. else:
  220. raise ValueError("Error while tokenizing single %s" % val)
  221. return name, type
  222. def tokenize_single_wcomma(val):
  223. # XXX we match twice the same string (here and at the caller level). It is
  224. # stupid, but it is easier for now...
  225. m = r_wcomattrval.match(val)
  226. if m:
  227. try:
  228. name = m.group(1).strip()
  229. type = m.group(2).strip()
  230. except IndexError:
  231. raise ValueError("Error while tokenizing attribute")
  232. else:
  233. raise ValueError("Error while tokenizing single %s" % val)
  234. return name, type
def read_header(ofile):
    """Read the header of the iterable ofile.

    Returns
    -------
    relation : str or None
        the @relation name, or None if no @relation line was seen.
    attributes : list of (str, str)
        (name, raw type string) for each @attribute, in declaration order.

    Notes
    -----
    Consumes lines up to and including the line just before the data rows;
    the loop stops once the @data line is reached.
    """
    i = next(ofile)

    # Pass first comments
    while r_comment.match(i):
        i = next(ofile)

    # Header is everything up to DATA attribute ?
    relation = None
    attributes = []
    while not r_datameta.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute also consumes and returns the following
                # line, so no explicit next() is needed on this path.
                name, type, i = tokenize_attribute(ofile, i)
                attributes.append((name, type))
            else:
                isrel = r_relation.match(i)
                if isrel:
                    relation = isrel.group(1)
                else:
                    raise ValueError("Error parsing line %s" % i)
                i = next(ofile)
        else:
            # Not a header line (comment/blank): skip it.
            i = next(ofile)
    return relation, attributes
  261. #--------------------
  262. # Parsing actual data
  263. #--------------------
  264. def safe_float(x):
  265. """given a string x, convert it to a float. If the stripped string is a ?,
  266. return a Nan (missing value).
  267. Parameters
  268. ----------
  269. x : str
  270. string to convert
  271. Returns
  272. -------
  273. f : float
  274. where float can be nan
  275. Examples
  276. --------
  277. >>> safe_float('1')
  278. 1.0
  279. >>> safe_float('1\\n')
  280. 1.0
  281. >>> safe_float('?\\n')
  282. nan
  283. """
  284. if '?' in x:
  285. return np.nan
  286. else:
  287. return np.float(x)
  288. def safe_nominal(value, pvalue):
  289. svalue = value.strip()
  290. if svalue in pvalue:
  291. return svalue
  292. elif svalue == '?':
  293. return svalue
  294. else:
  295. raise ValueError("%s value not in %s" % (str(svalue), str(pvalue)))
  296. def get_delim(line):
  297. """Given a string representing a line of data, check whether the
  298. delimiter is ',' or space.
  299. Parameters
  300. ----------
  301. line : str
  302. line of data
  303. Returns
  304. -------
  305. delim : {',', ' '}
  306. Examples
  307. --------
  308. >>> get_delim(',')
  309. ','
  310. >>> get_delim(' ')
  311. ' '
  312. >>> get_delim(', ')
  313. ','
  314. >>> get_delim('x')
  315. Traceback (most recent call last):
  316. ...
  317. ValueError: delimiter not understood: x
  318. """
  319. if ',' in line:
  320. return ','
  321. if ' ' in line:
  322. return ' '
  323. raise ValueError("delimiter not understood: " + line)
  324. class MetaData(object):
  325. """Small container to keep useful informations on a ARFF dataset.
  326. Knows about attributes names and types.
  327. Examples
  328. --------
  329. data, meta = loadarff('iris.arff')
  330. # This will print the attributes names of the iris.arff dataset
  331. for i in meta:
  332. print i
  333. # This works too
  334. meta.names()
  335. # Getting attribute type
  336. types = meta.types()
  337. Notes
  338. -----
  339. Also maintains the list of attributes in order, i.e. doing for i in
  340. meta, where meta is an instance of MetaData, will return the
  341. different attribute names in the order they were defined.
  342. """
  343. def __init__(self, rel, attr):
  344. self.name = rel
  345. # We need the dictionary to be ordered
  346. # XXX: may be better to implement an ordered dictionary
  347. self._attributes = {}
  348. self._attrnames = []
  349. for name, value in attr:
  350. tp = parse_type(value)
  351. self._attrnames.append(name)
  352. if tp == 'nominal':
  353. self._attributes[name] = (tp, get_nom_val(value))
  354. else:
  355. self._attributes[name] = (tp, None)
  356. def __repr__(self):
  357. msg = ""
  358. msg += "Dataset: %s\n" % self.name
  359. for i in self._attrnames:
  360. msg += "\t%s's type is %s" % (i, self._attributes[i][0])
  361. if self._attributes[i][1]:
  362. msg += ", range is %s" % str(self._attributes[i][1])
  363. msg += '\n'
  364. return msg
  365. def __iter__(self):
  366. return iter(self._attrnames)
  367. def __getitem__(self, key):
  368. return self._attributes[key]
  369. def names(self):
  370. """Return the list of attribute names."""
  371. return self._attrnames
  372. def types(self):
  373. """Return the list of attribute types."""
  374. attr_types = [self._attributes[name][0] for name in self._attrnames]
  375. return attr_types
  376. def loadarff(f):
  377. """
  378. Read an arff file.
  379. The data is returned as a record array, which can be accessed much like
  380. a dictionary of numpy arrays. For example, if one of the attributes is
  381. called 'pressure', then its first 10 data points can be accessed from the
  382. ``data`` record array like so: ``data['pressure'][0:10]``
  383. Parameters
  384. ----------
  385. f : file-like or str
  386. File-like object to read from, or filename to open.
  387. Returns
  388. -------
  389. data : record array
  390. The data of the arff file, accessible by attribute names.
  391. meta : `MetaData`
  392. Contains information about the arff file such as name and
  393. type of attributes, the relation (name of the dataset), etc...
  394. Raises
  395. ------
  396. `ParseArffError`
  397. This is raised if the given file is not ARFF-formatted.
  398. NotImplementedError
  399. The ARFF file has an attribute which is not supported yet.
  400. Notes
  401. -----
  402. This function should be able to read most arff files. Not
  403. implemented functionality include:
  404. * date type attributes
  405. * string type attributes
  406. It can read files with numeric and nominal attributes. It cannot read
  407. files with sparse data ({} in the file). However, this function can
  408. read files with missing data (? in the file), representing the data
  409. points as NaNs.
  410. """
  411. if hasattr(f, 'read'):
  412. ofile = f
  413. else:
  414. ofile = open(f, 'rt')
  415. try:
  416. return _loadarff(ofile)
  417. finally:
  418. if ofile is not f: # only close what we opened
  419. ofile.close()
  420. def _loadarff(ofile):
  421. # Parse the header file
  422. try:
  423. rel, attr = read_header(ofile)
  424. except ValueError as e:
  425. msg = "Error while parsing header, error was: " + str(e)
  426. raise ParseArffError(msg)
  427. # Check whether we have a string attribute (not supported yet)
  428. hasstr = False
  429. for name, value in attr:
  430. type = parse_type(value)
  431. if type == 'string':
  432. hasstr = True
  433. meta = MetaData(rel, attr)
  434. # XXX The following code is not great
  435. # Build the type descriptor descr and the list of convertors to convert
  436. # each attribute to the suitable type (which should match the one in
  437. # descr).
  438. # This can be used once we want to support integer as integer values and
  439. # not as numeric anymore (using masked arrays ?).
  440. acls2dtype = {'real' : np.float, 'integer' : np.float, 'numeric' : np.float}
  441. acls2conv = {'real' : safe_float, 'integer' : safe_float, 'numeric' : safe_float}
  442. descr = []
  443. convertors = []
  444. if not hasstr:
  445. for name, value in attr:
  446. type = parse_type(value)
  447. if type == 'date':
  448. raise ValueError("date type not supported yet, sorry")
  449. elif type == 'nominal':
  450. n = maxnomlen(value)
  451. descr.append((name, 'S%d' % n))
  452. pvalue = get_nom_val(value)
  453. convertors.append(partial(safe_nominal, pvalue = pvalue))
  454. else:
  455. descr.append((name, acls2dtype[type]))
  456. convertors.append(safe_float)
  457. #dc.append(acls2conv[type])
  458. #sdescr.append((name, acls2sdtype[type]))
  459. else:
  460. # How to support string efficiently ? Ideally, we should know the max
  461. # size of the string before allocating the numpy array.
  462. raise NotImplementedError("String attributes not supported yet, sorry")
  463. ni = len(convertors)
  464. # Get the delimiter from the first line of data:
  465. def next_data_line(row_iter):
  466. """Assumes we are already in the data part (eg after @data)."""
  467. raw = next(row_iter)
  468. while r_empty.match(raw):
  469. raw = next(row_iter)
  470. while r_comment.match(raw):
  471. raw = next(row_iter)
  472. return raw
  473. try:
  474. try:
  475. dtline = next_data_line(ofile)
  476. delim = get_delim(dtline)
  477. except ValueError as e:
  478. raise ParseArffError("Error while parsing delimiter: " + str(e))
  479. finally:
  480. ofile.seek(0, 0)
  481. ofile = go_data(ofile)
  482. # skip the @data line
  483. next(ofile)
  484. def generator(row_iter, delim = ','):
  485. # TODO: this is where we are spending times (~80%). I think things
  486. # could be made more efficiently:
  487. # - We could for example "compile" the function, because some values
  488. # do not change here.
  489. # - The function to convert a line to dtyped values could also be
  490. # generated on the fly from a string and be executed instead of
  491. # looping.
  492. # - The regex are overkill: for comments, checking that a line starts
  493. # by % should be enough and faster, and for empty lines, same thing
  494. # --> this does not seem to change anything.
  495. # We do not abstract skipping comments and empty lines for performances
  496. # reason.
  497. raw = next(row_iter)
  498. while r_empty.match(raw):
  499. raw = next(row_iter)
  500. while r_comment.match(raw):
  501. raw = next(row_iter)
  502. # 'compiling' the range since it does not change
  503. # Note, I have already tried zipping the converters and
  504. # row elements and got slightly worse performance.
  505. elems = list(range(ni))
  506. row = raw.split(delim)
  507. yield tuple([convertors[i](row[i]) for i in elems])
  508. for raw in row_iter:
  509. while r_comment.match(raw):
  510. raw = next(row_iter)
  511. while r_empty.match(raw):
  512. raw = next(row_iter)
  513. row = raw.split(delim)
  514. yield tuple([convertors[i](row[i]) for i in elems])
  515. a = generator(ofile, delim = delim)
  516. # No error should happen here: it is a bug otherwise
  517. data = np.fromiter(a, descr)
  518. return data, meta
  519. #-----
  520. # Misc
  521. #-----
  522. def basic_stats(data):
  523. nbfac = data.size * 1. / (data.size - 1)
  524. return np.nanmin(data), np.nanmax(data), np.mean(data), np.std(data) * nbfac
  525. def print_attribute(name, tp, data):
  526. type = tp[0]
  527. if type == 'numeric' or type == 'real' or type == 'integer':
  528. min, max, mean, std = basic_stats(data)
  529. print("%s,%s,%f,%f,%f,%f" % (name, type, min, max, mean, std))
  530. else:
  531. msg = name + ",{"
  532. for i in range(len(tp[1])-1):
  533. msg += tp[1][i] + ","
  534. msg += tp[1][-1]
  535. msg += "}"
  536. print(msg)
def test_weka(filename):
    """Debug helper: load *filename* and print a summary of each attribute."""
    data, meta = loadarff(filename)
    # Number of fields and number of records.
    print(len(data.dtype))
    print(data.size)
    for i in meta:
        print_attribute(i, meta[i], data[i])
# make sure nose does not find this as a test
test_weka.__test__ = False
  545. def floupi(filename):
  546. data, meta = loadarff(filename)
  547. from attrselect import print_dataset_info
  548. print_dataset_info(data)
  549. print("relation %s, has %d instances" % (meta.name, data.size))
  550. itp = iter(types)
  551. for i in data.dtype.names:
  552. print_attribute(i,next(itp),data[i])
  553. #tp = itp.next()
  554. #if tp == 'numeric' or tp == 'real' or tp == 'integer':
  555. # min, max, mean, std = basic_stats(data[i])
  556. # print "\tinstance %s: min %f, max %f, mean %f, std %f" % \
  557. # (i, min, max, mean, std)
  558. #else:
  559. # print "\tinstance %s is non numeric" % i
if __name__ == '__main__':
    # Command-line usage: python arffread.py <file.arff>
    #import glob
    #for i in glob.glob('arff.bak/data/*'):
    #    relation, attributes = read_header(open(i))
    #    print "Parsing header of %s: relation %s, %d attributes" % (i,
    #            relation, len(attributes))
    import sys
    filename = sys.argv[1]
    #filename = 'arff.bak/data/pharynx.arff'
    #floupi(filename)
    test_weka(filename)
    # Historical (python 2 syntax) bulk-testing loop, kept for reference:
    #gf = []
    #wf = []
    #for i in glob.glob('arff.bak/data/*'):
    #    try:
    #        print "=============== reading %s ======================" % i
    #        floupi(i)
    #        gf.append(i)
    #    except ValueError, e:
    #        print "!!!! Error parsing the file !!!!!"
    #        print e
    #        wf.append(i)
    #    except IndexError, e:
    #        print "!!!! Error parsing the file !!!!!"
    #        print e
    #        wf.append(i)
    #    except ArffError, e:
    #        print "!!!! Error parsing the file !!!!!"
    #        print e
    #        wf.append(i)
    #print "%d good files" % len(gf)
    #print "%d bad files" % len(wf)