/scipy/io/arff/arffread.py
Python | 712 lines | 639 code | 22 blank | 51 comment | 15 complexity | 70f423875954d1d56610424faf68afdf MD5 | raw file
Possible License(s): BSD-3-Clause, JSON
- #! /usr/bin/env python
- # Last Change: Mon Aug 20 08:00 PM 2007 J
- from __future__ import division, print_function, absolute_import
- import re
- import itertools
- import numpy as np
- from scipy.io.arff.utils import partial
- from scipy.lib.six import next
- """A module to read arff files."""
- __all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']
- # An Arff file is basically two parts:
- # - header
- # - data
- #
- # A header has each of its components starting by @META where META is one of
- # the keyword (attribute of relation, for now).
- # TODO:
- # - both integer and reals are treated as numeric -> the integer info is lost !
- # - Replace ValueError by ParseError or something
- # We know can handle the following:
- # - numeric and nominal attributes
- # - missing values for numeric attributes
- r_meta = re.compile('^\s*@')
- # Match a comment
- r_comment = re.compile(r'^%')
- # Match an empty line
- r_empty = re.compile(r'^\s+$')
- # Match a header line, that is a line which starts by @ + a word
- r_headerline = re.compile(r'^@\S*')
- r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
- r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
- r_attribute = re.compile(r'^@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')
- # To get attributes name enclosed with ''
- r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
- # To get attributes name enclosed with '', possibly spread across multilines
- r_mcomattrval = re.compile(r"'([..\n]+)'\s+(..+$)")
- # To get normal attributes
- r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
- #-------------------------
- # Module defined exception
- #-------------------------
- class ArffError(IOError):
- pass
- class ParseArffError(ArffError):
- pass
- #------------------
- # Various utilities
- #------------------
- # An attribute is defined as @attribute name value
- def parse_type(attrtype):
- """Given an arff attribute value (meta data), returns its type.
- Expect the value to be a name."""
- uattribute = attrtype.lower().strip()
- if uattribute[0] == '{':
- return 'nominal'
- elif uattribute[:len('real')] == 'real':
- return 'numeric'
- elif uattribute[:len('integer')] == 'integer':
- return 'numeric'
- elif uattribute[:len('numeric')] == 'numeric':
- return 'numeric'
- elif uattribute[:len('string')] == 'string':
- return 'string'
- elif uattribute[:len('relational')] == 'relational':
- return 'relational'
- else:
- raise ParseArffError("unknown attribute %s" % uattribute)
- def get_nominal(attribute):
- """If attribute is nominal, returns a list of the values"""
- return attribute.split(',')
- def read_data_list(ofile):
- """Read each line of the iterable and put it in a list."""
- data = [next(ofile)]
- if data[0].strip()[0] == '{':
- raise ValueError("This looks like a sparse ARFF: not supported yet")
- data.extend([i for i in ofile])
- return data
- def get_ndata(ofile):
- """Read the whole file to get number of data attributes."""
- data = [next(ofile)]
- loc = 1
- if data[0].strip()[0] == '{':
- raise ValueError("This looks like a sparse ARFF: not supported yet")
- for i in ofile:
- loc += 1
- return loc
- def maxnomlen(atrv):
- """Given a string containing a nominal type definition, returns the
- string len of the biggest component.
- A nominal type is defined as seomthing framed between brace ({}).
- Parameters
- ----------
- atrv : str
- Nominal type definition
- Returns
- -------
- slen : int
- length of longest component
- Examples
- --------
- maxnomlen("{floup, bouga, fl, ratata}") returns 6 (the size of
- ratata, the longest nominal value).
- >>> maxnomlen("{floup, bouga, fl, ratata}")
- 6
- """
- nomtp = get_nom_val(atrv)
- return max(len(i) for i in nomtp)
- def get_nom_val(atrv):
- """Given a string containing a nominal type, returns a tuple of the
- possible values.
- A nominal type is defined as something framed between braces ({}).
- Parameters
- ----------
- atrv : str
- Nominal type definition
- Returns
- -------
- poss_vals : tuple
- possible values
- Examples
- --------
- >>> get_nom_val("{floup, bouga, fl, ratata}")
- ('floup', 'bouga', 'fl', 'ratata')
- """
- r_nominal = re.compile('{(..+)}')
- m = r_nominal.match(atrv)
- if m:
- return tuple(i.strip() for i in m.group(1).split(','))
- else:
- raise ValueError("This does not look like a nominal string")
- def go_data(ofile):
- """Skip header.
- the first next() call of the returned iterator will be the @data line"""
- return itertools.dropwhile(lambda x : not r_datameta.match(x), ofile)
- #----------------
- # Parsing header
- #----------------
- def tokenize_attribute(iterable, attribute):
- """Parse a raw string in header (eg starts by @attribute).
- Given a raw string attribute, try to get the name and type of the
- attribute. Constraints:
- * The first line must start with @attribute (case insensitive, and
- space like characters before @attribute are allowed)
- * Works also if the attribute is spread on multilines.
- * Works if empty lines or comments are in between
- Parameters
- ----------
- attribute : str
- the attribute string.
- Returns
- -------
- name : str
- name of the attribute
- value : str
- value of the attribute
- next : str
- next line to be parsed
- Examples
- --------
- If attribute is a string defined in python as r"floupi real", will
- return floupi as name, and real as value.
- >>> iterable = iter([0] * 10) # dummy iterator
- >>> tokenize_attribute(iterable, r"@attribute floupi real")
- ('floupi', 'real', 0)
- If attribute is r"'floupi 2' real", will return 'floupi 2' as name,
- and real as value.
- >>> tokenize_attribute(iterable, r" @attribute 'floupi 2' real ")
- ('floupi 2', 'real', 0)
- """
- sattr = attribute.strip()
- mattr = r_attribute.match(sattr)
- if mattr:
- # atrv is everything after @attribute
- atrv = mattr.group(1)
- if r_comattrval.match(atrv):
- name, type = tokenize_single_comma(atrv)
- next_item = next(iterable)
- elif r_wcomattrval.match(atrv):
- name, type = tokenize_single_wcomma(atrv)
- next_item = next(iterable)
- else:
- # Not sure we should support this, as it does not seem supported by
- # weka.
- raise ValueError("multi line not supported yet")
- #name, type, next_item = tokenize_multilines(iterable, atrv)
- else:
- raise ValueError("First line unparsable: %s" % sattr)
- if type == 'relational':
- raise ValueError("relational attributes not supported yet")
- return name, type, next_item
- def tokenize_multilines(iterable, val):
- """Can tokenize an attribute spread over several lines."""
- # If one line does not match, read all the following lines up to next
- # line with meta character, and try to parse everything up to there.
- if not r_mcomattrval.match(val):
- all = [val]
- i = next(iterable)
- while not r_meta.match(i):
- all.append(i)
- i = next(iterable)
- if r_mend.search(i):
- raise ValueError("relational attribute not supported yet")
- print("".join(all[:-1]))
- m = r_comattrval.match("".join(all[:-1]))
- return m.group(1), m.group(2), i
- else:
- raise ValueError("Cannot parse attribute names spread over multi "\
- "lines yet")
- def tokenize_single_comma(val):
- # XXX we match twice the same string (here and at the caller level). It is
- # stupid, but it is easier for now...
- m = r_comattrval.match(val)
- if m:
- try:
- name = m.group(1).strip()
- type = m.group(2).strip()
- except IndexError:
- raise ValueError("Error while tokenizing attribute")
- else:
- raise ValueError("Error while tokenizing single %s" % val)
- return name, type
- def tokenize_single_wcomma(val):
- # XXX we match twice the same string (here and at the caller level). It is
- # stupid, but it is easier for now...
- m = r_wcomattrval.match(val)
- if m:
- try:
- name = m.group(1).strip()
- type = m.group(2).strip()
- except IndexError:
- raise ValueError("Error while tokenizing attribute")
- else:
- raise ValueError("Error while tokenizing single %s" % val)
- return name, type
- def read_header(ofile):
- """Read the header of the iterable ofile."""
- i = next(ofile)
- # Pass first comments
- while r_comment.match(i):
- i = next(ofile)
- # Header is everything up to DATA attribute ?
- relation = None
- attributes = []
- while not r_datameta.match(i):
- m = r_headerline.match(i)
- if m:
- isattr = r_attribute.match(i)
- if isattr:
- name, type, i = tokenize_attribute(ofile, i)
- attributes.append((name, type))
- else:
- isrel = r_relation.match(i)
- if isrel:
- relation = isrel.group(1)
- else:
- raise ValueError("Error parsing line %s" % i)
- i = next(ofile)
- else:
- i = next(ofile)
- return relation, attributes
- #--------------------
- # Parsing actual data
- #--------------------
- def safe_float(x):
- """given a string x, convert it to a float. If the stripped string is a ?,
- return a Nan (missing value).
- Parameters
- ----------
- x : str
- string to convert
- Returns
- -------
- f : float
- where float can be nan
- Examples
- --------
- >>> safe_float('1')
- 1.0
- >>> safe_float('1\\n')
- 1.0
- >>> safe_float('?\\n')
- nan
- """
- if '?' in x:
- return np.nan
- else:
- return np.float(x)
- def safe_nominal(value, pvalue):
- svalue = value.strip()
- if svalue in pvalue:
- return svalue
- elif svalue == '?':
- return svalue
- else:
- raise ValueError("%s value not in %s" % (str(svalue), str(pvalue)))
- def get_delim(line):
- """Given a string representing a line of data, check whether the
- delimiter is ',' or space.
- Parameters
- ----------
- line : str
- line of data
- Returns
- -------
- delim : {',', ' '}
- Examples
- --------
- >>> get_delim(',')
- ','
- >>> get_delim(' ')
- ' '
- >>> get_delim(', ')
- ','
- >>> get_delim('x')
- Traceback (most recent call last):
- ...
- ValueError: delimiter not understood: x
- """
- if ',' in line:
- return ','
- if ' ' in line:
- return ' '
- raise ValueError("delimiter not understood: " + line)
- class MetaData(object):
- """Small container to keep useful informations on a ARFF dataset.
- Knows about attributes names and types.
- Examples
- --------
- data, meta = loadarff('iris.arff')
- # This will print the attributes names of the iris.arff dataset
- for i in meta:
- print i
- # This works too
- meta.names()
- # Getting attribute type
- types = meta.types()
- Notes
- -----
- Also maintains the list of attributes in order, i.e. doing for i in
- meta, where meta is an instance of MetaData, will return the
- different attribute names in the order they were defined.
- """
- def __init__(self, rel, attr):
- self.name = rel
- # We need the dictionary to be ordered
- # XXX: may be better to implement an ordered dictionary
- self._attributes = {}
- self._attrnames = []
- for name, value in attr:
- tp = parse_type(value)
- self._attrnames.append(name)
- if tp == 'nominal':
- self._attributes[name] = (tp, get_nom_val(value))
- else:
- self._attributes[name] = (tp, None)
- def __repr__(self):
- msg = ""
- msg += "Dataset: %s\n" % self.name
- for i in self._attrnames:
- msg += "\t%s's type is %s" % (i, self._attributes[i][0])
- if self._attributes[i][1]:
- msg += ", range is %s" % str(self._attributes[i][1])
- msg += '\n'
- return msg
- def __iter__(self):
- return iter(self._attrnames)
- def __getitem__(self, key):
- return self._attributes[key]
- def names(self):
- """Return the list of attribute names."""
- return self._attrnames
- def types(self):
- """Return the list of attribute types."""
- attr_types = [self._attributes[name][0] for name in self._attrnames]
- return attr_types
- def loadarff(f):
- """
- Read an arff file.
- The data is returned as a record array, which can be accessed much like
- a dictionary of numpy arrays. For example, if one of the attributes is
- called 'pressure', then its first 10 data points can be accessed from the
- ``data`` record array like so: ``data['pressure'][0:10]``
- Parameters
- ----------
- f : file-like or str
- File-like object to read from, or filename to open.
- Returns
- -------
- data : record array
- The data of the arff file, accessible by attribute names.
- meta : `MetaData`
- Contains information about the arff file such as name and
- type of attributes, the relation (name of the dataset), etc...
- Raises
- ------
- `ParseArffError`
- This is raised if the given file is not ARFF-formatted.
- NotImplementedError
- The ARFF file has an attribute which is not supported yet.
- Notes
- -----
- This function should be able to read most arff files. Not
- implemented functionality include:
- * date type attributes
- * string type attributes
- It can read files with numeric and nominal attributes. It cannot read
- files with sparse data ({} in the file). However, this function can
- read files with missing data (? in the file), representing the data
- points as NaNs.
- """
- if hasattr(f, 'read'):
- ofile = f
- else:
- ofile = open(f, 'rt')
- try:
- return _loadarff(ofile)
- finally:
- if ofile is not f: # only close what we opened
- ofile.close()
- def _loadarff(ofile):
- # Parse the header file
- try:
- rel, attr = read_header(ofile)
- except ValueError as e:
- msg = "Error while parsing header, error was: " + str(e)
- raise ParseArffError(msg)
- # Check whether we have a string attribute (not supported yet)
- hasstr = False
- for name, value in attr:
- type = parse_type(value)
- if type == 'string':
- hasstr = True
- meta = MetaData(rel, attr)
- # XXX The following code is not great
- # Build the type descriptor descr and the list of convertors to convert
- # each attribute to the suitable type (which should match the one in
- # descr).
- # This can be used once we want to support integer as integer values and
- # not as numeric anymore (using masked arrays ?).
- acls2dtype = {'real' : np.float, 'integer' : np.float, 'numeric' : np.float}
- acls2conv = {'real' : safe_float, 'integer' : safe_float, 'numeric' : safe_float}
- descr = []
- convertors = []
- if not hasstr:
- for name, value in attr:
- type = parse_type(value)
- if type == 'date':
- raise ValueError("date type not supported yet, sorry")
- elif type == 'nominal':
- n = maxnomlen(value)
- descr.append((name, 'S%d' % n))
- pvalue = get_nom_val(value)
- convertors.append(partial(safe_nominal, pvalue = pvalue))
- else:
- descr.append((name, acls2dtype[type]))
- convertors.append(safe_float)
- #dc.append(acls2conv[type])
- #sdescr.append((name, acls2sdtype[type]))
- else:
- # How to support string efficiently ? Ideally, we should know the max
- # size of the string before allocating the numpy array.
- raise NotImplementedError("String attributes not supported yet, sorry")
- ni = len(convertors)
- # Get the delimiter from the first line of data:
- def next_data_line(row_iter):
- """Assumes we are already in the data part (eg after @data)."""
- raw = next(row_iter)
- while r_empty.match(raw):
- raw = next(row_iter)
- while r_comment.match(raw):
- raw = next(row_iter)
- return raw
- try:
- try:
- dtline = next_data_line(ofile)
- delim = get_delim(dtline)
- except ValueError as e:
- raise ParseArffError("Error while parsing delimiter: " + str(e))
- finally:
- ofile.seek(0, 0)
- ofile = go_data(ofile)
- # skip the @data line
- next(ofile)
- def generator(row_iter, delim = ','):
- # TODO: this is where we are spending times (~80%). I think things
- # could be made more efficiently:
- # - We could for example "compile" the function, because some values
- # do not change here.
- # - The function to convert a line to dtyped values could also be
- # generated on the fly from a string and be executed instead of
- # looping.
- # - The regex are overkill: for comments, checking that a line starts
- # by % should be enough and faster, and for empty lines, same thing
- # --> this does not seem to change anything.
- # We do not abstract skipping comments and empty lines for performances
- # reason.
- raw = next(row_iter)
- while r_empty.match(raw):
- raw = next(row_iter)
- while r_comment.match(raw):
- raw = next(row_iter)
- # 'compiling' the range since it does not change
- # Note, I have already tried zipping the converters and
- # row elements and got slightly worse performance.
- elems = list(range(ni))
- row = raw.split(delim)
- yield tuple([convertors[i](row[i]) for i in elems])
- for raw in row_iter:
- while r_comment.match(raw):
- raw = next(row_iter)
- while r_empty.match(raw):
- raw = next(row_iter)
- row = raw.split(delim)
- yield tuple([convertors[i](row[i]) for i in elems])
- a = generator(ofile, delim = delim)
- # No error should happen here: it is a bug otherwise
- data = np.fromiter(a, descr)
- return data, meta
- #-----
- # Misc
- #-----
- def basic_stats(data):
- nbfac = data.size * 1. / (data.size - 1)
- return np.nanmin(data), np.nanmax(data), np.mean(data), np.std(data) * nbfac
- def print_attribute(name, tp, data):
- type = tp[0]
- if type == 'numeric' or type == 'real' or type == 'integer':
- min, max, mean, std = basic_stats(data)
- print("%s,%s,%f,%f,%f,%f" % (name, type, min, max, mean, std))
- else:
- msg = name + ",{"
- for i in range(len(tp[1])-1):
- msg += tp[1][i] + ","
- msg += tp[1][-1]
- msg += "}"
- print(msg)
- def test_weka(filename):
- data, meta = loadarff(filename)
- print(len(data.dtype))
- print(data.size)
- for i in meta:
- print_attribute(i,meta[i],data[i])
- # make sure nose does not find this as a test
- test_weka.__test__ = False
- def floupi(filename):
- data, meta = loadarff(filename)
- from attrselect import print_dataset_info
- print_dataset_info(data)
- print("relation %s, has %d instances" % (meta.name, data.size))
- itp = iter(types)
- for i in data.dtype.names:
- print_attribute(i,next(itp),data[i])
- #tp = itp.next()
- #if tp == 'numeric' or tp == 'real' or tp == 'integer':
- # min, max, mean, std = basic_stats(data[i])
- # print "\tinstance %s: min %f, max %f, mean %f, std %f" % \
- # (i, min, max, mean, std)
- #else:
- # print "\tinstance %s is non numeric" % i
- if __name__ == '__main__':
- #import glob
- #for i in glob.glob('arff.bak/data/*'):
- # relation, attributes = read_header(open(i))
- # print "Parsing header of %s: relation %s, %d attributes" % (i,
- # relation, len(attributes))
- import sys
- filename = sys.argv[1]
- #filename = 'arff.bak/data/pharynx.arff'
- #floupi(filename)
- test_weka(filename)
- #gf = []
- #wf = []
- #for i in glob.glob('arff.bak/data/*'):
- # try:
- # print "=============== reading %s ======================" % i
- # floupi(i)
- # gf.append(i)
- # except ValueError, e:
- # print "!!!! Error parsing the file !!!!!"
- # print e
- # wf.append(i)
- # except IndexError, e:
- # print "!!!! Error parsing the file !!!!!"
- # print e
- # wf.append(i)
- # except ArffError, e:
- # print "!!!! Error parsing the file !!!!!"
- # print e
- # wf.append(i)
- #print "%d good files" % len(gf)
- #print "%d bad files" % len(wf)