/PyML-0.7.9/PyML/containers/vectorDatasets.py

https://github.com/cathywu/Sentiment-Analysis
Python | 262 lines | 171 code | 55 blank | 36 comment | 66 complexity | 14dc1387578162a64e45b4c37f92ef25 MD5 | raw file
  1. import numpy
  2. from PyML.containers.baseDatasets import WrapperDataSet, BaseVectorDataSet
  3. from PyML.utils import arrayWrap,misc
  4. from ext import csparsedataset,cvectordataset
  5. class BaseCVectorDataSet (WrapperDataSet, BaseVectorDataSet) :
  6. """A base class for vector dataset containers implemented in C++"""
  7. def __init__(self) :
  8. if self.__class__.__name__ == 'SparseDataSet' :
  9. self.container = csparsedataset.SparseDataSet
  10. elif self.__class__.__name__ == 'VectorDataSet' :
  11. self.container = cvectordataset.VectorDataSet
  12. def copy(self, other, patterns, deepcopy) :
  13. """
  14. copy a wrapper dataset
  15. :Parameters:
  16. - `other` - the other dataset
  17. - `patternsToCopy` - a list of patterns to copy
  18. - `deepcopy` - a 0/1 flag telling whether to do deepcopy or not
  19. """
  20. if patterns is None :
  21. patterns = range(len(other))
  22. self.container.__init__(self, other, patterns)
  23. self.featureDict = other.featureDict.copy()
  24. self.featureID = other.featureID[:]
  25. def initializeDataMatrix(self, numPatterns, numFeatures) :
  26. self.container.__init__(self, numPatterns)
  27. def addPattern(self, x, i) :
  28. if type(x) == type({}) :
  29. keys,values = arrayWrap.dict2vectors(x)
  30. elif type(x) == type(numpy.array(1)) or type(x) == type([]) :
  31. keys = arrayWrap.longVector([])
  32. values = arrayWrap.doubleVector(x)
  33. else:
  34. raise TypeError,"data vectors must be dictionary, list or arrays"
  35. self.container.addPattern(self, keys, values)
  36. def addFeature(self, id, values) :
  37. """
  38. Add a feature to a dataset.
  39. :Parameters:
  40. - `id` - the id of the feature
  41. - `values` - list of values
  42. """
  43. if len(values) != self.size() :
  44. raise ValueError, \
  45. 'number of values provided does not match dataset size'
  46. if type(id) == type(1) :
  47. id = str(id)
  48. hashID = hash(id)
  49. if not hasattr(self, 'featureKeyDict') :
  50. self.addFeatureKeyDict()
  51. if hashID in self.featureKeyDict :
  52. raise ValueError, 'Feature already exists, or hash clash'
  53. if type(values) != type([]) :
  54. values = [v for v in values]
  55. self.container.addFeature(self, hashID, values)
  56. self.updateFeatureDict(id)
  57. def addFeatures(self, other) :
  58. """
  59. Add features to a dataset using the features in another dataset
  60. :Parameters:
  61. - `other` - the other dataset
  62. """
  63. if len(other) != len(self) :
  64. raise ValueError, 'number of examples does not match'
  65. if not hasattr(self, 'featureKeyDict') :
  66. self.addFeatureKeyDict()
  67. for id in other.featureID :
  68. if hash(id) in self.featureKeyDict :
  69. raise ValueError, 'Feature already exists, or hash clash'
  70. self.container.addFeatures(self, other)
  71. self.updateFeatureDict(other)
  72. def getPattern(self, i) :
  73. if i < 0 or i >= len(self) :
  74. raise ValueError, 'Index out of range'
  75. return self.container.getPattern(self, i)
  76. def extendX(self, other, patterns) :
  77. self.container.extend(self, other, patterns)
  78. def eliminateFeatures(self, featureList):
  79. """eliminate a list of features from a dataset
  80. INPUT:
  81. featureList - a list of features to eliminate; these are numbers
  82. between 0 and numFeatures-1 (indices of features, not their IDs)"""
  83. if len(featureList) == 0 : return
  84. if type(featureList[0]) == type('') :
  85. featureList = self.featureNames2IDs(featureList)
  86. featureList.sort()
  87. if type(featureList) != list :
  88. featureList = list(featureList)
  89. if max(featureList) >= self.numFeatures or min(featureList) < 0 :
  90. raise ValueError, 'Bad feature list'
  91. cfeatureList = arrayWrap.intVector(featureList)
  92. self.container.eliminateFeatures(self, cfeatureList)
  93. self.updateFeatureDict(featureList)
  94. def scale(self, w) :
  95. """rescale the columns of the data matrix by a weight vector w:
  96. set X[i][j] = X[i][j] * w[j]
  97. """
  98. if type(w) == type(1.0) :
  99. w = [w for i in range(self.numFeatures)]
  100. if type(w) != type([]) :
  101. w = list(w)
  102. #numpy.ones(self.numFeatures, numpy.float_) * w
  103. self.container.scale(self, w)
  104. def translate(self, c) :
  105. if type(c) != type([]) :
  106. c = list(c)
  107. self.container.translate(self, c)
  108. def mean(self, patterns = None) :
  109. if patterns is None : patterns = range(len(self))
  110. if min(patterns) < 0 or max(patterns) >= len(self) :
  111. raise ValueError, 'Pattern index out of range'
  112. cpatterns = arrayWrap.intVector(patterns)
  113. return self.container.mean(self, cpatterns)
  114. def std(self, patterns = None) :
  115. if patterns is None : patterns = range(len(self))
  116. if type(patterns) != type([]) : patterns = list(patterns)
  117. if min(patterns) < 0 or max(patterns) >= len(self) :
  118. raise ValueError, 'Pattern index out of range'
  119. cpatterns = arrayWrap.intVector(patterns)
  120. return self.container.standardDeviation(self, cpatterns)
  121. def featureCount(self, feature, patterns = None) :
  122. if patterns is None : patterns = range(len(self))
  123. if type(patterns) != type([]) : patterns = list(patterns)
  124. if min(patterns) < 0 or max(patterns) >= len(self) :
  125. raise ValueError, 'Pattern index out of range'
  126. cpatterns = arrayWrap.intVector(patterns)
  127. return self.container.featureCount(self, feature, cpatterns)
  128. def featureCounts(self, patterns = None) :
  129. if patterns is None : patterns = range(len(self))
  130. if type(patterns) != type([]) : patterns = list(patterns)
  131. if min(patterns) < 0 or max(patterns) >= len(self) :
  132. raise ValueError, 'Pattern index out of range'
  133. cpatterns = arrayWrap.intVector(patterns)
  134. return self.container.featureCounts(self, cpatterns)
  135. def nonzero(self, feature, patterns = None) :
  136. if patterns is None : patterns = range(len(self))
  137. if type(patterns) != type([]) : patterns = list(patterns)
  138. if min(patterns) < 0 or max(patterns) >= len(self) :
  139. raise ValueError, 'Pattern index goes outside of range'
  140. cpatterns = arrayWrap.intVector(patterns)
  141. return self.container.nonzero(self, feature, cpatterns)
  142. def commonFeatures(self, pattern1, pattern2) :
  143. return [self.featureKeyDict[featureKey] for featureKey in
  144. self.container.commonFeatures(self, pattern1, pattern2)]
  145. def normalize(self, norm=2) :
  146. norm = int(norm)
  147. if norm not in [1,2] :
  148. raise ValueError, 'bad value for norm'
  149. self.container.normalize(self, norm)
  150. class VectorDataSet (BaseCVectorDataSet, cvectordataset.VectorDataSet) :
  151. def __init__(self, arg = None, **args):
  152. BaseCVectorDataSet.__init__(self)
  153. BaseVectorDataSet.__init__(self, arg, **args)
  154. def addPattern(self, x, i) :
  155. if type(x) == type(numpy.array(1)) or type(x) == type([]) :
  156. values = arrayWrap.doubleVector(x)
  157. else:
  158. raise TypeError, "data vectors must be list or array"
  159. self.container.addPattern(self, values)
  160. def updateFeatureDict(self, arg = None) :
  161. if arg.__class__ == self.__class__ :
  162. # features were extended with those in another dataset
  163. other = arg
  164. self.featureID.extend(other.featureID)
  165. elif type(arg) == list :
  166. print 'recalculating feature ID'
  167. #features were eliminated:
  168. eliminated = misc.list2dict(arg)
  169. self.featureID = [self.featureID[i] for i in range(len(self.featureID))
  170. if i not in eliminated]
  171. elif type(arg) == type(1) or type(arg) == type('') :
  172. # a feature was added
  173. id = arg
  174. self.featureID.append(id)
  175. self.featureDict[id] = self.numFeatures - 1
  176. return
  177. self.featureDict = {}
  178. for i in range(len(self.featureID)) :
  179. self.featureDict[self.featureID[i]] = i
  180. class SparseDataSet (BaseCVectorDataSet, csparsedataset.SparseDataSet) :
  181. def __init__(self, arg = None, **args):
  182. BaseCVectorDataSet.__init__(self)
  183. BaseVectorDataSet.__init__(self, arg, **args)
  184. def updateFeatureDict(self, arg = None) :
  185. if arg.__class__ == self.__class__ :
  186. other = arg
  187. self.featureID.extend(other.featureID)
  188. self.featureID.sort(cmp = lambda x,y : cmp(hash(x), hash(y)))
  189. elif type(arg) == list :
  190. #features were eliminated:
  191. eliminated = misc.list2dict(arg)
  192. self.featureID = [self.featureID[i] for i in range(len(self.featureID))
  193. if i not in eliminated]
  194. elif type(arg) == type(1) or type(arg) == type('') :
  195. # a feature was added:
  196. id = arg
  197. self.featureID.append(id)
  198. self.featureID.sort(cmp = lambda x,y : cmp(hash(x), hash(y)))
  199. self.featureDict = {}
  200. self.featureKeyDict = {}
  201. for i in range(len(self.featureID)) :
  202. self.featureDict[self.featureID[i]] = i
  203. self.featureKeyDict[hash(self.featureID[i])] = i