PageRenderTime 45ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/soggle/features.py

https://bitbucket.org/dnouri/soggle
Python | 316 lines | 301 code | 13 blank | 2 comment | 2 complexity | 8b7b1a5636cf0f740faaec69d392ea6e MD5 | raw file
import re

import dateutil
import dateutil.parser
import nltk
import numpy as np
import pandas
from scipy import sparse
from sklearn.base import BaseEstimator
from sklearn.utils import shuffle

from .introspect import print_stats
from .util import cached
from .util import timed
# Column converters passed to pandas when loading the CSV: the two date
# columns are parsed into datetime objects via dateutil.
# NOTE(review): this relies on ``dateutil.parser`` being importable as an
# attribute of ``dateutil`` — confirm the submodule is imported explicitly.
df_converters = {
    'PostCreationDate': dateutil.parser.parse,
    'OwnerCreationDate': dateutil.parser.parse,
}
  16. def read_dataframe(filename, converters=df_converters):
  17. with timed("Reading dataframe from %s ..." % filename):
  18. df = pandas.io.parsers.read_csv(filename, converters=df_converters)
  19. return df
  20. class Dataset(object):
  21. classes = {
  22. 'not a real question': 0,
  23. 'not constructive': 1,
  24. 'off topic': 2,
  25. 'open': 3,
  26. 'too localized': 4,
  27. }
  28. n_classes = len(set(classes.values()))
  29. def __init__(self, filename, truncate_start=0.0, truncate_end=1.0):
  30. df = read_dataframe(filename) # load directly with numpy?
  31. truncate_start = df.shape[0] * truncate_start
  32. truncate_end = df.shape[0] * truncate_end
  33. df = df[truncate_start:truncate_end]
  34. print_stats(df)
  35. if "OpenStatus" in df:
  36. self.target = np.array(df["OpenStatus"].map(self.classes))
  37. del df['OpenStatus']
  38. del df['PostClosedDate']
  39. self.data = np.array(df)
  40. # Convert nan tags to ''
  41. self.data[:, 8:13][self.data[:, 8:13] == np.nan] = ''
  42. @property
  43. def categories(self):
  44. items = sorted(self.classes.items(), key=lambda p: p[1])
  45. return [p[0] for p in items]
  46. def shuffle(self, random_state=43):
  47. self.data, self.target = shuffle(
  48. self.data, self.target, random_state=random_state)
  49. class TextExtractor(BaseEstimator):
  50. def __init__(
  51. self,
  52. include_title=2,
  53. include_body=True,
  54. include_tags=1,
  55. replace_code=False,
  56. ):
  57. self.include_title = include_title
  58. self.include_body = include_body
  59. self.include_tags = include_tags
  60. self.replace_code = replace_code
  61. def _get_param_names(self):
  62. return (
  63. 'include_title',
  64. 'include_body',
  65. 'include_tags',
  66. 'replace_code',
  67. )
  68. @staticmethod
  69. def _replace_code(text):
  70. # XXX turn into regex replace
  71. result = ''
  72. for line in text.splitlines(True):
  73. if line.startswith(' ') or line.startswith('\t'):
  74. result += 'CODE\n'
  75. else:
  76. result += line
  77. return result
  78. def fit(self, X, y=None):
  79. return self
  80. def transform(self, X):
  81. text = ''
  82. if self.include_title:
  83. title = X[:, 6]
  84. text += (title + '\n') * self.include_title
  85. if self.include_body:
  86. body = X[:, 7]
  87. if self.replace_code:
  88. body = np.array([self._replace_code(b) for b in body],
  89. dtype=object)
  90. text += body + '\n'
  91. if self.include_tags:
  92. for num in range(0, 5):
  93. text += (X[:, 8 + num] + ' ') * self.include_tags
  94. return text
# Blatantly stolen from https://github.com/amueller/kaggle_insults
  96. class DensifyTransformer(BaseEstimator):
  97. def fit(self, X, y=None):
  98. return self
  99. def transform(self, X):
  100. if sparse.issparse(X):
  101. X = X.toarray()
  102. return X
# Blatantly stolen from https://github.com/amueller/kaggle_insults
  104. class FeatureStacker(BaseEstimator):
  105. """Stacks several transformer objects to yield concatenated features.
  106. Similar to pipeline, a list of tuples ``(name, estimator)`` is passed
  107. to the constructor.
  108. """
  109. def __init__(self, transformer_list, verbose=0):
  110. self.transformer_list = transformer_list
  111. self.verbose = verbose
  112. def get_feature_names(self):
  113. pass
  114. def fit(self, X, y=None):
  115. for name, trans in self.transformer_list:
  116. trans.fit(X, y)
  117. return self
  118. def transform(self, X):
  119. features = []
  120. for name, trans in self.transformer_list:
  121. features.append(trans.transform(X))
  122. issparse = [sparse.issparse(f) for f in features]
  123. if np.any(issparse):
  124. if len(features) > 1:
  125. features = sparse.hstack(features).tocsr()
  126. else:
  127. features = features[0].tocsr()
  128. else:
  129. features = np.hstack(features)
  130. if self.verbose:
  131. print "[FeatureStacker] Collected %s features." % (features.shape,)
  132. return features
  133. def get_params(self, deep=True):
  134. if not deep:
  135. return super(FeatureStacker, self).get_params(deep=False)
  136. else:
  137. out = dict(self.transformer_list)
  138. for name, trans in self.transformer_list:
  139. for key, value in trans.get_params(deep=True).iteritems():
  140. out['%s__%s' % (name, key)] = value
  141. return out
  142. def _transform_cache_key(self, X):
  143. return ','.join([
  144. str(X[:20]),
  145. str(X[-20:]),
  146. str(X.shape),
  147. str(sorted(self.get_params().items())),
  148. ])
  149. def multiply_features(X, names=None):
  150. new_features = []
  151. new_names = []
  152. for i in range(X.shape[1] - 1):
  153. for j in range(i + 1, X.shape[1]):
  154. new_features.append(X[:, i] * X[:, j])
  155. if names is not None:
  156. new_names.append('%s X %s' % (names[i], names[j]))
  157. new_features = np.vstack(new_features).T
  158. if new_names:
  159. new_features = new_features, new_names
  160. return new_features
  161. class TextFeatures(BaseEstimator):
  162. """Features from the posts' texts. Does not include tf-idf.
  163. """
  164. token_pattern = re.compile(ur"\b\w\w+\b")
  165. feature_names = np.array([
  166. 'n_words',
  167. 'n_chars',
  168. 'n_words_title',
  169. 'n_chars_title',
  170. 'n_tags',
  171. 'question2',
  172. 'excla2',
  173. 'paragraphs',
  174. 'urls',
  175. ])
  176. def get_feature_names(self):
  177. return self.feature_names
  178. def fit(self, X, y=None):
  179. return self
  180. # Beware: the cached decorator potentially hides any changes you
  181. # make inside this method.
  182. @cached(_transform_cache_key)
  183. def transform(self, X):
  184. documents = TextExtractor().transform(X)
  185. titles = X[:, 6]
  186. word_lists = [
  187. np.array(self.token_pattern.findall(d)) for d in documents]
  188. n_words = np.array([len(w) for w in word_lists], dtype=float)
  189. n_chars = [len(d) for d in documents]
  190. n_words_title = [len(t.split()) for t in titles]
  191. n_chars_title = [len(t) for t in titles]
  192. n_tags = [len(d[d.rfind('\n'):].split()) for d in documents]
  193. question2 = [c.count("??") for c in documents]
  194. excla2 = [c.count("!!") for c in documents]
  195. paragraphs = [c.count("\n\n") for c in documents]
  196. urls = [c.count("://") for c in documents]
  197. features = np.array([
  198. n_words,
  199. n_chars,
  200. n_words_title,
  201. n_chars_title,
  202. n_tags,
  203. question2,
  204. excla2,
  205. paragraphs,
  206. urls,
  207. ]).T
  208. return np.hstack([
  209. features,
  210. (features + 0.1) ** -1.0,
  211. ])
  212. class LanguageFeatures(BaseEstimator):
  213. features_names = np.array([
  214. ])
  215. def get_feature_names(self):
  216. return self.feature_names
  217. def fit(self, X, y=None):
  218. return self
  219. def transform(self, X):
  220. titles = X[:, 6]
  221. tagged = []
  222. for title in titles:
  223. tokens = nltk.word_tokenize(title)
  224. pos_tagged = nltk.pos_tag(tokens)
  225. tagged.append(' '.join([t[1] for t in pos_tagged]))
  226. return np.array(tagged)
  227. class PostFeatures(BaseEstimator):
  228. """Features extracted from post metadata, such as dates.
  229. """
  230. features_names = np.array([
  231. 'user_age',
  232. 'reputation_at_post_creation',
  233. 'owner_undeleted_answer_count_at_post_time',
  234. ])
  235. def get_feature_names(self):
  236. return self.feature_names
  237. def fit(self, X, y=None):
  238. return self
  239. def transform(self, X):
  240. post_creation_date = X[:, 1]
  241. owner_creation_date = X[:, 3]
  242. seconds = np.vectorize(lambda x: x.seconds)
  243. user_age = seconds(post_creation_date - owner_creation_date)
  244. reputation_at_post_creation = X[:, 4]
  245. owner_undeleted_answer_count_at_post_time = X[:, 5]
  246. features = np.array([
  247. user_age,
  248. reputation_at_post_creation.astype('float'),
  249. owner_undeleted_answer_count_at_post_time.astype('float'),
  250. ]).T
  251. return np.hstack([features, (features + 0.1) ** -1.0])