/soggle/features.py
import dateutil.parser
import re

import nltk
import numpy as np
import pandas
from scipy import sparse
from sklearn.base import BaseEstimator
from sklearn.utils import shuffle

from .introspect import print_stats
from .util import cached
from .util import timed


df_converters = {
    'PostCreationDate': dateutil.parser.parse,
    'OwnerCreationDate': dateutil.parser.parse,
}


def read_dataframe(filename, converters=df_converters):
    with timed("Reading dataframe from %s ..." % filename):
        # Honour the `converters` argument instead of always using the
        # module-level default.
        df = pandas.read_csv(filename, converters=converters)
    return df


class Dataset(object):

    classes = {
        'not a real question': 0,
        'not constructive': 1,
        'off topic': 2,
        'open': 3,
        'too localized': 4,
    }
    n_classes = len(set(classes.values()))

    def __init__(self, filename, truncate_start=0.0, truncate_end=1.0):
        df = read_dataframe(filename)  # load directly with numpy?
        # truncate_start/truncate_end are fractions of the number of rows.
        truncate_start = int(df.shape[0] * truncate_start)
        truncate_end = int(df.shape[0] * truncate_end)
        df = df[truncate_start:truncate_end]
        print_stats(df)
        if "OpenStatus" in df:
            self.target = np.array(df["OpenStatus"].map(self.classes))
            del df['OpenStatus']
            del df['PostClosedDate']
        self.data = np.array(df)
        # Convert NaN tags (columns 8-12) to ''.  Comparing with `== np.nan`
        # is always False, so use pandas.isnull instead.
        tags = self.data[:, 8:13]
        tags[pandas.isnull(tags)] = ''

    @property
    def categories(self):
        items = sorted(self.classes.items(), key=lambda p: p[1])
        return [p[0] for p in items]

    def shuffle(self, random_state=43):
        self.data, self.target = shuffle(
            self.data, self.target, random_state=random_state)
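

# A minimal usage sketch for Dataset.  The CSV path below is hypothetical and
# the file is assumed to follow the Stack Overflow closed-question layout this
# module expects (OpenStatus present only in training data):
#
#     dataset = Dataset("train-sample.csv", truncate_start=0.0, truncate_end=0.5)
#     dataset.shuffle()
#     X, y = dataset.data, dataset.target
#     print(dataset.categories)  # class names ordered by label index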


class TextExtractor(BaseEstimator):

    def __init__(
        self,
        include_title=2,
        include_body=True,
        include_tags=1,
        replace_code=False,
    ):
        self.include_title = include_title
        self.include_body = include_body
        self.include_tags = include_tags
        self.replace_code = replace_code

    def _get_param_names(self):
        return (
            'include_title',
            'include_body',
            'include_tags',
            'replace_code',
        )

    @staticmethod
    def _replace_code(text):
        # XXX turn into regex replace
        result = ''
        for line in text.splitlines(True):
            if line.startswith(' ') or line.startswith('\t'):
                result += 'CODE\n'
            else:
                result += line
        return result

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Column 6 is the title, 7 the body, 8-12 the tags; include_title and
        # include_tags are integer repeat counts, not booleans.
        text = ''
        if self.include_title:
            title = X[:, 6]
            text += (title + '\n') * self.include_title
        if self.include_body:
            body = X[:, 7]
            if self.replace_code:
                body = np.array([self._replace_code(b) for b in body],
                                dtype=object)
            text += body + '\n'
        if self.include_tags:
            for num in range(0, 5):
                text += (X[:, 8 + num] + ' ') * self.include_tags
        return text
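

# A hedged sketch of TextExtractor on one synthetic row.  Only columns 6
# (title), 7 (body) and 8-12 (tags) matter here; the leading columns are
# padding and every value is made up for illustration:
#
#     row = [''] * 6 + ["A title?", "    some code\nAnd a body.",
#                       "python", "pandas", "", "", ""]
#     X = np.array([row], dtype=object)
#     texts = TextExtractor(include_title=1, replace_code=True).transform(X)
#     # texts[0] holds the title once, the body with indented lines replaced
#     # by 'CODE', and the tag words appended.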


# Blatantly stolen from https://github.com/amueller/kaggle_insults
class DensifyTransformer(BaseEstimator):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if sparse.issparse(X):
            X = X.toarray()
        return X


# Blatantly stolen from https://github.com/amueller/kaggle_insults
class FeatureStacker(BaseEstimator):
    """Stacks several transformer objects to yield concatenated features.

    Similar to pipeline, a list of tuples ``(name, estimator)`` is passed
    to the constructor.
    """

    def __init__(self, transformer_list, verbose=0):
        self.transformer_list = transformer_list
        self.verbose = verbose

    def get_feature_names(self):
        pass

    def fit(self, X, y=None):
        for name, trans in self.transformer_list:
            trans.fit(X, y)
        return self

    def transform(self, X):
        features = []
        for name, trans in self.transformer_list:
            features.append(trans.transform(X))
        issparse = [sparse.issparse(f) for f in features]
        if np.any(issparse):
            if len(features) > 1:
                features = sparse.hstack(features).tocsr()
            else:
                features = features[0].tocsr()
        else:
            features = np.hstack(features)
        if self.verbose:
            print("[FeatureStacker] Collected %s features." % (features.shape,))
        return features

    def get_params(self, deep=True):
        if not deep:
            return super(FeatureStacker, self).get_params(deep=False)
        else:
            out = dict(self.transformer_list)
            for name, trans in self.transformer_list:
                for key, value in trans.get_params(deep=True).items():
                    out['%s__%s' % (name, key)] = value
            return out
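

# A hedged usage sketch for FeatureStacker, combining two of the transformers
# defined later in this module; get_params above exposes nested transformer
# parameters under "name__param" keys, so grid search over them is possible:
#
#     union = FeatureStacker([
#         ('counts', TextFeatures()),
#         ('post', PostFeatures()),
#     ], verbose=1)
#     stacked = union.fit(X, y).transform(X)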


# Cache-key helper for the `cached` decorator used by TextFeatures.transform
# below: the key combines a sample of the rows, the input shape and the
# estimator's parameters.
def _transform_cache_key(self, X):
    return ','.join([
        str(X[:20]),
        str(X[-20:]),
        str(X.shape),
        str(sorted(self.get_params().items())),
    ])


def multiply_features(X, names=None):
    new_features = []
    new_names = []
    for i in range(X.shape[1] - 1):
        for j in range(i + 1, X.shape[1]):
            new_features.append(X[:, i] * X[:, j])
            if names is not None:
                new_names.append('%s X %s' % (names[i], names[j]))
    new_features = np.vstack(new_features).T
    if new_names:
        new_features = new_features, new_names
    return new_features
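

# multiply_features builds every pairwise product of the input columns, e.g.:
#
#     X = np.array([[1.0, 2.0, 3.0],
#                   [4.0, 5.0, 6.0]])
#     multiply_features(X)  # shape (2, 3): columns 0*1, 0*2, 1*2
#     multiply_features(X, names=['a', 'b', 'c'])
#     # -> (features, ['a X b', 'a X c', 'b X c'])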


class TextFeatures(BaseEstimator):
    """Features from the posts' texts. Does not include tf-idf."""

    token_pattern = re.compile(r"\b\w\w+\b")
    feature_names = np.array([
        'n_words',
        'n_chars',
        'n_words_title',
        'n_chars_title',
        'n_tags',
        'question2',
        'excla2',
        'paragraphs',
        'urls',
    ])

    def get_feature_names(self):
        return self.feature_names

    def fit(self, X, y=None):
        return self

    # Beware: the cached decorator potentially hides any changes you
    # make inside this method.
    @cached(_transform_cache_key)
    def transform(self, X):
        documents = TextExtractor().transform(X)
        titles = X[:, 6]
        word_lists = [
            np.array(self.token_pattern.findall(d)) for d in documents]
        n_words = np.array([len(w) for w in word_lists], dtype=float)
        n_chars = [len(d) for d in documents]
        n_words_title = [len(t.split()) for t in titles]
        n_chars_title = [len(t) for t in titles]
        n_tags = [len(d[d.rfind('\n'):].split()) for d in documents]
        question2 = [c.count("??") for c in documents]
        excla2 = [c.count("!!") for c in documents]
        paragraphs = [c.count("\n\n") for c in documents]
        urls = [c.count("://") for c in documents]
        features = np.array([
            n_words,
            n_chars,
            n_words_title,
            n_chars_title,
            n_tags,
            question2,
            excla2,
            paragraphs,
            urls,
        ]).T
        # Append the smoothed reciprocal of every count as an extra feature.
        return np.hstack([
            features,
            (features + 0.1) ** -1.0,
        ])
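

# TextFeatures yields the 9 counts listed in feature_names plus their smoothed
# reciprocals (18 columns in total).  A hedged sketch, reusing a synthetic
# object-dtype array X such as the one in the TextExtractor example above:
#
#     counts = TextFeatures().fit(X).transform(X)
#     counts.shape  # (n_samples, 18)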


class LanguageFeatures(BaseEstimator):

    feature_names = np.array([])

    def get_feature_names(self):
        return self.feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Replace each title with the sequence of its part-of-speech tags.
        titles = X[:, 6]
        tagged = []
        for title in titles:
            tokens = nltk.word_tokenize(title)
            pos_tagged = nltk.pos_tag(tokens)
            tagged.append(' '.join([t[1] for t in pos_tagged]))
        return np.array(tagged)
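

# LanguageFeatures turns each title into a space-separated string of POS tags,
# which can then be fed to a bag-of-words vectorizer.  It assumes nltk's
# tokenizer and tagger models are available, e.g.:
#
#     nltk.download('punkt')
#     nltk.download('averaged_perceptron_tagger')
#     pos_strings = LanguageFeatures().transform(X)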


class PostFeatures(BaseEstimator):
    """Features extracted from post metadata, such as dates."""

    feature_names = np.array([
        'user_age',
        'reputation_at_post_creation',
        'owner_undeleted_answer_count_at_post_time',
    ])

    def get_feature_names(self):
        return self.feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        post_creation_date = X[:, 1]
        owner_creation_date = X[:, 3]
        # Age of the account at post time, in seconds.  total_seconds() is
        # used because timedelta.seconds only holds the sub-day remainder.
        seconds = np.vectorize(lambda x: x.total_seconds())
        user_age = seconds(post_creation_date - owner_creation_date)
        reputation_at_post_creation = X[:, 4]
        owner_undeleted_answer_count_at_post_time = X[:, 5]
        features = np.array([
            user_age,
            reputation_at_post_creation.astype('float'),
            owner_undeleted_answer_count_at_post_time.astype('float'),
        ]).T
        return np.hstack([features, (features + 0.1) ** -1.0])
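

# End-to-end sketch tying the pieces together.  Every path, transformer choice
# and hyperparameter below is illustrative only, not taken from this file:
#
#     from sklearn.linear_model import LogisticRegression
#     from sklearn.pipeline import Pipeline
#
#     dataset = Dataset("train-sample.csv")
#     dataset.shuffle()
#     clf = Pipeline([
#         ('features', FeatureStacker([
#             ('counts', TextFeatures()),
#             ('post', PostFeatures()),
#         ])),
#         ('densify', DensifyTransformer()),
#         ('logreg', LogisticRegression()),
#     ])
#     clf.fit(dataset.data, dataset.target)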