/soggle/features.py
import dateutil.parser
import re

import nltk
import numpy as np
import pandas
from scipy import sparse
from sklearn.base import BaseEstimator
from sklearn.utils import shuffle

from .introspect import print_stats
from .util import cached
from .util import timed


df_converters = {
    'PostCreationDate': dateutil.parser.parse,
    'OwnerCreationDate': dateutil.parser.parse,
}


def read_dataframe(filename, converters=df_converters):
    with timed("Reading dataframe from %s ..." % filename):
        # Honour the `converters` argument instead of always using the
        # module-level default.
        df = pandas.read_csv(filename, converters=converters)
    return df


class Dataset(object):

    classes = {
        'not a real question': 0,
        'not constructive': 1,
        'off topic': 2,
        'open': 3,
        'too localized': 4,
    }
    n_classes = len(set(classes.values()))

    def __init__(self, filename, truncate_start=0.0, truncate_end=1.0):
        df = read_dataframe(filename)  # load directly with numpy?
        # truncate_start/truncate_end are fractions of the number of rows.
        truncate_start = int(df.shape[0] * truncate_start)
        truncate_end = int(df.shape[0] * truncate_end)
        df = df[truncate_start:truncate_end]
        print_stats(df)
        if "OpenStatus" in df:
            self.target = np.array(df["OpenStatus"].map(self.classes))
            del df['OpenStatus']
            del df['PostClosedDate']
        self.data = np.array(df)
        # Convert NaN tags (columns 8-12) to ''.  Comparing with `== np.nan`
        # is always False, so use pandas.isnull instead.
        tags = self.data[:, 8:13]
        tags[pandas.isnull(tags)] = ''

    @property
    def categories(self):
        items = sorted(self.classes.items(), key=lambda p: p[1])
        return [p[0] for p in items]

    def shuffle(self, random_state=43):
        self.data, self.target = shuffle(
            self.data, self.target, random_state=random_state)
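

# A minimal usage sketch for Dataset.  The CSV path below is hypothetical and
# the file is assumed to follow the Stack Overflow closed-question layout this
# module expects (OpenStatus present only in training data):
#
#     dataset = Dataset("train-sample.csv", truncate_start=0.0, truncate_end=0.5)
#     dataset.shuffle()
#     X, y = dataset.data, dataset.target
#     print(dataset.categories)  # class names ordered by label index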


class TextExtractor(BaseEstimator):

    def __init__(
        self,
        include_title=2,
        include_body=True,
        include_tags=1,
        replace_code=False,
    ):
        self.include_title = include_title
        self.include_body = include_body
        self.include_tags = include_tags
        self.replace_code = replace_code

    def _get_param_names(self):
        return (
            'include_title',
            'include_body',
            'include_tags',
            'replace_code',
        )

    @staticmethod
    def _replace_code(text):
        # XXX turn into regex replace
        result = ''
        for line in text.splitlines(True):
            if line.startswith(' ') or line.startswith('\t'):
                result += 'CODE\n'
            else:
                result += line
        return result

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Column 6 is the title, 7 the body, 8-12 the tags; include_title and
        # include_tags are integer repeat counts, not booleans.
        text = ''
        if self.include_title:
            title = X[:, 6]
            text += (title + '\n') * self.include_title
        if self.include_body:
            body = X[:, 7]
            if self.replace_code:
                body = np.array([self._replace_code(b) for b in body],
                                dtype=object)
            text += body + '\n'
        if self.include_tags:
            for num in range(0, 5):
                text += (X[:, 8 + num] + ' ') * self.include_tags
        return text
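

# A hedged sketch of TextExtractor on one synthetic row.  Only columns 6
# (title), 7 (body) and 8-12 (tags) matter here; the leading columns are
# padding and every value is made up for illustration:
#
#     row = [''] * 6 + ["A title?", "    some code\nAnd a body.",
#                       "python", "pandas", "", "", ""]
#     X = np.array([row], dtype=object)
#     texts = TextExtractor(include_title=1, replace_code=True).transform(X)
#     # texts[0] holds the title once, the body with indented lines replaced
#     # by 'CODE', and the tag words appended.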


# Blatantly stolen from https://github.com/amueller/kaggle_insults
class DensifyTransformer(BaseEstimator):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if sparse.issparse(X):
            X = X.toarray()
        return X


# Blatantly stolen from https://github.com/amueller/kaggle_insults
class FeatureStacker(BaseEstimator):
    """Stacks several transformer objects to yield concatenated features.

    Similar to pipeline, a list of tuples ``(name, estimator)`` is passed
    to the constructor.
    """

    def __init__(self, transformer_list, verbose=0):
        self.transformer_list = transformer_list
        self.verbose = verbose

    def get_feature_names(self):
        pass

    def fit(self, X, y=None):
        for name, trans in self.transformer_list:
            trans.fit(X, y)
        return self

    def transform(self, X):
        features = []
        for name, trans in self.transformer_list:
            features.append(trans.transform(X))
        issparse = [sparse.issparse(f) for f in features]
        if np.any(issparse):
            if len(features) > 1:
                features = sparse.hstack(features).tocsr()
            else:
                features = features[0].tocsr()
        else:
            features = np.hstack(features)
        if self.verbose:
            print("[FeatureStacker] Collected %s features." % (features.shape,))
        return features

    def get_params(self, deep=True):
        if not deep:
            return super(FeatureStacker, self).get_params(deep=False)
        else:
            out = dict(self.transformer_list)
            for name, trans in self.transformer_list:
                for key, value in trans.get_params(deep=True).items():
                    out['%s__%s' % (name, key)] = value
            return out
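

# A hedged usage sketch for FeatureStacker, combining two of the transformers
# defined later in this module; get_params above exposes nested transformer
# parameters under "name__param" keys, so grid search over them is possible:
#
#     union = FeatureStacker([
#         ('counts', TextFeatures()),
#         ('post', PostFeatures()),
#     ], verbose=1)
#     stacked = union.fit(X, y).transform(X)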


# Cache-key helper for the `cached` decorator used by TextFeatures.transform
# below: the key combines a sample of the rows, the input shape and the
# estimator's parameters.
def _transform_cache_key(self, X):
    return ','.join([
        str(X[:20]),
        str(X[-20:]),
        str(X.shape),
        str(sorted(self.get_params().items())),
    ])


def multiply_features(X, names=None):
    new_features = []
    new_names = []
    for i in range(X.shape[1] - 1):
        for j in range(i + 1, X.shape[1]):
            new_features.append(X[:, i] * X[:, j])
            if names is not None:
                new_names.append('%s X %s' % (names[i], names[j]))
    new_features = np.vstack(new_features).T
    if new_names:
        new_features = new_features, new_names
    return new_features
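

# multiply_features builds every pairwise product of the input columns, e.g.:
#
#     X = np.array([[1.0, 2.0, 3.0],
#                   [4.0, 5.0, 6.0]])
#     multiply_features(X)  # shape (2, 3): columns 0*1, 0*2, 1*2
#     multiply_features(X, names=['a', 'b', 'c'])
#     # -> (features, ['a X b', 'a X c', 'b X c'])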


class TextFeatures(BaseEstimator):
    """Features from the posts' texts. Does not include tf-idf."""

    token_pattern = re.compile(r"\b\w\w+\b")
    feature_names = np.array([
        'n_words',
        'n_chars',
        'n_words_title',
        'n_chars_title',
        'n_tags',
        'question2',
        'excla2',
        'paragraphs',
        'urls',
    ])

    def get_feature_names(self):
        return self.feature_names

    def fit(self, X, y=None):
        return self

    # Beware: the cached decorator potentially hides any changes you
    # make inside this method.
    @cached(_transform_cache_key)
    def transform(self, X):
        documents = TextExtractor().transform(X)
        titles = X[:, 6]
        word_lists = [
            np.array(self.token_pattern.findall(d)) for d in documents]
        n_words = np.array([len(w) for w in word_lists], dtype=float)
        n_chars = [len(d) for d in documents]
        n_words_title = [len(t.split()) for t in titles]
        n_chars_title = [len(t) for t in titles]
        n_tags = [len(d[d.rfind('\n'):].split()) for d in documents]
        question2 = [c.count("??") for c in documents]
        excla2 = [c.count("!!") for c in documents]
        paragraphs = [c.count("\n\n") for c in documents]
        urls = [c.count("://") for c in documents]
        features = np.array([
            n_words,
            n_chars,
            n_words_title,
            n_chars_title,
            n_tags,
            question2,
            excla2,
            paragraphs,
            urls,
        ]).T
        # Append the smoothed reciprocal of every count as an extra feature.
        return np.hstack([
            features,
            (features + 0.1) ** -1.0,
        ])
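

# TextFeatures yields the 9 counts listed in feature_names plus their smoothed
# reciprocals (18 columns in total).  A hedged sketch, reusing a synthetic
# object-dtype array X such as the one in the TextExtractor example above:
#
#     counts = TextFeatures().fit(X).transform(X)
#     counts.shape  # (n_samples, 18)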


class LanguageFeatures(BaseEstimator):

    feature_names = np.array([])

    def get_feature_names(self):
        return self.feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Replace each title with the sequence of its part-of-speech tags.
        titles = X[:, 6]
        tagged = []
        for title in titles:
            tokens = nltk.word_tokenize(title)
            pos_tagged = nltk.pos_tag(tokens)
            tagged.append(' '.join([t[1] for t in pos_tagged]))
        return np.array(tagged)
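

# LanguageFeatures turns each title into a space-separated string of POS tags,
# which can then be fed to a bag-of-words vectorizer.  It assumes nltk's
# tokenizer and tagger models are available, e.g.:
#
#     nltk.download('punkt')
#     nltk.download('averaged_perceptron_tagger')
#     pos_strings = LanguageFeatures().transform(X)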


class PostFeatures(BaseEstimator):
    """Features extracted from post metadata, such as dates."""

    feature_names = np.array([
        'user_age',
        'reputation_at_post_creation',
        'owner_undeleted_answer_count_at_post_time',
    ])

    def get_feature_names(self):
        return self.feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        post_creation_date = X[:, 1]
        owner_creation_date = X[:, 3]
        # Age of the account at post time, in seconds.  total_seconds() is
        # used because timedelta.seconds only holds the sub-day remainder.
        seconds = np.vectorize(lambda x: x.total_seconds())
        user_age = seconds(post_creation_date - owner_creation_date)
        reputation_at_post_creation = X[:, 4]
        owner_undeleted_answer_count_at_post_time = X[:, 5]
        features = np.array([
            user_age,
            reputation_at_post_creation.astype('float'),
            owner_undeleted_answer_count_at_post_time.astype('float'),
        ]).T
        return np.hstack([features, (features + 0.1) ** -1.0])
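

# End-to-end sketch tying the pieces together.  Every path, transformer choice
# and hyperparameter below is illustrative only, not taken from this file:
#
#     from sklearn.linear_model import LogisticRegression
#     from sklearn.pipeline import Pipeline
#
#     dataset = Dataset("train-sample.csv")
#     dataset.shuffle()
#     clf = Pipeline([
#         ('features', FeatureStacker([
#             ('counts', TextFeatures()),
#             ('post', PostFeatures()),
#         ])),
#         ('densify', DensifyTransformer()),
#         ('logreg', LogisticRegression()),
#     ])
#     clf.fit(dataset.data, dataset.target)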