# max_ent.py
# Maximum-entropy (logistic-regression) NER tagger with feature-ablation experiments.
- import sys
- from sklearn.linear_model import LogisticRegression
- from sklearn.feature_extraction import DictVectorizer
- from sklearn.pipeline import Pipeline
- from sklearn.decomposition import TruncatedSVD
- from itertools import combinations
- from copy import deepcopy
-
- #Feature selection.
def features(sentence, index, tags, syns, trths, keys_to_drop=None):
    """Build the feature dict for the token at ``index`` of one sentence.

    Parameters
    ----------
    sentence : list of str
        Tokens of the sentence.
    tags : list of str
        POS tags, parallel to ``sentence``.
    syns : list of str
        Syntactic-chunk tags, parallel to ``sentence``.
    trths : list of str
        Gold NER labels; only the previous token's label is used
        (as the 'previous_class' feature).
    keys_to_drop : iterable of str, optional
        Feature names to remove from the result (ablation experiments).

    Returns
    -------
    dict
        Feature-name -> value mapping for this token.
    """
    word = sentence[index]  # hoist the repeated sentence[index] lookup

    d = {
        'word': word,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': word[0].upper() == word[0],
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': word[0],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': word[1:].lower() != word[1:],
        'pos': tags[index],
        'prev_pos': '' if index == 0 else tags[index - 1],
        'next_pos': '' if index == len(tags) - 1 else tags[index + 1],
        'prev_pos_2': '' if index <= 1 else tags[index - 2],
        'next_pos_2': '' if index >= len(tags) - 2 else tags[index + 2],
        'syn': syns[index],
        'prev_syn': '' if index == 0 else syns[index - 1],
        'next_syn': '' if index == len(syns) - 1 else syns[index + 1],
        'prev_syn_2': '' if index <= 1 else syns[index - 2],
        'next_syn_2': '' if index >= len(syns) - 2 else syns[index + 2],
        'previous_class': '' if index == 0 else trths[index - 1],
    }

    if keys_to_drop is not None:
        for key in keys_to_drop:
            # BUGFIX: plain d.pop(key) raised KeyError for an unknown or
            # duplicated key; tolerate it instead.
            d.pop(key, None)

    return d
-
def get_features_from_file(f, keys_to_drop=None):
    """Read a CoNLL-style file and build per-token feature dicts.

    Expects one token per line with four whitespace-separated columns:
    ``word  POS-tag  syn-tag  NER-tag``. The first line is treated as a
    header and skipped; blank lines are ignored. A sentence ends when a
    token's POS tag is ``'.'``.

    Parameters
    ----------
    f : file-like
        Open handle to the data file.
    keys_to_drop : iterable of str, optional
        Feature names to omit, forwarded to :func:`features`.

    Returns
    -------
    (X_train, y_train, sentences)
        X_train: flat list of feature dicts, one per token;
        y_train: parallel list of NER labels;
        sentences: the same feature dicts grouped per sentence.
    """
    lines = f.readlines()
    del lines[0]  # drop the column-header line

    X_train, y_train, sentences = [], [], []
    sentence, tags, syn_tags, ner_tags = [], [], [], []

    def flush():
        # Emit features for the buffered sentence.
        # BUGFIX: compute features once per token (was computed twice:
        # once for X_train and once for the per-sentence list).
        feats = [features(sentence, i, tags, syn_tags, ner_tags, keys_to_drop)
                 for i in range(len(sentence))]
        X_train.extend(feats)
        y_train.extend(ner_tags)
        sentences.append(feats)

    for line in lines:
        if line == "\n":
            continue

        cols = line.strip().split()
        sentence.append(cols[0])
        tags.append(cols[1])
        syn_tags.append(cols[2])
        ner_tags.append(cols[3])

        if cols[1] == '.':  # sentence boundary
            flush()
            sentence, tags, syn_tags, ner_tags = [], [], [], []

    # BUGFIX: a trailing sentence not terminated by a '.' POS tag was
    # silently dropped; flush it as well.
    if sentence:
        flush()

    return X_train, y_train, sentences
-
-
def main(language="deu", clf_class=LogisticRegression, keys_to_drop=None, ret_score=False):
    """Train an NER classifier and optionally return its test accuracy.

    Trains on ``data/<language>.train`` plus ``data/<language>.testa`` and
    evaluates on ``data/<language>.testb``.

    Parameters
    ----------
    language : str
        Dataset prefix under ``data/`` (e.g. ``"deu"``).
    clf_class : callable
        Zero-argument classifier factory (sklearn estimator class).
    keys_to_drop : iterable of str, optional
        Feature names to omit, forwarded to :func:`get_features_from_file`.
    ret_score : bool
        If True, return the testb accuracy as a float; otherwise return
        the fitted pipeline.
    """
    # BUGFIX: files were opened and never closed; use context managers.
    with open('data/' + language + '.train', 'r') as fh:
        X_train, y_train, _ = get_features_from_file(fh, keys_to_drop)

    with open('data/' + language + '.testa', 'r') as fh:
        addit_X, addit_y, _ = get_features_from_file(fh, keys_to_drop)

    X_train.extend(addit_X)
    y_train.extend(addit_y)

    clf = Pipeline([
        ('vectoriser', DictVectorizer()),
        ('classifier', clf_class()),
    ])

    clf.fit(X_train, y_train)

    # BUGFIX (consistency): testb was loaded without keys_to_drop. It was
    # harmless only because DictVectorizer ignores unseen features at
    # transform time; drop the same keys for consistency.
    with open('data/' + language + '.testb', 'r') as fh:
        X_test, y_test, _ = get_features_from_file(fh, keys_to_drop)

    if ret_score:
        return float(clf.score(X_test, y_test))

    return clf
-
-
-
if __name__ == '__main__':

    # Every candidate feature name, used in the ablation runs below.
    keys = ['is_first', 'is_last', 'is_capitalized', 'is_all_caps', 'is_all_lower',
            'prefix-1', 'prefix-2', 'prefix-3', 'suffix-1', 'suffix-2', 'suffix-3', 'prev_word',
            'next_word', 'has_hyphen', 'is_numeric', 'capitals_inside', 'pos', 'prev_pos',
            'next_pos', 'prev_pos_2', 'next_pos_2', 'syn', 'prev_syn', 'next_syn',
            'prev_syn_2', 'next_syn_2', 'previous_class']

    # Two reference points: word-only (all features dropped) vs. the full set.
    baseline_score = main(keys_to_drop=deepcopy(keys), ret_score=True)
    baseline_best = main(ret_score=True)
    print('baseline', baseline_score, 'all_fets', baseline_best)
    delta = 0.001  # significance margin (currently unused)

    best_keys_key_spef = []  # kept for the commented-out bookkeeping below
    best_keys_key_rem = []

    # Leave-one-out: drop a single feature at a time.
    print('From top')
    for key in keys:
        print(key, main(keys_to_drop=[key], ret_score=True))

    # Keep-one-only: drop everything except a single feature.
    print('From bottom')
    for key in keys:
        dropped = [k for k in keys if k != key]
        print(key, main(keys_to_drop=dropped, ret_score=True))

    # Ablation by feature group.
    print('Ground analysis')
    position = ['is_first', 'is_last', 'prev_word', 'next_word']
    word_fets = ['is_capitalized', 'is_all_caps', 'is_numeric']
    char_fets = ['prefix-1', 'prefix-2', 'prefix-3', 'suffix-1', 'suffix-2', 'suffix-3', 'has_hyphen',
                 'capitals_inside']
    pos_fet = ['pos', 'prev_pos', 'next_pos', 'prev_pos_2', 'next_pos_2']
    syn_fet = ['syn', 'prev_syn', 'next_syn', 'prev_syn_2', 'next_syn_2', 'previous_class']

    for group in [position, word_fets, char_fets, pos_fet, syn_fet]:
        # Run with ONLY this group's features (drop the complement).
        complement = [k for k in keys if k not in group]
        print(group, main(keys_to_drop=complement, ret_score=True))

        # Run WITHOUT this group's features.
        print('Group removed, not run')
        print(group, main(keys_to_drop=group, ret_score=True))

    # Exhaustive subset search — far too slow to run, kept for reference:
    #
    # for i in range(2, len(keys)):
    #     for comb_key in combinations(keys, i):
    #         temp = deepcopy(keys)
    #         for k in comb_key:
    #             temp.remove(k)
    #         score = main(keys_to_drop=temp, ret_score=True)
    #         print(list(comb_key), score)

    # print(best_keys_key_spef)
    # print(best_keys_key_rem)