PageRenderTime 53ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/stumble-evergreen/src/algo/LibFMFeatures.py

https://gitlab.com/xbsd/kaggle-1
Python | 191 lines | 122 code | 27 blank | 42 comment | 8 complexity | 437ca67010d84786583a3e2ea12b6eb6 MD5 | raw file
  1. import string
  2. from string import lower
  3. import numpy as np
  4. from sklearn.feature_extraction import DictVectorizer
  5. from sklearn.feature_extraction.text import TfidfVectorizer
  6. from sklearn import preprocessing
  7. from sklearn_pandas import DataFrameMapper
  8. from sklearn.datasets.svmlight_format import dump_svmlight_file
  9. from algo.base_mode import BaseModel
  10. from sklearn_pandas import DataFrameMapper
  11. from common.utils import FILE_SEPERATOR
  12. from sklearn import pipeline
  13. from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Relative path that the training matrix is dumped to in SVMLight format
# (passed to dump_svmlight_file in LibFMFeatures.transform_features).
output_train_libsvm_file = "../data/train.libfm"
  15. def get_text_transformer():
  16. return TfidfVectorizer(min_df=3, max_features=None,
  17. strip_accents='unicode',
  18. analyzer='word', token_pattern=r'\w{1,}',
  19. ngram_range=(1, 2), use_idf=1, smooth_idf=1,
  20. sublinear_tf=1, norm='l2')
  21. #def get_dict_pipeline():
  22. # return pipeline.Pipeline(
  23. # [('vectorizer', CountVectorizer()), ('transformer', TfidfTransformer()), ])
  24. def get_dict_pipeline():
  25. return DictVectorizer()
  26. def get_norm_scaler():
  27. return preprocessing.StandardScaler()
  28. class LibFMFeatures(BaseModel):
  29. def __init__(self, stumble_data):
  30. super(LibFMFeatures, self).__init__(stumble_data)
  31. self.feat_head = self.get_feature_list()
  32. def get_tranform_features(self):
  33. return self.transform_features()
  34. def get_feature_list(self):
  35. # ['field_name', 'ignored_function_name', 'pipeline_fit', 'is_dict', 'is_enabled']
  36. feat_head = [
  37. # ['label', 'get_label', None, True],
  38. # ['alchemy_category', None, get_dict_pipeline(), True],
  39. # ['urlid', None, get_dict_pipeline(), True],
  40. # ['alchemy_category_score', None,get_norm_scaler(), True],
  41. # ['avglinksize', None, get_norm_scaler(), True],
  42. # ['commonlinkratio_1', None, get_norm_scaler(), True],
  43. # ['commonlinkratio_2', None, get_norm_scaler(), True],
  44. # ['commonlinkratio_3', None, get_norm_scaler(), True],
  45. # ['commonlinkratio_4', None, get_norm_scaler(), True],
  46. # ['news_front_page', None, get_dict_pipeline(), True],
  47. # ['framebased', None, get_dict_pipeline(),True], #
  48. # ['is_news', None, get_dict_pipeline(),True],
  49. # ['hasDomainLink', None, get_dict_pipeline(),True],
  50. # ['lengthyLinkDomain', None, get_dict_pipeline(), True],
  51. # ['non_markup_alphanum_characters', None, get_norm_scaler(), True],
  52. # ['numberOfLinks', None, get_norm_scaler(), True],
  53. # ['embed_ratio', None, get_norm_scaler(), True],
  54. # ['frameTagRatio', None, get_norm_scaler(), True],
  55. # ['html_ratio', None, get_norm_scaler(), True],
  56. # ['linkwordscore', None, get_norm_scaler(), True],
  57. # ['numwords_in_url', None, get_norm_scaler(), True],
  58. # ['parametrizedLinkRatio', None, get_norm_scaler(), True],
  59. # ['spelling_errors_ratio', None, get_norm_scaler(), True],
  60. # ['compression_ratio', None, get_norm_scaler(), True],
  61. # ['image_ratio', None, get_norm_scaler(), True],
  62. ['boilerplate', None, get_text_transformer(), True],
  63. # ['boilerplate_body', None, get_text_transformer(), True],
  64. # ['boilerplate_title', None, get_text_transformer(), True],
  65. # ['boilerplate_url', None, get_text_transformer(), True],
  66. # ['boilerplate_body_len', self.get_boilerplate_body_len, get_norm_scaler(), True],
  67. # ['boilerplate_url_len', self.get_boilerplate_url_len, get_norm_scaler(), True],
  68. # ['boilerplate_title_len', self.get_boilerplate_title_len, get_norm_scaler(), True],
  69. # ['boilerplate_title_num_word', self.get_boilerplate_title_num_word, get_norm_scaler(), True],
  70. # ['boilerplate_body_num_word', self.get_boilerplate_body_num_word, get_norm_scaler(), True],
  71. # ['boilerplate_url_num_word', self.get_boilerplate_url_num_word, get_norm_scaler(), True],
  72. ]
  73. return feat_head
  74. def transform_features(self):
  75. totransform = []
  76. for index, item in enumerate(self.feat_head):
  77. field = item[0]
  78. func_name = item[1]
  79. transform = item[2]
  80. is_enable = item[3]
  81. if is_enable:
  82. if not field in self.stumble_data.get_features():
  83. print 'field not in feature..generating:' + field
  84. func_name(field)
  85. totransform.append((field, transform))
  86. if len(totransform):
  87. mapper = DataFrameMapper(totransform)
  88. mapper.fit(self.stumble_data.all_pd[:self.stumble_data.len_train])
  89. #
  90. X_transformed_train = mapper.transform(
  91. self.stumble_data.all_pd[:self.stumble_data.len_train])
  92. X_transformed_test = mapper.transform(
  93. self.stumble_data.all_pd[self.stumble_data.len_train:])
  94. for index, item in enumerate(self.feat_head):
  95. field = item[0]
  96. is_enable = item[3]
  97. if is_enable and field in self.stumble_data.get_features():
  98. del self.stumble_data.all_pd[field]
  99. import pdb
  100. pdb.set_trace()
  101. from scipy.sparse import hstack
  102. X_train = X_transformed_train
  103. X_test = X_transformed_test
  104. y_train = self.stumble_data.all_pd[:self.stumble_data.len_train]['label']
  105. # print 'Dumping train in SVMLight.'
  106. dump_svmlight_file(X_train, y_train, output_train_libsvm_file )
  107. # print 'Dumping test in SVMLight.'
  108. # dump_svmlight_file(X_test, pred, output_test_libsvm_file )
  109. else:
  110. X_train = X_train.as_matrix()
  111. X_test = X_test.as_matrix()
  112. return X_train, y_train, X_test
  113. def get_boilerplate_url_len(self, key):
  114. apply_key = 'boilerplate_url'
  115. X= self.stumble_data.all_pd
  116. X[key] = X[apply_key].fillna("").apply(
  117. lambda x: len(x) if len(x) else None)
  118. X[key] = X[key].fillna(X[key].mean())
  119. def get_boilerplate_title_len(self, key):
  120. apply_key = 'boilerplate_title'
  121. X= self.stumble_data.all_pd
  122. X[key] = X[apply_key].fillna("").apply(
  123. lambda x: len(x) if len(x) else None)
  124. X[key] = X[key].fillna(X[key].mean())
  125. def get_boilerplate_body_len(self, key):
  126. apply_key = 'boilerplate_body'
  127. X= self.stumble_data.all_pd
  128. X[key] = X[apply_key].fillna("").apply(
  129. lambda x: len(x) if len(x) else None)
  130. X[key] = X[key].fillna(X[key].mean())
  131. def get_boilerplate_body_num_word(self, key):
  132. apply_key = 'boilerplate_body'
  133. X= self.stumble_data.all_pd
  134. X[key] = X[apply_key].map(lambda x: len(x.split()) if not isinstance(x, float) else False)
  135. def get_boilerplate_title_num_word(self, key):
  136. apply_key = 'boilerplate_title'
  137. X= self.stumble_data.all_pd
  138. X[key] = X[apply_key].map(lambda x: len(x.split()) if not isinstance(x, float) else False)
  139. def get_boilerplate_url_num_word(self, key):
  140. apply_key = 'boilerplate_url'
  141. X= self.stumble_data.all_pd
  142. X[key] = X[apply_key].map(lambda x: len(x.split()) if not isinstance(x, float) else False)
  143. def get_func_val(self, func_name, page, page_test, page_item, field):
  144. if func_name is not None:
  145. return func_name(self.page, self.page_test, page_item)
  146. else:
  147. return self.get_field_value(self.page, self.page_test, page_item,
  148. field)
  149. def precison(self, val):
  150. return "%.2f" % (val)