/utils/data_process.py

https://github.com/YZHANG1270/Aspect-Based-Sentiment-Analysis

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'ZhangYi'

import ast
import itertools

import jieba
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences

# mark NaN as 'OTHERS'
def nan_to_others(df):
    new_cate = []
    new_polarity = []
    # the dataframe must contain the columns ['text', 'category', 'polarity']
    for idx, i in enumerate(df['polarity']):
        if i in ['negative', 'positive', 'neutral', 'conflict']:
            new_cate.append(df['category'][idx])
            new_polarity.append(i)
        else:
            # anything that is not a recognised polarity (e.g. NaN) becomes 'OTHERS'
            new_cate.append('OTHERS')
            new_polarity.append('OTHERS')
    _df = pd.DataFrame(np.array([df['text'], new_cate, new_polarity]).T,
                       columns=['text', 'category', 'polarity'])
    return _df
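
# Example (a minimal sketch; the sample rows are illustrative, not from the repo):
#   df = pd.DataFrame({'text': ['质量很好', '物流太慢'],
#                      'category': ['quality', 'logistics'],
#                      'polarity': ['positive', np.nan]})
#   nan_to_others(df)  # the NaN row gets category and polarity 'OTHERS'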

# tokenize
def seg_words(contents):
    contents_segs = list()
    for content in contents:
        segs = jieba.lcut(content)
        contents_segs.append(" ".join(segs))
    return contents_segs
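
# Example (a minimal sketch; the exact cut depends on jieba's dictionary):
#   seg_words(['我喜欢这把锁'])  # -> ['我 喜欢 这 把 锁']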

# get text vector
def gen_text_vec(tokenizer, cut_corpus_list, maxlen):
    text_vec = tokenizer.texts_to_sequences(cut_corpus_list)
    t_vec = pad_sequences(text_vec, maxlen=maxlen)
    return t_vec
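
# Example (a minimal sketch, assuming a keras Tokenizer fitted on the cut corpus):
#   from keras.preprocessing.text import Tokenizer
#   tokenizer = Tokenizer()
#   tokenizer.fit_on_texts(seg_words(['我喜欢这把锁']))
#   gen_text_vec(tokenizer, seg_words(['我喜欢这把锁']), maxlen=10)
#   # -> int array of shape (1, 10), zero-padded on the left by default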

# category transpose: add one 0/1 indicator column per category
def category_transpose(df, category_list):
    # the dataframe must contain the column 'category'
    for i in category_list:
        indicator = []
        for cate in df['category']:
            if cate == i:
                indicator.append(1)
            else:
                indicator.append(0)
        df[i] = indicator
    return df
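
# Example (a minimal sketch):
#   df = pd.DataFrame({'category': ['quality', 'price', 'quality']})
#   category_transpose(df, ['quality', 'price'])
#   # adds columns 'quality' = [1, 0, 1] and 'price' = [0, 1, 0]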

# load config: aspect_list
def load_aspect_list(path_config):
    # only one param in config: aspect_list (a single dict literal on the first line)
    with open(path_config, "r", encoding='utf-8') as f:
        for i in f:
            category_list = ast.literal_eval(i)['aspect_list']
            break  # only the first line is read; the with block closes the file
    return category_list
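
# Example (a minimal sketch; the aspect names and file name are illustrative):
#   with a config file whose first line is {'aspect_list': ['quality', 'price']}:
#   load_aspect_list('aspect_config.txt')  # -> ['quality', 'price']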

# merge excel
def merge_excel(path_data_dir):
    cmt_l = []
    scr_l = []
    # every dataframe being merged must contain the columns ['comment_content', 'label']
    data_source = ['/2019-04-12_lock_comment_jd_spider_baidu_sentiment.xlsx',
                   '/20190329_train_lock_comments_document_level_with_label.xls',
                   '/all_comments_document_level_without_lock_comments.xls',
                   '/bad_comments_in_forum_mi.com_youpin.xls']
    for i in data_source:
        path_data = path_data_dir + i
        _data = pd.read_excel(path_data)
        cmt_l.append(_data['comment_content'])
        scr_l.append(_data['label'])
    comment = list(itertools.chain.from_iterable(cmt_l))
    score = list(itertools.chain.from_iterable(scr_l))
    data = pd.DataFrame(np.array([comment, score]).T, columns=['comment_content', 'label'])
    return data
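
# Example (a minimal sketch; the directory must contain the four files listed above):
#   data = merge_excel('/path/to/data_dir')
#   list(data.columns)  # -> ['comment_content', 'label']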

# remove row by column with empty value
def remove_empty_row(df, column_name):
    row_to_delete = []
    for idx, i in enumerate(df[column_name]):
        # NaN is truthy (bool(float('nan')) is True), so check for it explicitly
        if pd.isna(i) or not bool(i):
            row_to_delete.append(idx)
    df = df.drop(df.index[row_to_delete])
    return df.reset_index(drop=True)
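
# Example (a minimal sketch):
#   df = pd.DataFrame({'comment_content': ['好评', '', np.nan, '差评']})
#   remove_empty_row(df, 'comment_content')  # keeps only the '好评' and '差评' rows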