/src/main/java/ruc/irm/similarity/sentence/morphology/MorphoSimilarity.java

https://github.com/iamxiatian/xsimilarity · Java · 178 lines · 96 code · 26 blank · 56 comment · 17 complexity · 36f7f678c9ea1ff1459ac5e1747f4f9b MD5 · raw file

  1. package ruc.irm.similarity.sentence.morphology;
  2. import java.util.ArrayList;
  3. import java.util.List;
  4. import org.slf4j.Logger;
  5. import org.slf4j.LoggerFactory;
  6. import ruc.irm.similarity.sentence.SegmentProxy;
  7. import ruc.irm.similarity.sentence.SegmentProxy.Word;
  8. import ruc.irm.similarity.sentence.SentenceSimilarity;
  9. import ruc.irm.similarity.word.WordSimilarity;
  10. import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
  11. /**
  12. * 基于词形和词序的句子相似度计算算法,考虑了语义因素<br/>
  13. * 《中文信息相似度计算理论与方法》5.4.3小节所介绍的方法,在考虑语义时,
  14. * 无法直接获取OnceWS(A, B),因此,采用了两两匹配取最大值的方式。
  15. * 新的改进算法请参考{@code SemanticSimilarity}
  16. *
  17. * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
  18. * @organization 中国人民大学信息资源管理学院 知识工程实验室
  19. *
  20. */
  21. public class MorphoSimilarity implements SentenceSimilarity {
  22. private static Logger LOG = LoggerFactory.getLogger(MorphoSimilarity.class);
  23. /** 词形相似度占总相似度的比重 */
  24. private final double LAMBDA1 = 1.0;
  25. /** 词序相似度占总相似度的比重 */
  26. private final double LAMBDA2 = 0.0;
  27. /** 词语相似度的计算 */
  28. private WordSimilarity wordSimilarity = null;
  29. private static String FILTER_CHARS = "  ,。;?《》()|!,.;?<>|_^…!";
  30. private static MorphoSimilarity instance = null;
  31. public static MorphoSimilarity getInstance(){
  32. if(instance == null){
  33. instance = new MorphoSimilarity();
  34. }
  35. return instance;
  36. }
  37. private MorphoSimilarity(){
  38. LOG.debug("used hownet wordsimilarity.");
  39. this.wordSimilarity = XiaConceptParser.getInstance();
  40. //this.segmenter = SegmentFactory.getInstance().getParser();
  41. }
  42. /**
  43. * 滤掉词串中的空格、标点符号
  44. * @param word_list
  45. * @return
  46. */
  47. private String[] filter(String[] word_list){
  48. List<String> results = new ArrayList<String>();
  49. for(String w:word_list){
  50. if(!FILTER_CHARS.contains(w)){
  51. results.add(w.toLowerCase());
  52. }
  53. }
  54. return results.toArray(new String[results.size()]);
  55. }
  56. /**
  57. * 计算两个句子的相似度
  58. * @see ruc.irm.similarity.Similaritable
  59. */
  60. public double getSimilarity(String firstSen,String secondSen){
  61. //LOG.debug(segmenter.segmentToString(firstSen));
  62. //LOG.debug(segmenter.segmentToString(secondSen));
  63. String[] firstList = filter(segment(firstSen));
  64. String[] secondList = filter(segment(secondSen));
  65. double wordSim = getOccurrenceSimilarity(firstList,secondList);
  66. //LOG.debug("词形相似度="+wordSim);
  67. double orderSim = getOrderSimilarity(firstList,secondList);
  68. //LOG.debug("词序相似度="+orderSim);
  69. return LAMBDA1*wordSim+LAMBDA2*orderSim;
  70. }
  71. /**
  72. * 获取两个集合的词形相似度, 同时获取相对于第一个句子中的词语顺序,第二个句子词语的顺序变化次数
  73. * @param firstList
  74. * @param secondList
  75. * @return
  76. */
  77. public double getOccurrenceSimilarity(String[] firstList, String[] secondList){
  78. int max = firstList.length>secondList.length?firstList.length:secondList.length;
  79. if(max==0){
  80. return 0;
  81. }
  82. //首先计算出所有可能的组合
  83. double[][] scores = new double[max][max];
  84. for(int i=0; i<firstList.length; i++){
  85. for(int j=0; j<secondList.length; j++){
  86. scores[i][j] = wordSimilarity.getSimilarity(firstList[i], secondList[j]);
  87. }
  88. }
  89. double total_score = 0;
  90. //从scores[][]中挑选出最大的一个相似度,然后减去该元素,进一步求剩余元素中的最大相似度
  91. while(scores.length > 0){
  92. double max_score = 0;
  93. int max_row = 0;
  94. int max_col = 0;
  95. //先挑出相似度最大的一对:<row, column, max_score>
  96. for(int i=0; i<scores.length; i++){
  97. for(int j=0; j<scores.length; j++){
  98. if(max_score<scores[i][j]){
  99. max_row = i;
  100. max_col = j;
  101. max_score = scores[i][j];
  102. }
  103. }
  104. }
  105. //从数组中去除最大的相似度,继续挑选
  106. double[][] tmp_scores = new double[scores.length-1][scores.length-1];
  107. for(int i=0; i<scores.length; i++){
  108. if(i == max_row) continue;
  109. for(int j=0; j<scores.length; j++){
  110. if(j == max_col) continue;
  111. int tmp_i = max_row>i?i:i-1;
  112. int tmp_j = max_col>j?j:j-1;
  113. tmp_scores[tmp_i][tmp_j] = scores[i][j];
  114. }
  115. }
  116. total_score += max_score;
  117. scores = tmp_scores;
  118. }
  119. return (2*total_score) / (firstList.length + secondList.length);
  120. }
  121. /**
  122. * 获取两个集合的词序相似度
  123. * @param firstList
  124. * @param secondList
  125. * @return
  126. */
  127. public double getOrderSimilarity(String[] firstList, String[] secondList){
  128. double similarity = 0.0;
  129. return similarity;
  130. }
  131. // @SuppressWarnings("unchecked")
  132. // public String[] segment(String sentence){
  133. // MPWordSegment ws = new MPWordSegment();
  134. // ws.parseReader(new StringReader(sentence));
  135. // Vector tokens = ws.getTokens();
  136. // String[] results = new String[tokens.size()];
  137. // for(int i=0; i<tokens.size(); i++){
  138. // Token token = (Token)tokens.get(i);
  139. // results[i] = token.termText();
  140. // }
  141. //
  142. // return results;
  143. // }
  144. public String[] segment(String sentence){
  145. List<Word> list = SegmentProxy.segment(sentence);
  146. String[] results = new String[list.size()];
  147. for(int i=0; i<list.size(); i++){
  148. results[i] = list.get(i).getWord();
  149. }
  150. return results;
  151. }
  152. }