/src/main/java/ruc/irm/similarity/sentence/morphology/MorphoSimilarity.java
https://github.com/iamxiatian/xsimilarity · Java · 178 lines · 96 code · 26 blank · 56 comment · 17 complexity · 36f7f678c9ea1ff1459ac5e1747f4f9b MD5 · raw file
- package ruc.irm.similarity.sentence.morphology;
-
- import java.util.ArrayList;
- import java.util.List;
-
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import ruc.irm.similarity.sentence.SegmentProxy;
- import ruc.irm.similarity.sentence.SegmentProxy.Word;
- import ruc.irm.similarity.sentence.SentenceSimilarity;
- import ruc.irm.similarity.word.WordSimilarity;
- import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
-
- /**
- * 基于词形和词序的句子相似度计算算法,考虑了语义因素<br/>
- * 《中文信息相似度计算理论与方法》5.4.3小节所介绍的方法,在考虑语义时,
- * 无法直接获取OnceWS(A, B),因此,采用了两两匹配取最大值的方式。
- * 新的改进算法请参考{@code SemanticSimilarity}
- *
- * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
- * @organization 中国人民大学信息资源管理学院 知识工程实验室
- *
- */
- public class MorphoSimilarity implements SentenceSimilarity {
- private static Logger LOG = LoggerFactory.getLogger(MorphoSimilarity.class);
-
- /** 词形相似度占总相似度的比重 */
- private final double LAMBDA1 = 1.0;
- /** 词序相似度占总相似度的比重 */
- private final double LAMBDA2 = 0.0;
- /** 词语相似度的计算 */
- private WordSimilarity wordSimilarity = null;
-
- private static String FILTER_CHARS = " ,。;?《》()|!,.;?<>|_^…!";
-
- private static MorphoSimilarity instance = null;
-
- public static MorphoSimilarity getInstance(){
- if(instance == null){
- instance = new MorphoSimilarity();
- }
- return instance;
- }
-
- private MorphoSimilarity(){
- LOG.debug("used hownet wordsimilarity.");
- this.wordSimilarity = XiaConceptParser.getInstance();
- //this.segmenter = SegmentFactory.getInstance().getParser();
- }
-
- /**
- * 滤掉词串中的空格、标点符号
- * @param word_list
- * @return
- */
- private String[] filter(String[] word_list){
- List<String> results = new ArrayList<String>();
- for(String w:word_list){
- if(!FILTER_CHARS.contains(w)){
- results.add(w.toLowerCase());
- }
- }
-
- return results.toArray(new String[results.size()]);
- }
-
- /**
- * 计算两个句子的相似度
- * @see ruc.irm.similarity.Similaritable
- */
- public double getSimilarity(String firstSen,String secondSen){
- //LOG.debug(segmenter.segmentToString(firstSen));
- //LOG.debug(segmenter.segmentToString(secondSen));
- String[] firstList = filter(segment(firstSen));
- String[] secondList = filter(segment(secondSen));
-
- double wordSim = getOccurrenceSimilarity(firstList,secondList);
- //LOG.debug("词形相似度="+wordSim);
-
- double orderSim = getOrderSimilarity(firstList,secondList);
- //LOG.debug("词序相似度="+orderSim);
-
- return LAMBDA1*wordSim+LAMBDA2*orderSim;
- }
-
- /**
- * 获取两个集合的词形相似度, 同时获取相对于第一个句子中的词语顺序,第二个句子词语的顺序变化次数
- * @param firstList
- * @param secondList
- * @return
- */
- public double getOccurrenceSimilarity(String[] firstList, String[] secondList){
- int max = firstList.length>secondList.length?firstList.length:secondList.length;
- if(max==0){
- return 0;
- }
-
- //首先计算出所有可能的组合
- double[][] scores = new double[max][max];
- for(int i=0; i<firstList.length; i++){
- for(int j=0; j<secondList.length; j++){
- scores[i][j] = wordSimilarity.getSimilarity(firstList[i], secondList[j]);
- }
- }
-
- double total_score = 0;
-
- //从scores[][]中挑选出最大的一个相似度,然后减去该元素,进一步求剩余元素中的最大相似度
- while(scores.length > 0){
- double max_score = 0;
- int max_row = 0;
- int max_col = 0;
-
- //先挑出相似度最大的一对:<row, column, max_score>
- for(int i=0; i<scores.length; i++){
- for(int j=0; j<scores.length; j++){
- if(max_score<scores[i][j]){
- max_row = i;
- max_col = j;
- max_score = scores[i][j];
- }
- }
- }
-
- //从数组中去除最大的相似度,继续挑选
- double[][] tmp_scores = new double[scores.length-1][scores.length-1];
- for(int i=0; i<scores.length; i++){
- if(i == max_row) continue;
- for(int j=0; j<scores.length; j++){
- if(j == max_col) continue;
- int tmp_i = max_row>i?i:i-1;
- int tmp_j = max_col>j?j:j-1;
- tmp_scores[tmp_i][tmp_j] = scores[i][j];
- }
- }
- total_score += max_score;
- scores = tmp_scores;
- }
-
- return (2*total_score) / (firstList.length + secondList.length);
- }
-
- /**
- * 获取两个集合的词序相似度
- * @param firstList
- * @param secondList
- * @return
- */
- public double getOrderSimilarity(String[] firstList, String[] secondList){
- double similarity = 0.0;
-
- return similarity;
- }
-
- // @SuppressWarnings("unchecked")
- // public String[] segment(String sentence){
- // MPWordSegment ws = new MPWordSegment();
- // ws.parseReader(new StringReader(sentence));
- // Vector tokens = ws.getTokens();
- // String[] results = new String[tokens.size()];
- // for(int i=0; i<tokens.size(); i++){
- // Token token = (Token)tokens.get(i);
- // results[i] = token.termText();
- // }
- //
- // return results;
- // }
-
- public String[] segment(String sentence){
- List<Word> list = SegmentProxy.segment(sentence);
- String[] results = new String[list.size()];
- for(int i=0; i<list.size(); i++){
- results[i] = list.get(i).getWord();
- }
- return results;
- }
-
- }