/src/main/scala/distance/StringDistance.scala

https://bitbucket.org/pierreaubert/spark-kmeans · Scala · 46 lines · 27 code · 6 blank · 13 comment · 5 complexity · d4e3496db2c5cc06cd5d3bdac0ff0cc5 MD5 · raw file

  1. package main.distance
  2. /**
  3. * A trait to represent a String distance function
  4. */
  5. trait StringDistance extends ((String, String) => Double)
  6. /**
  7. * A companion object to the StringDistance trait to select the
  8. * Distance corresponding to each string description
  9. */
  10. object StringDistance {
  11. def apply(description: String) = description match {
  12. case "levenshtein" => Levenshtein
  13. case "normalizedLevenshtein" => NormalizedLevenshtein
  14. case _ => throw new MatchError("Invalid distance function: " + description)
  15. }
  16. }
  17. /**
  18. * Compute Levenshtein distance ( edit distance )
  19. */
  20. object Levenshtein extends StringDistance {
  21. def minimum(i1: Int, i2: Int, i3: Int)= math.min(math.min(i1, i2), i3)
  22. def apply(s1:String, s2:String)={
  23. val dist=Array.tabulate(s2.length+1, s1.length+1){(j,i)=>if(j==0) i else if (i==0) j else 0}
  24. for(j<-1 to s2.length; i<-1 to s1.length)
  25. dist(j)(i)=if(s2(j-1)==s1(i-1)) dist(j-1)(i-1)
  26. else minimum(dist(j-1)(i)+1, dist(j)(i-1)+1, dist(j-1)(i-1)+1)
  27. dist(s2.length)(s1.length)
  28. }
  29. }
  30. /**
  31. * Compute Normalized Levenshtein distance ( edit distance )
  32. */
  33. object NormalizedLevenshtein extends StringDistance {
  34. def apply(s1:String, s2:String)={
  35. val d = StringDistance("levenshtein")(s1,s2)
  36. val v = d / math.max(math.max(s1.size, s2.size),1)
  37. require(v <= 1, "Distance between 0 and 1")
  38. v
  39. }
  40. }