PageRenderTime 51ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/cluster.py

https://github.com/bwaldmann/twitter_geolocation
Python | 159 lines | 92 code | 20 blank | 47 comment | 24 complexity | 106f4d0981ad2e15dbbd12c5c5ebbef1 MD5 | raw file
#!/usr/bin/env python
# taken from http://www.daniweb.com/forums/thread31449.html
# clustering.py contains classes and functions that cluster data points
  4. import sys, math, random
  5. from sphereDist import dist_on_earth
  6. # -- The Point class represents points in n-dimensional space
  7. class Point:
  8. # Instance variables
  9. # self.coords is a list of coordinates for this Point
  10. # self.n is the number of dimensions this Point lives in (ie, its space)
  11. # self.reference is an object bound to this Point
  12. # Initialize new Points
  13. def __init__(self, coords, reference=None):
  14. self.coords = coords
  15. self.n = len(coords)
  16. self.reference = reference
  17. # Return a string representation of this Point
  18. def __repr__(self):
  19. return str(self.coords)
  20. # -- The Cluster class represents clusters of points in n-dimensional space
  21. class Cluster:
  22. # Instance variables
  23. # self.points is a list of Points associated with this Cluster
  24. # self.n is the number of dimensions this Cluster's Points live in
  25. # self.centroid is the sample mean Point of this Cluster
  26. def __init__(self, points):
  27. # We forbid empty Clusters (they don't make mathematical sense!)
  28. if len(points) == 0: raise Exception("ILLEGAL: EMPTY CLUSTER")
  29. self.points = points
  30. self.n = points[0].n
  31. # We also forbid Clusters containing Points in different spaces
  32. # Ie, no Clusters with 2D Points and 3D Points
  33. for p in points:
  34. if p.n != self.n: raise Exception("ILLEGAL: MULTISPACE CLUSTER")
  35. # Figure out what the centroid of this Cluster should be
  36. self.centroid = self.calculateCentroid()
  37. # Return a string representation of this Cluster
  38. def __repr__(self):
  39. return str(self.points)
  40. # Update function for the <strong class="highlight">K-means</strong> algorithm
  41. # Assigns a new list of Points to this Cluster, returns centroid difference
  42. def update(self, points):
  43. old_centroid = self.centroid
  44. self.points = points
  45. self.centroid = self.calculateCentroid()
  46. return dist_on_earth(old_centroid.coords, self.centroid.coords)
  47. # Calculates the centroid Point - the centroid is the sample mean Point
  48. # (in plain English, the average of all the Points in the Cluster)
  49. def calculateCentroid(self):
  50. centroid_coords = []
  51. # For each coordinate:
  52. for i in range(self.n):
  53. # Take the average across all Points
  54. centroid_coords.append(0.0)
  55. for p in self.points:
  56. centroid_coords[i] = centroid_coords[i]+p.coords[i]
  57. try:
  58. centroid_coords[i] = centroid_coords[i]/len(self.points)
  59. except:
  60. continue
  61. # Return a Point object using the average coordinates
  62. return Point(centroid_coords)
  63. # -- Return Clusters of Points formed by <strong class="highlight">K-means</strong> clustering
  64. def kmeans(points, k, cutoff):
  65. tmp = []
  66. for p in points:
  67. print p
  68. try:
  69. lat = float(p[1])
  70. lon = float(p[2])
  71. tmp.append(Point([lat,lon]))
  72. except:
  73. continue
  74. points = tmp
  75. # Randomly sample k Points from the points list, build Clusters around them
  76. initial = random.sample(points, k)
  77. clusters = []
  78. for p in initial: clusters.append(Cluster([p]))
  79. print " clusters: %s" % clusters
  80. # Enter the program loop
  81. while True:
  82. # Make a list for each Cluster
  83. lists = []
  84. for c in clusters: lists.append([])
  85. # For each Point:
  86. for p in points:
  87. # Figure out which Cluster's centroid is the nearest
  88. smallest_distance = dist_on_earth(p.coords, clusters[0].centroid.coords)
  89. index = 0
  90. for i in range(len(clusters[1:])):
  91. distance = dist_on_earth(p.coords, clusters[i+1].centroid.coords)
  92. if distance < smallest_distance:
  93. smallest_distance = distance
  94. index = i+1
  95. # Add this Point to that Cluster's corresponding list
  96. lists[index].append(p)
  97. # Update each Cluster with the corresponding list
  98. # Record the biggest centroid shift for any Cluster
  99. biggest_shift = 0.0
  100. for i in range(len(clusters)):
  101. shift = clusters[i].update(lists[i])
  102. biggest_shift = max(biggest_shift, shift)
  103. # If the biggest centroid shift is less than the cutoff, stop
  104. if biggest_shift < cutoff: break
  105. tmp = []
  106. for c in clusters:
  107. tmp.append([len(c.points),c.centroid.coords])
  108. return tmp
  109. # Return the list of cluster attributes
  110. return tmp
  111. # -- Get the Euclidean distance between two Points
  112. def getDistance(a, b):
  113. # Forbid measurements between Points in different spaces
  114. if a.n != b.n: raise Exception("ILLEGAL: NON-COMPARABLE POINTS")
  115. # Euclidean distance between a and b is sqrt(sum((a[i]-b[i])^2) for all i)
  116. ret = 0.0
  117. for i in range(a.n):
  118. ret = ret+pow((a.coords[i]-b.coords[i]), 2)
  119. return math.sqrt(ret)
  120. # -- Create a random Point in n-dimensional space
  121. def makeRandomPoint(n, lower, upper):
  122. coords = []
  123. for i in range(n): coords.append(random.uniform(lower, upper))
  124. return Point(coords)
  125. # -- Main function
  126. def main(args):
  127. num_points, n, k, cutoff, lower, upper = 10, 2, 3, 0.5, -200, 200
  128. # Create num_points random Points in n-dimensional space
  129. points = []
  130. for i in range(num_points): points.append(makeRandomPoint(n, lower, upper))
  131. # Cluster the points using the <strong class="highlight">K-means</strong> algorithm
  132. clusters = kmeans(points, k, cutoff)
  133. # Print the results
  134. print "\nPOINTS:"
  135. for p in points: print "P:", p
  136. print "\nCLUSTERS:"
  137. for c in clusters: print "C:", c
  138. # -- The following <strong class="highlight">code</strong> executes upon command-line invocation
  139. if __name__ == "__main__": main(sys.argv)