patternpythonMinor
k-means clustering algorithm implementation
Viewed 0 times
clusteringalgorithmimplementationmeans
Problem
Here is my personal implementation of the clustering k-means algorithm.
```
from scipy.spatial import distance
import numpy as np
import random
# (x,y) coordinates of a point
X = 0
Y = 1
def get_first(k, points):
return points[0:k]
def cost(cetroids, clusters):
cost = 0
for i in range(len(centroids)):
centroid = centroids[i]
cluster = clusters[i]
for point in cluster:
cost += (distance.euclidean(centroid, point))**2
return cost
def compute_centroids(clusters):
centroids = []
for cluster in clusters:
centroids.append(np.mean(cluster, axis=0))
return centroids
def kmeans(k, centroids, points, method, iter):
clusters = [[] for i in range(k)]
for i in range(len(points)):
point = points[i]
belongs_to_cluster = closest_centroid(point, centroids)
clusters[belongs_to_cluster].append(point)
new_centroids = compute_centroids(clusters)
if not equals(centroids, new_centroids):
print "Iteration " + str(iter) + ". Cost [k=" + str(k) + ", " + method + "] = " + str(cost(new_centroids, clusters))
clusters = kmeans(k, new_centroids, points, method, iter+1)
return clusters
def closest_centroid(point, centroids):
min_distance = float('inf')
belongs_to_cluster = None
for j in range(len(centroids)):
centroid = centroids[j]
dist = distance.euclidean(point, centroid)
if dist < min_distance:
min_distance = dist
belongs_to_cluster = j
return belongs_to_cluster
def contains(point1, points):
for point2 in points:
if point1[X] == point2[X] and point1[Y] == point2[Y]:
return True
return False
def equals(points1, points2):
if len(points1) != len(points2):
return False
for i in range (len(points1)):
point1 = points1[i]
point2 = points2[i]
if point1[X] != point2[X] or point1[Y] != point2[Y]:
return False
```
from scipy.spatial import distance
import numpy as np
import random
# (x,y) coordinates of a point
X = 0
Y = 1
def get_first(k, points):
return points[0:k]
def cost(cetroids, clusters):
cost = 0
for i in range(len(centroids)):
centroid = centroids[i]
cluster = clusters[i]
for point in cluster:
cost += (distance.euclidean(centroid, point))**2
return cost
def compute_centroids(clusters):
centroids = []
for cluster in clusters:
centroids.append(np.mean(cluster, axis=0))
return centroids
def kmeans(k, centroids, points, method, iter):
clusters = [[] for i in range(k)]
for i in range(len(points)):
point = points[i]
belongs_to_cluster = closest_centroid(point, centroids)
clusters[belongs_to_cluster].append(point)
new_centroids = compute_centroids(clusters)
if not equals(centroids, new_centroids):
print "Iteration " + str(iter) + ". Cost [k=" + str(k) + ", " + method + "] = " + str(cost(new_centroids, clusters))
clusters = kmeans(k, new_centroids, points, method, iter+1)
return clusters
def closest_centroid(point, centroids):
min_distance = float('inf')
belongs_to_cluster = None
for j in range(len(centroids)):
centroid = centroids[j]
dist = distance.euclidean(point, centroid)
if dist < min_distance:
min_distance = dist
belongs_to_cluster = j
return belongs_to_cluster
def contains(point1, points):
for point2 in points:
if point1[X] == point2[X] and point1[Y] == point2[Y]:
return True
return False
def equals(points1, points2):
if len(points1) != len(points2):
return False
for i in range (len(points1)):
point1 = points1[i]
point2 = points2[i]
if point1[X] != point2[X] or point1[Y] != point2[Y]:
return False
Solution
N.b. I'm pretty sure you can find optimised replacements of many of
these functions in either the NumPy or SciPy library.
Edit: I don't immediately see a reason for a slowdown except the
addition of new clusters. A bigger sample set that shows the behaviour
would be nice, together with some timings and probably running a
profiler on it.
In general you should likely use a NumPy array or matrix instead of
lists of lists as they are optimised for storing numeric data(!). That
would likely also eliminate the need for some of these functions or
considerably reduce their length. Also store all data in the same
containers - that way you don't have to create new functions like
representations.
get rid of that by writing code that's either independent on the
number of dimensions, or just use
to warrant separate constants.
require encapsulation in a function. The zero is also optional.
should be replaced with proper iteration over some helper generator.
E.g. in
used in the same way). It can also be simplified by using another
SciPy function,
exact value isn't relevant, just the relation between different
distances.
further purpose.
use one of the various formatting options (
of the additional
Looks like this now:
`from scipy.spatial import distance
import numpy as np
import random
from itertools import izip
def cost(centroids, clusters):
return sum(distance.cdist([centroid], cluster, 'sqeuclidean').sum()
for centroid, cluster in izip(centroids, clusters))
def compute_centroids(clusters):
return [np.mean(cluster, axis=0) for cluster in clusters]
def kmeans(k, centroids, points, method):
clusters = [[] for _ in range(k)]
for point in points:
clusters[closest_centroid(point, centroids)].append(point)
new_centroids = compute_centroids(clusters)
if not equals(centroids, new_centroids):
print("cost [k={}, {}] = {}".format(k, method, cost(new_centroids, clusters)))
clusters = kmeans(k, new_centroids, points, method)
return clusters
def closest_centroid(point, centroids):
min_distance = float('inf')
belongs_to_cluster = None
for j, centroid in enumerate(centroids):
dist = distance.sqeuclidean(point, centroid)
if dist
these functions in either the NumPy or SciPy library.
Edit: I don't immediately see a reason for a slowdown except the
addition of new clusters. A bigger sample set that shows the behaviour
would be nice, together with some timings and probably running a
profiler on it.
In general you should likely use a NumPy array or matrix instead of
lists of lists as they are optimised for storing numeric data(!). That
would likely also eliminate the need for some of these functions or
considerably reduce their length. Also store all data in the same
containers - that way you don't have to create new functions like
equals and contains yourself to compare between differentrepresentations.
- The
X/Ydefinitions at the start are more of a WTF for me. I'd
get rid of that by writing code that's either independent on the
number of dimensions, or just use
0/1 - IMO it's not magic enoughto warrant separate constants.
get_firstisn't neccessary - the pattern is obvious enough to not
require encapsulation in a function. The zero is also optional.
- Typo in
costsignature, should becentroids.
- The pattern
for i in range(len(...)):appears a couple of times and
should be replaced with proper iteration over some helper generator.
E.g. in
cost the parallel iteration over both centroids andclusters should be done by zip (itertools.izip in Python 2.7 ifused in the same way). It can also be simplified by using another
SciPy function,
cdist.- If you still need the index, use
enumerate.
- You can also often get away with using the squared euclidean if the
exact value isn't relevant, just the relation between different
distances.
- The pattern to initialise a list of empty lists should probably use
_ instead of i to make it obvious that the variable serves tofurther purpose.
printshould be used as a function to make it more uniform. Also,
use one of the various formatting options (
% or format) to get ridof the additional
str calls.compute_centroidscan be simplified with a list comprehension.
Looks like this now:
`from scipy.spatial import distance
import numpy as np
import random
from itertools import izip
def cost(centroids, clusters):
return sum(distance.cdist([centroid], cluster, 'sqeuclidean').sum()
for centroid, cluster in izip(centroids, clusters))
def compute_centroids(clusters):
return [np.mean(cluster, axis=0) for cluster in clusters]
def kmeans(k, centroids, points, method):
clusters = [[] for _ in range(k)]
for point in points:
clusters[closest_centroid(point, centroids)].append(point)
new_centroids = compute_centroids(clusters)
if not equals(centroids, new_centroids):
print("cost [k={}, {}] = {}".format(k, method, cost(new_centroids, clusters)))
clusters = kmeans(k, new_centroids, points, method)
return clusters
def closest_centroid(point, centroids):
min_distance = float('inf')
belongs_to_cluster = None
for j, centroid in enumerate(centroids):
dist = distance.sqeuclidean(point, centroid)
if dist
Context
StackExchange Code Review Q#128315, answer score: 3
Revisions (0)
No revisions yet.