snippetpythonMinor
KMeans in the shortest and most readable format
Viewed 0 times
formatthereadableshortestkmeansandmost
Problem
I'm learning Python (coming from Java) so I decided to write KMeans as a practice for the language. However I want to see how could one improve the code and making it shorter and yet readable. I still find the code rather long. Also if you have comments regarding conventions or proper practices, I would really appreciate it.
```
import numpy as np
from numpy import linalg as LA
def main():
#data = np.arange(20).reshape( (4,5) )
file_name = "/Users/x/Desktop/tst.csv"
with open(file_name) as f:
ncols = len(f.readline().split(","))
data = np.loadtxt(file_name, delimiter=",", usecols=range(0,ncols-1))
kmeans = KMeans()
kmeans.cluster(data, 3, 200)
print kmeans.means
class KMeans:
def __init__(self):
self.means = None
self.data_assignments = None
#data is a 2D Numpy array
def cluster(self, data, numClusters, iterations):
if numClusters data.shape[0]:
raise Exception("The number of clusters is beyond the number of rows.")
#Pick random means
randomMeansIndices = np.random.choice( range( data.shape[0] ),
size=numClusters, replace=False )
for i in randomMeansIndices:
if self.means is not None:
self.means = np.vstack( ( self.means, data[ i ] ) )
else:
self.means = data[ i ]
for iteration in xrange(iterations):
#Data assignment
self.data_assignments = {}
for row in data:
distances = {}
meanID = 0
for mean in self.means:
distances[meanID] = LA.norm(mean - row)
meanID += 1
nearestMean = min( distances, key=lambda x: distances[x] )
if nearestMean in self.data_assignments:
self.data_assignments[nearestMean] = np.vstack( (self.data_assignments[nearestMean], row) )
```
import numpy as np
from numpy import linalg as LA
def main():
#data = np.arange(20).reshape( (4,5) )
file_name = "/Users/x/Desktop/tst.csv"
with open(file_name) as f:
ncols = len(f.readline().split(","))
data = np.loadtxt(file_name, delimiter=",", usecols=range(0,ncols-1))
kmeans = KMeans()
kmeans.cluster(data, 3, 200)
print kmeans.means
class KMeans:
def __init__(self):
self.means = None
self.data_assignments = None
#data is a 2D Numpy array
def cluster(self, data, numClusters, iterations):
if numClusters data.shape[0]:
raise Exception("The number of clusters is beyond the number of rows.")
#Pick random means
randomMeansIndices = np.random.choice( range( data.shape[0] ),
size=numClusters, replace=False )
for i in randomMeansIndices:
if self.means is not None:
self.means = np.vstack( ( self.means, data[ i ] ) )
else:
self.means = data[ i ]
for iteration in xrange(iterations):
#Data assignment
self.data_assignments = {}
for row in data:
distances = {}
meanID = 0
for mean in self.means:
distances[meanID] = LA.norm(mean - row)
meanID += 1
nearestMean = min( distances, key=lambda x: distances[x] )
if nearestMean in self.data_assignments:
self.data_assignments[nearestMean] = np.vstack( (self.data_assignments[nearestMean], row) )
Solution
You use
For example this
could be done in one line, as you can index arrays with arrays:
This code could avoid the loop
by computing all the norms at once (NumPy 1.8 required).
The
vstack a lot to build arrays row by row, with the additional complication of initializing the array variable to None, which needs a special case in your loops then. This complication could be avoided by intializing to zero-height 2D array instead, but in any case repeated vstack use is inefficient because it copies the array every time. Better options are:- Collect rows in a list and
vstackthe list in the end. Good approach when you don't know the height of the array in advance.
- Initialize array to right size and assign into rows.
- Best of all, create array at once by a vectorized operation.
For example this
self.means = None
for i in randomMeansIndices:
if self.means is not None:
self.means = np.vstack( ( self.means, data[ i ] ) )
else:
self.means = data[ i ]could be done in one line, as you can index arrays with arrays:
self.means = data[randomMeansIndices]This code could avoid the loop
distances = {}
meanID = 0
for mean in self.means:
distances[meanID] = LA.norm(mean - row)
meanID += 1
nearestMean = min( distances, key=lambda x: distances[x] )by computing all the norms at once (NumPy 1.8 required).
argmin gives the index of the minimum.distances = LA.norm(self.means - row, axis=1)
nearestMean = np.argmin(distances)The
KMeans class has no purpose other than holding the result of clustering. The cluster method could be a stand-alone function instead and just return means, data_assignments. If you prefer to access the two items by name, use a collections.namedtuple as the return value.Code Snippets
self.means = None
for i in randomMeansIndices:
if self.means is not None:
self.means = np.vstack( ( self.means, data[ i ] ) )
else:
self.means = data[ i ]self.means = data[randomMeansIndices]distances = {}
meanID = 0
for mean in self.means:
distances[meanID] = LA.norm(mean - row)
meanID += 1
nearestMean = min( distances, key=lambda x: distances[x] )distances = LA.norm(self.means - row, axis=1)
nearestMean = np.argmin(distances)Context
StackExchange Code Review Q#43253, answer score: 2
Revisions (0)
No revisions yet.