Screaming Loud

日々是精進

K-means

今回は"k-means"(k-平均法)を実装してみました.
k-meansに関してはネット上でたくさん説明されています.
簡単に説明すると,
前提:クラスタ数は与えられている
1,適当に各データ点をクラスに割り振る.
2,各クラスタの重心を求める
3,各データ点を求めた一番近い重心に割り振る.
4,2-3を収束するまで繰り返す.
とこんな感じです.

実装

import random
import numpy as np
from optparse import OptionParser

#D = [[0,1,3],[8,3,1.5],[10,2,1],[2,7,4],[7,1,5]]
D = [[0,1],[3,8],[3,1],[5,10],[6,1],[11,2],[2,11]]

class Iter(object):
    def __init__(self,array,k):
	self.array = array
	self.repre = []
	self.clusters = {}
	self.cluster_maker(range(1,k+1))
	self.dim = len(self.array[0])
	self.n_clus = k
	self.random_assign()

    def cluster_maker(self,index):
	self.clusters.clear()
	for i in index:
	    self.clusters.setdefault(str(i),[])

    def random_assign(self):
	'''assign cluster to array features randomly
	lists transform to ndarray'''
	for feat in self.array:
	    index = str(random.randint(1,self.n_clus))
	    self.clusters[index].append(feat)
	for index in self.clusters:
	    self.clusters[index] = np.asarray(self.clusters[index])

    def distance(self,x,y):
	'''it calculate the distance,
	but we do not need exact distance,
	so we do not extract the square root.'''
	sum = 0
	for d in range(self.dim):
	    sum += (x[d]-y[d])**2
	return sum

    def centroid(self,array):
	'''calculate centroids'''
	return np.mean(array,axis=0)
	
    def representative(self):
	'''calculate representative point'''
	self.repre = []
	for cluster in self.clusters:
	    cent = self.centroid(self.clusters[cluster])
	    self.repre.append(cent)#.tolist())

    def calc_min(self,point):
	'''calculate the distance and return the nearest point'''
	return min([(self.distance(x,point),x) for x in self.repre])

    def near_assign(self):
	'''calclate the nearest representative point'''
	self.cluster_maker(self.repre)
	for feat in self.array:
	    min_ix = str(self.calc_min(feat)[1])
	    self.clusters[min_ix].append(feat)
	for index in self.clusters:
	    self.clusters[index] = np.asarray(self.clusters[index])

def check_end(dicA,dicB):
    '''check whether changes exist or not'''
    A = dicA.keys()
    B = dicB.keys()
    if A==B:
	for key in dicA:
	    if dicA[key].all()!=dicB[key].all():
		return False
	return True
    else:
	return False

def main(array,cluster):
    '''main part of the calculation'''
    it = Iter(D,cluster)
    new = it.clusters
    old = 0
    import copy
    while(True):
	it.representative()
	old = copy.copy(new)
	it.near_assign()
	new = it.clusters
	if check_end(new,old):
	    break
    return new
    
if __name__ == '__main__':
    usage = 'usage: %prog [options]'
    parser = OptionParser(usage)
    parser.add_option('-k',dest='num_k',type='str',default=2)
    (options,args) = parser.parse_args()
    
    cluster = options.num_k
    print main(D,cluster)

追記:
本ブログではnumpyをちゃんと使っていない.numpyをちゃんと使ったもの.
numpyを使ってK-meansを書いてみた - 唯物是真 @Scaled_Wurm