[K-means clustering algorithm] Implementing iris clustering

Article directory

  • Foreword
  • 1. Introduction to the data set
  • 2. Usage steps
    • 1.1 Import packages
    • 1.2 Load the data set
    • 1.3 Plot the two-dimensional data distribution
    • 1.4 Instantiate the K-means class and define the training function
    • 1.5 Training
    • 1.6 Visualization
    • 2. Clustering algorithm
    • 2.1 Visualizing the results
    • 3. Other clustering algorithms for iris classification

Foreword

With the continuous development of artificial intelligence, machine learning is becoming more and more important, and many people have begun to study it. This article introduces some basics of machine learning by clustering the iris data set with K-means.

1. Introduction to the data set

Iris data set: an open-source data set of 150 records, 50 for each of three species (setosa, versicolor, virginica), where every record has four features: sepal length, sepal width, petal length, and petal width.
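
Because the data set ships with scikit-learn, its structure can be checked directly before any clustering is done:

from sklearn import datasets

iris = datasets.load_iris()
print(iris.feature_names)  # ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
print(iris.target_names)   # ['setosa' 'versicolor' 'virginica']
print(iris.data.shape)     # (150, 4)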

2. Usage steps

1.1 Import packages

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn import datasets

1.2 Load the data set

# Get the data set directly from sklearn
iris = datasets.load_iris()
X = iris.data[:, :4]  # take all four dimensions of the feature space
print(X.shape)

1.3 Plot the two-dimensional data distribution

# Take the first two dimensions (sepal length, sepal width) and plot the distribution
plt.scatter(X[:, 0], X[:, 1], c="red", marker='o', label='samples')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend(loc=2)
plt.show()

# Take the last two dimensions (petal length, petal width) and plot the distribution
plt.scatter(X[:, 2], X[:, 3], c="green", marker='+', label='samples')
plt.xlabel('petal length')
plt.ylabel('petal width')
plt.legend(loc=2)
plt.show()


1.4 Instantiate the K-means class and define the training function

def Model(n_clusters):
    estimator = KMeans(n_clusters=n_clusters)  # construct the clusterer
    return estimator

def train(estimator):
    estimator.fit(X)  # run the clustering
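
Note that n_clusters has to be chosen by the user. A common heuristic is the elbow method: fit the model for several values of k and look for the point where the inertia (within-cluster sum of squared distances) stops dropping quickly. A minimal sketch using the helpers above:

# Elbow method sketch: plot inertia_ for k = 1..7 and look for the "elbow"
inertias = []
for k in range(1, 8):
    est = Model(k)
    train(est)
    inertias.append(est.inertia_)
plt.plot(range(1, 8), inertias, marker='o')
plt.xlabel('n_clusters')
plt.ylabel('inertia')
plt.show()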

1.5 Training

# Initialize the instance and fit it to the data
# (3 clusters, matching the three plotted labels below)
estimator = Model(3)
train(estimator)
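
As a quick sanity check, the fitted estimator exposes the learned centroids, the per-sample labels, and the final inertia (standard scikit-learn attributes):

print(estimator.cluster_centers_)  # (3, 4) array: one centroid per cluster
print(estimator.labels_[:10])      # cluster index of the first 10 samples
print(estimator.inertia_)          # within-cluster sum of squared distances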

1.6 Visualization

label_pred = estimator.labels_ # Get clustering labels
# Plot k-means results
x0 = X[label_pred == 0]
x1 = X[label_pred == 1]
x2 = X[label_pred == 2]
plt.scatter(x0[:, 0], x0[:, 1], c="red", marker='o', label='label0')
plt.scatter(x1[:, 0], x1[:, 1], c="green", marker='*', label='label1')
plt.scatter(x2[:, 0], x2[:, 1], c="blue", marker='+', label='label2')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend(loc=2)
plt.show()

# Plot k-means results
x0 = X[label_pred == 0]
x1 = X[label_pred == 1]
x2 = X[label_pred == 2]
plt.scatter(x0[:, 2], x0[:, 3], c="red", marker='o', label='label0')
plt.scatter(x1[:, 2], x1[:, 3], c="green", marker='*', label='label1')
plt.scatter(x2[:, 2], x2[:, 3], c="blue", marker='+', label='label2')
plt.xlabel('petal length')
plt.ylabel('petal width')
plt.legend(loc=2)
plt.show()

'''# Plot k-means results with 4 clusters; the result is no better than 3 clusters.
x0 = X[label_pred == 0]
x1 = X[label_pred == 1]
x2 = X[label_pred == 2]
x3 = X[label_pred == 3]
plt.scatter(x0[:, 2], x0[:, 3], c="red", marker='o', label='label0')
plt.scatter(x1[:, 2], x1[:, 3], c="green", marker='*', label='label1')
plt.scatter(x2[:, 2], x2[:, 3], c="blue", marker='+', label='label2')
plt.scatter(x3[:, 2], x3[:, 3], c="yellow", marker='X', label='label3')
plt.xlabel('petal length')
plt.ylabel('petal width')
plt.legend(loc=2)
plt.show() '''
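
Since the true species labels are available in iris.target, the clustering can also be scored against them. Cluster numbering is arbitrary, so a permutation-invariant metric such as the adjusted Rand index is appropriate; a minimal sketch:

from sklearn.metrics import adjusted_rand_score

# label_pred comes from the fitted estimator above; iris.target holds the species
print(adjusted_rand_score(iris.target, label_pred))  # 1.0 would be a perfect match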

2. Clustering algorithm

The code below implements K-means from scratch with NumPy (example):

# distEclud(): computes the Euclidean distance between two vectors
def distEclud(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

# randCent(): builds a set of k random initial centroids for the given data set
def randCent(dataSet, k):
    # m = 150 samples, n = 4 features
    m, n = dataSet.shape
    centroids = np.zeros((k, n))
    for i in range(k):
        index = int(np.random.uniform(0, m))  # random index in [0, m): pick a sample as an initial centroid (duplicates possible)
        centroids[i, :] = dataSet[index, :]   # copy that sample's four features into the centroid set
    # print(centroids)
    return centroids
    
 
# k-means clustering algorithm
def KMeans(dataSet, k):
    m = np.shape(dataSet)[0]  # number of rows: 150
    # Column 0 stores the cluster each sample belongs to;
    # column 1 stores the squared error of the sample to its cluster centroid
    clusterAssment = np.mat(np.zeros((m, 2)))  # np.mat() creates a 150x2 matrix
    clusterChange = True

    # Initialize the centroids
    centroids = randCent(dataSet, k)

    # Iterate until no sample changes cluster
    while clusterChange:
        clusterChange = False

        # Loop over all samples
        for i in range(m):
            minDist = float('inf')
            minIndex = -1

            # Loop over all k centroids and find the nearest one, minIndex
            for j in range(k):
                distance = distEclud(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j  # index of the closest cluster

            # Update the cluster this sample belongs to
            if clusterAssment[i, 0] != minIndex:
                clusterChange = True
                clusterAssment[i, :] = minIndex, minDist ** 2

        # Update the centroids
        for j in range(k):
            pointsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0]]  # all points assigned to cluster j
            centroids[j, :] = np.mean(pointsInCluster, axis=0)  # new centroid = mean of the cluster's points

    print("cluster complete")
    return centroids, clusterAssment
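
One caveat with this implementation: if a cluster ends up with no points, np.mean over an empty array returns NaN and that centroid is lost. A defensive variant of the centroid-update loop (a sketch, not part of the original code) reseeds empty clusters from a random sample:

# Defensive centroid update (sketch): reseed empty clusters instead of
# averaging an empty set of points
for j in range(k):
    pointsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0]]
    if len(pointsInCluster) == 0:
        centroids[j, :] = dataSet[np.random.randint(0, m), :]  # reseed from a random sample
    else:
        centroids[j, :] = np.mean(pointsInCluster, axis=0)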

def draw(data, center, assment):
    length = len(center)
    data1 = data[np.nonzero(assment[:, 0].A == 0)[0]]
    data2 = data[np.nonzero(assment[:, 0].A == 1)[0]]
    data3 = data[np.nonzero(assment[:, 0].A == 2)[0]]

    # Scatter plot of the first two dimensions
    plt.scatter(data1[:, 0], data1[:, 1], c="red", marker='o', label='label0')
    plt.scatter(data2[:, 0], data2[:, 1], c="green", marker='*', label='label1')
    plt.scatter(data3[:, 0], data3[:, 1], c="blue", marker='+', label='label2')
    # Annotate the cluster centroids
    for i in range(length):
        plt.annotate('center', xy=(center[i, 0], center[i, 1]),
                     xytext=(center[i, 0] + 1, center[i, 1] + 1),
                     arrowprops=dict(facecolor='yellow'))
    plt.legend(loc=2)
    plt.show()

    # Scatter plot of the last two dimensions
    plt.scatter(data1[:, 2], data1[:, 3], c="red", marker='o', label='label0')
    plt.scatter(data2[:, 2], data2[:, 3], c="green", marker='*', label='label1')
    plt.scatter(data3[:, 2], data3[:, 3], c="blue", marker='+', label='label2')
    # Annotate the cluster centroids
    for i in range(length):
        plt.annotate('center', xy=(center[i, 2], center[i, 3]),
                     xytext=(center[i, 2] + 1, center[i, 3] + 1),
                     arrowprops=dict(facecolor='yellow'))
    plt.legend(loc=2)
    plt.show()

2.1 Visualizing the results

The code is as follows (example):

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets

iris = datasets.load_iris()
dataSet = iris.data[:, :4]
k = 3
centroids, clusterAssment = KMeans(dataSet, k)
draw(dataSet, centroids, clusterAssment)
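
The second column of clusterAssment stores each sample's squared distance to its centroid, so the total within-cluster SSE can be read off directly as a quick quality check:

sse = float(clusterAssment[:, 1].sum())  # total within-cluster sum of squared errors
print('SSE:', sse)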


3. Other clustering algorithms for iris classification

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import datasets

# Get the data set directly from sklearn
iris = datasets.load_iris()
X = iris.data[:, :4]  # take all four dimensions of the feature space
print(X.shape)
# Use DBSCAN clustering algorithm
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels = dbscan.fit_predict(X)
# Draw classification results
plt.scatter(X[:, 0], X[:, 1], c=labels)
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('DBSCAN Clustering')
plt.show()

from sklearn.cluster import AgglomerativeClustering
# Use hierarchical clustering algorithm
hierarchical = AgglomerativeClustering(n_clusters=3)
labels = hierarchical.fit_predict(X)
# Draw classification results
plt.scatter(X[:, 0], X[:, 1], c=labels, marker='+')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('Hierarchical Clustering')
plt.show()
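
To compare the algorithms on an equal footing, a label-free metric such as the silhouette score can be used. A sketch (assumption: DBSCAN's noise points, labeled -1, are dropped before scoring, since silhouette is only defined over clustered samples):

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

km_labels = KMeans(n_clusters=3, n_init=10).fit_predict(X)
print('k-means      :', silhouette_score(X, km_labels))
print('hierarchical :', silhouette_score(X, labels))  # labels from the block above

db_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(X)
mask = db_labels != -1  # exclude noise points
print('DBSCAN       :', silhouette_score(X[mask], db_labels[mask]))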