Article directory
- Preface
- 1. Introduction to data sets
- 2. Usage steps
-
- 1.1 Import packages
- 1.2 Load the data set
- 1.3 Draw two-dimensional data distribution chart
- 1.4 Instantiate the K-means class and define the training function
- 1.5 Training
- 1.6 Visual display
- 2. Clustering algorithm
- 2.1 Visual generation
- 3 Other clustering algorithms for iris classification
Foreword
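With the continuous development of artificial intelligence, machine learning is becoming more and more important, and many people have started learning it. This article introduces one basic machine learning topic: clustering the Iris data set with K-means, first using scikit-learn and then using a from-scratch implementation, followed by a brief look at two other clustering algorithms (DBSCAN and hierarchical clustering).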
1. Introduction to data sets
Iris data set: an open-source data set containing a total of 150 records. Each record has four features: sepal length, sepal width, petal length and petal width.
2. Usage steps
1.1 Import packages
import matplotlib.pyplot as plt import numpy as np from sklearn.cluster import KMeans from sklearn import datasets
1.2 Load the data set
# Load the Iris data set that ships with scikit-learn.
iris = datasets.load_iris()

# Keep the first four columns, i.e. all 4 dimensions of the feature space.
X = iris.data[:, :4]
print(X.shape)
1.3 Draw two-dimensional data distribution chart
# Take the first two dimensions (sepal length, sepal width) and draw a data
# distribution chart.
plt.scatter(X[:, 0], X[:, 1], c="red", marker='o', label='see')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend(loc=2)
plt.show()

# Take the last two dimensions (petal length, petal width) and draw the data
# distribution chart.
# The marker must be exactly '+': matplotlib rejects ' + ' (with surrounding
# spaces) as an unrecognized marker style.
plt.scatter(X[:, 2], X[:, 3], c="green", marker='+', label='see')
plt.xlabel('petal length')
plt.ylabel('petal width')
plt.legend(loc=2)
plt.show()
1.4 Instantiate the K-means class and define the training function
def Model(n_clusters):
    """Construct and return an unfitted KMeans clusterer.

    n_clusters is forwarded unchanged to the KMeans constructor.
    """
    return KMeans(n_clusters=n_clusters)


def train(estimator):
    """Run clustering by calling ``estimator.fit`` on the module-level ``X``."""
    estimator.fit(X)
1.5 Training
# Initialize the instance and start training (fitting).
# Use 3 clusters: the visualisation that follows draws clusters 0, 1 and 2,
# so training with 4 clusters would leave every sample of cluster 3 off the
# charts.
estimator = Model(3)
train(estimator)
1.6 Visual display
label_pred = estimator.labels_  # Get clustering labels (one cluster index per sample)

# (colour, marker) used for cluster 0, 1, 2, 3; reused cyclically if the
# estimator was trained with more clusters than there are styles.
# Note: the marker must be exactly '+', matplotlib rejects ' + '.
cluster_styles = [("red", 'o'), ("green", '*'), ("blue", '+'), ("yellow", 'X')]

# Each entry: (x column, y column, x-axis label, y-axis label).
feature_views = [
    (0, 1, 'sepal length', 'sepal width'),
    (2, 3, 'petal length', 'petal width'),
]

# Plot k-means results, one figure per pair of feature columns.
# Every label that actually occurs is drawn, so no sample is silently dropped
# when the estimator was trained with more than three clusters. With
# n_clusters=4 a fourth (yellow 'X') cluster appears; the author's observation
# is that the effect is not better than with 3 categories.
for x_col, y_col, x_name, y_name in feature_views:
    for cluster_id in np.unique(label_pred):
        color, marker = cluster_styles[cluster_id % len(cluster_styles)]
        members = X[label_pred == cluster_id]
        plt.scatter(members[:, x_col], members[:, y_col], c=color,
                    marker=marker, label='label%d' % cluster_id)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.legend(loc=2)
    plt.show()
2. Clustering algorithm
The code is as follows (example):
def distEclud(x, y):
    """Return the Euclidean distance between the two vectors x and y."""
    return np.sqrt(np.sum((x - y) ** 2))


def randCent(dataSet, k):
    """Build a set of k initial centroids by picking random rows of dataSet.

    Parameters:
        dataSet: 2-D array of shape (m, n), one sample per row.
        k: number of centroids to pick.

    Returns:
        A float array of shape (k, n) whose rows are copies of sampled rows.
    """
    m, n = dataSet.shape
    # Sample distinct row indices whenever possible: two identical starting
    # centroids would leave one cluster empty from the very first pass.
    # Only when more centroids than samples are requested do we fall back to
    # sampling with replacement.
    picks = np.random.choice(m, size=k, replace=k > m)
    centroids = np.zeros((k, n))
    centroids[:, :] = dataSet[picks, :]
    return centroids


def KMeans(dataSet, k):
    """Cluster the rows of dataSet into k groups with plain k-means.

    Parameters:
        dataSet: 2-D array of shape (m, n), one sample per row.
        k: number of clusters.

    Returns:
        (centroids, clusterAssment) where centroids is a (k, n) array and
        clusterAssment is an (m, 2) numpy matrix: column 0 holds the index of
        the cluster each sample belongs to, column 1 the squared distance of
        the sample to that cluster's centroid.
    """
    m = np.shape(dataSet)[0]  # Number of rows (samples)

    # np.asmatrix replaces np.mat, which no longer exists in NumPy 2.x; the
    # result is still a numpy matrix, so callers using `.A` keep working.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start with "no cluster" (-1) so that the first pass always registers as
    # a change, including for samples whose nearest centroid is cluster 0.
    clusterAssment[:, 0] = -1

    centroids = randCent(dataSet, k)  # Initialize centroids
    clusterChange = True
    while clusterChange:
        clusterChange = False

        # Loop through all samples
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            # Traverse all centroids and find the nearest one
            for j in range(k):
                distance = distEclud(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j  # Category index
            if clusterAssment[i, 0] != minIndex:
                clusterChange = True
            # Always refresh the row so the stored squared error matches the
            # current centroids even when the assignment itself is unchanged.
            clusterAssment[i, :] = minIndex, minDist ** 2

        # Update centroids
        labels = np.asarray(clusterAssment[:, 0]).ravel()
        for j in range(k):
            # All the points currently assigned to cluster j
            pointsInCluster = dataSet[np.nonzero(labels == j)[0]]
            # An empty cluster keeps its previous centroid; averaging zero
            # points would turn the centroid into NaN.
            if len(pointsInCluster) > 0:
                centroids[j, :] = np.mean(pointsInCluster, axis=0)

    print("cluster complete")
    return centroids, clusterAssment


def draw(data, center, assment):
    """Scatter-plot the clustered samples and annotate each centroid.

    Two figures are shown: feature columns (0, 1) and feature columns (2, 3).

    Parameters:
        data: 2-D sample array with at least four columns.
        center: (k, n) centroid array as returned by KMeans.
        assment: (m, 2) assignment matrix/array as returned by KMeans; only
            column 0 (the cluster index) is read.
    """
    length = len(center)
    labels = np.asarray(assment)[:, 0]

    # (colour, marker) per cluster, reused cyclically when k exceeds 3.
    # The marker must be exactly '+', matplotlib rejects ' + '.
    styles = [("red", 'o'), ("green", '*'), ("blue", '+')]

    for x_col, y_col in ((0, 1), (2, 3)):
        # Scatter plot of the samples, one colour/marker per cluster
        for idx in range(length):
            color, marker = styles[idx % len(styles)]
            members = data[np.nonzero(labels == idx)[0]]
            plt.scatter(members[:, x_col], members[:, y_col], c=color,
                        marker=marker, label='label%d' % idx)

        # Draw the centroid point of each cluster
        for i in range(length):
            plt.annotate('center',
                         xy=(center[i, x_col], center[i, y_col]),
                         xytext=(center[i, x_col] + 1, center[i, y_col] + 1),
                         arrowprops=dict(facecolor='yellow'))
        plt.show()
2.1 Visual generation
The code is as follows (example):
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets

# Load the Iris samples and keep the first four feature columns.
iris = datasets.load_iris()
dataSet = iris.data[:, :4]

# Cluster into three groups with the hand-written KMeans, then plot the
# samples together with the resulting centroids.
k = 3
centroids, clusterAssment = KMeans(dataSet, k)
draw(dataSet, centroids, clusterAssment)
3 Other clustering algorithms for iris classification
import matplotlib.pyplot as plt import numpy as np from sklearn.cluster import KMeans from sklearn import datasets
# Load the Iris data set that ships with scikit-learn.
iris = datasets.load_iris()

# Keep the first four columns, i.e. all 4 dimensions of the feature space.
X = iris.data[:, :4]
print(X.shape)
from sklearn.cluster import DBSCAN

# Import dataset and take the first four features.
iris = datasets.load_iris()
X = iris.data[:, :4]

# Cluster with DBSCAN; fit_predict returns one label per sample.
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels = dbscan.fit_predict(X)

# Draw classification results on the first two feature columns,
# coloured by the DBSCAN label.
plt.scatter(X[:, 0], X[:, 1], c=labels)
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('DBSCAN Clustering')
plt.show()
from sklearn.cluster import AgglomerativeClustering

# Use hierarchical (agglomerative) clustering with three clusters.
hierarchical = AgglomerativeClustering(n_clusters=3)
labels = hierarchical.fit_predict(X)

# Draw classification results on the first two feature columns.
# The marker must be exactly '+': matplotlib rejects ' + ' (with surrounding
# spaces) as an unrecognized marker style.
plt.scatter(X[:, 0], X[:, 1], c=labels, marker='+')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('Hierarchical Clustering')
plt.show()