K-means algorithm with Python and C++ implementations

K-means is a commonly used clustering algorithm that aims to divide a data set into K different, non-overlapping subsets (clusters) such that each data point belongs to the cluster whose center point (centroid) is nearest to it.

The specific steps of the algorithm are as follows:

1. Initialization: Randomly select K data points as the initial cluster centers (centroids).
2. Assign data points to the nearest cluster: For each data point, calculate its distance to each of the K cluster centers, and assign the data point to the cluster whose center is nearest.
3. Update cluster centers: For each cluster, calculate the average of all data points in the cluster and use this average as the new cluster center.
4. Repeat steps 2 and 3 until the cluster centers no longer change (or change very little), or until the preset number of iterations is reached.
5. End: When the cluster centers are stable or the maximum number of iterations is reached, the algorithm ends and the final cluster assignment is obtained.

Python code (Iris data set)

import pandas as pd
import numpy as np
import random

# Read the data set. pandas consumes the first CSV row as the header, and
# `.values` already yields a NumPy array (object dtype, because the label
# column holds strings).
data = pd.read_csv('./Iris.csv').values

# Split the columns: column 0 is taken to be the sample number, the last
# column is the class label, and everything in between is a numeric feature.
numbers = data[:, 0]
# Cast the features to float so later arithmetic does not run on an
# object-dtype array.
characters = data[:, 1:-1].astype(float)
labels = data[:, -1]

# len_y: number of samples (rows); len_x: number of features (columns).
len_y, len_x = characters.shape

def Distance(characters, len_x, len_y, centers, k):
    """Return the Euclidean distance from every sample to every center.

    Args:
        characters: 2-D array-like of shape (n_samples, n_features).
        len_x: Number of features. Unused; kept for interface compatibility.
        len_y: Number of samples. Unused; kept for interface compatibility.
        centers: 2-D array-like of shape (n_centers, n_features). May be a
            NumPy array or a plain list of lists.
        k: Number of centers. Unused; the count is taken from ``centers``.

    Returns:
        A float array of shape (n_samples, n_centers) whose entry ``[i, j]``
        is the distance between sample ``i`` and center ``j``.
    """
    # Cast to float so object-dtype input (e.g. sliced from a mixed-type
    # table) is handled with ordinary floating-point arithmetic.
    points = np.asarray(characters, dtype=float)
    centroids = np.asarray(centers, dtype=float)

    # Broadcast (n_samples, 1, n_features) against (1, n_centers, n_features)
    # to get all pairwise differences without a Python-level loop.
    diff = points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    return np.sqrt(np.sum(diff ** 2, axis=2))


def Center(characters, centers, len_x, len_y, k):
    """Run one K-means update step and return the new centers.

    Every sample is assigned to its nearest current center, then each center
    is moved to the mean of the samples assigned to it.

    Args:
        characters: 2-D array-like of shape (n_samples, n_features).
        centers: Current centers, 2-D array-like of shape (k, n_features).
        len_x: Number of features (forwarded to ``Distance``).
        len_y: Number of samples (forwarded to ``Distance``).
        k: Number of clusters (forwarded to ``Distance``).

    Returns:
        A float array with the same shape as ``centers``. A center that
        received no samples keeps its previous position.
    """
    # Work on float arrays so the group means are computed numerically even
    # when the caller passes an object-dtype array.
    points = np.asarray(characters, dtype=float)
    previous = np.asarray(centers, dtype=float)

    distance = Distance(points, len_x, len_y, previous, k)
    min_mark = np.argmin(distance, axis=1)

    # groupby only yields rows for labels that actually occur, so write the
    # means back by label instead of returning the (possibly shorter) frame.
    means = pd.DataFrame(points).groupby(min_mark).mean()
    updated = previous.copy()
    updated[means.index.values] = means.values
    return updated

def K_means(numbers, characters, labels, k, len_x, len_y, epochs):
    """Cluster the samples into ``k`` groups with the K-means algorithm.

    Args:
        numbers: Sample numbers. Unused; kept for interface compatibility.
        characters: 2-D array of shape (len_y, len_x) with the features.
        labels: Class labels. Unused; kept for interface compatibility.
        k: Number of clusters; must satisfy ``1 <= k <= len_y``.
        len_x: Number of features per sample.
        len_y: Number of samples.
        epochs: Maximum number of update iterations.

    Returns:
        A tuple ``(centers, min_mark, distance)``: the final centers as a
        lexicographically sorted list of lists, the zero-based index of the
        nearest (sorted) center for every sample, and the full
        sample-to-center distance matrix.

    Raises:
        ValueError: If ``k`` is not between 1 and ``len_y``.
    """
    if not 1 <= k <= len_y:
        raise ValueError(f'k must be between 1 and {len_y}, got {k}')

    centers = np.zeros([k, len_x])

    # Seed each center with one random sample from its own contiguous slice
    # of the data. For k=3 and 150 rows this gives the ranges 0-49, 50-99
    # and 100-149.
    for i in range(k):
        low = i * len_y // k
        high = (i + 1) * len_y // k - 1
        centers[i] = characters[random.randint(low, high)]

    for _ in range(epochs):
        updated = Center(characters, centers, len_x, len_y, k)
        # The update is deterministic, so once the centers reproduce
        # themselves exactly, further iterations cannot change anything.
        if np.array_equal(updated, centers):
            break
        centers = updated

    # Sort so the cluster numbering does not depend on the random seeds.
    centers = sorted(centers.tolist())
    distance = Distance(characters, len_x, len_y, centers, k)
    min_mark = np.argmin(distance, axis=1)
    return centers, min_mark, distance

# Run the clustering and report the result.
epochs = 500  # upper bound on the number of update iterations
k = 3         # number of clusters
centers, min_mark, distance = K_means(numbers, characters, labels, k, len_x, len_y, epochs)

# Cluster numbers are printed 1-based.
for i in range(k):
    print(f'The center of the {i + 1}th group is {centers[i]}')
for i in range(len_y):
    print(
        f'The {i + 1}th group of data is {data[i]}, '
        f'belongs to the group {min_mark[i] + 1}, '
        f'and the distance is {distance[i, min_mark[i]]}'
    )
C++ code (Iris data set)

#include <iostream>
#include <string>
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
#include <cstring>
#include <algorithm>
#include <string>
#include <math.h>
#include <stdlib.h>
using namespace std;

// One sample (row) of the data set.
struct Flower
{
    int id;              // value of the first CSV column
    double character1;   // feature values, filled from CSV columns 2-5 in order
    double character2;
    double character3;
    double character4;
    std::string labels;  // class name exactly as read from the last CSV column
    int label = 0;       // numeric class code set in main (1..3); 0 means not set
};

struct Distance{
struct Flower information;
int labels;
};


void GenerateCenters(int k, vector<struct Flower> FlowerVector, vector<struct Flower> & amp;random_centers)
{
vector<struct Flower> tmp(FlowerVector);

int random_index;
int n = FlowerVector.size();
\t
random_index = rand() % (50-0) + 0;
//cout << random_index << endl;
random_centers.push_back(FlowerVector.at(random_index));

random_index = rand() % (100-50) + 50;
random_centers.push_back(FlowerVector.at(random_index));

random_index = rand() % (150-100) + 100;
random_centers.push_back(FlowerVector.at(random_index));
}

double GetDistance(Flower p0, Flower p1)
{
return sqrt((p0.character1 - p1.character1)*(p0.character1 - p1.character1) + (p0.character2 - p1.character2)*(p0.character2 - p1.character2) + (p0.character3 - p1. character3)*(p0.character3 - p1.character3) + (p0.character4 - p1.character4)*(p0.character4 - p1.character4));
}

void DoKmeansCluster(vector<struct Flower> FlowerVector, vector<struct Flower> & amp;random_centers, vector<struct Distance> & amp;result)
{
vector<struct Flower> tmp(random_centers);

int point_num = FlowerVector.size();
int k = random_centers.size();

for (int p = 0; p < point_num; p + + )
{
float distance = 9999;
Distance res;
for (int q = 0; q < k; q + + )
{
float tmp_distance = GetDistance(FlowerVector[p], random_centers[q]);
if (tmp_distance < distance)
{
distance = tmp_distance;
res.labels = q;
res.information = FlowerVector[p];
}
}
result.push_back(res);
}

for (int i = 0; i < k; i + + )
{
int count = 0;
double sum_1 = 0;
double sum_2 = 0;
double sum_3 = 0;
double sum_4 = 0;
for (int j = 0; j < point_num; j + + )
{
if (result[j].labels == i)
{
count + + ;
sum_1 + = result[j].information.character1;
sum_2 + = result[j].information.character2;
sum_3 + = result[j].information.character3;
sum_4 + = result[j].information.character4;
}
}
random_centers[i].character1 = sum_1 / count;
random_centers[i].character2 = sum_2 / count;
random_centers[i].character3 = sum_3 / count;
random_centers[i].character4 = sum_4 / count;
//cout << "(" << random_centers[i].x << "," << random_centers[i].y << ")" << endl;
}


}


int main()
{
    ifstream infile("Iris.csv", ios::in);
    string line;
vector<struct Flower> FlowerVector;
    getline(infile, line);
    while (getline(infile, line))
{
stringstream ss(line);
string str;
Flower flower;

getline(ss, str, ',');
flower.id = stoi(str);
getline(ss, str, ',');
flower.character1 = stod(str);
getline(ss, str, ',');
flower.character2 = stod(str);
getline(ss, str, ',');
flower.character3 = stod(str);
getline(ss, str, ',');
flower.character4 = stod(str);
        getline(ss, str, ',');
flower.labels = str;
FlowerVector.push_back(flower);
}
    int x = FlowerVector.size();
for (int i = 0; i < x; i + + )
{
if (FlowerVector[i].labels == "Iris-setosa")
FlowerVector[i].label = 1;
if (FlowerVector[i].labels == "Iris-versicolor")
FlowerVector[i].label = 2;
if (FlowerVector[i].labels == "Iris-virginica")
FlowerVector[i].label = 3;
}
int k = 3;
int epochs = 500;
\t
vector<Flower> random_centers;
GenerateCenters(k, FlowerVector, random_centers);
vector<struct Distance> result;
for(int i=0;i<epochs;i + + )
{
DoKmeansCluster(FlowerVector, random_centers, result);
}
for(int i=0;i<k;i + + )
cout << "(" << random_centers[i].character1 << "," << random_centers[i].character2 << "," << random_centers[i].character3 << "," << random_centers[i ].character4 << ")" << endl;
for(int i=0;i<150;i + + )
cout<<i + 1<<','<<result[i].labels + 1<<endl;
}

The knowledge points of the article match the official knowledge files, and you can further learn related knowledge. Algorithm skill tree Home page Overview 55501 people are learning the system