From 7f860c41cd3dc5f03d236f83250b2153b7ff6bf6 Mon Sep 17 00:00:00 2001
From: Akdag
Date: Tue, 17 Dec 2024 12:48:39 +0100
Subject: [PATCH] Refined comments for better readability

---
 heat/cluster/_kcluster.py |  63 ++++++-------
 heat/cluster/mytest.py    | 180 --------------------------------------
 2 files changed, 28 insertions(+), 215 deletions(-)
 delete mode 100644 heat/cluster/mytest.py

diff --git a/heat/cluster/_kcluster.py b/heat/cluster/_kcluster.py
index 6029cc721..c94cf7e04 100644
--- a/heat/cluster/_kcluster.py
+++ b/heat/cluster/_kcluster.py
@@ -137,7 +137,8 @@ def _initialize_cluster_centers(
         elif self.init == "probability_based":
             # First, check along which axis the data is sliced
             if x.split is None or x.split == 0:
-                # Define a list of random, uniformly distributed probabilities, which is later used to sample the centroids
+                # Define a list of random, uniformly distributed probabilities,
+                # which is later used to sample the centroids
                 sample = ht.random.rand(x.shape[0], split=x.split)
                 # Define a random integer serving as a label to pick the first centroid randomly
                 init_idx = ht.random.randint(0, x.shape[0] - 1).item()
@@ -146,49 +147,43 @@ def _initialize_cluster_centers(
                 # We assume that the centroids fit into the memory of a single GPU
                 centroids = ht.expand_dims(x[init_idx, :].resplit_(None), axis=0)
                 # Calculate the initial cost of the clustering after the first centroid selection
-                # and use it as an indicator for the number of necessary iterations
-                # --> First calculate the Euclidean distance between data points x and initial centroids
-                # output format: tensor
+                # and use it as an indicator for the order of magnitude of the number of necessary iterations
                 init_distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True)
+                # --> init_distance calculates the Euclidean distance between data points x and initial centroids
+                # output format: tensor
+                init_min_distance = init_distance.min(axis=1)
                 # --> Pick the minimal distance of the data points to each centroid
                 # output format: vector
-                init_min_distance = init_distance.min(axis=1)
+                init_cost = init_min_distance.sum()
                 # --> Now calculate the cost
                 # output format: scalar
-                init_cost = init_min_distance.sum()
+                #
                 # Iteratively fill the tensor storing the centroids
                 for _ in ht.arange(0, iter_multiplier * ht.log(init_cost)):
                     # Calculate the distance between data points and the current set of centroids
                     distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True)
                     min_distance = distance.min(axis=1)
                     # Sample each point in the data to a new set of centroids
+                    prob = oversampling * min_distance / min_distance.sum()
                     # --> probability distribution with oversampling factor
                     # output format: vector
-                    prob = oversampling * min_distance / min_distance.sum()
+                    idx = ht.where(sample <= prob)
                     # --> choose indices to sample the data according to prob
                     # output format: vector
-                    idx = ht.where(sample <= prob)
+                    local_data = x[idx].resplit_(centroids.split)
+                    # --> pick the data points that are identified as possible centroids and make sure
+                    # that data points and centroids are split in the same way
+                    # output format: vector
+                    centroids = ht.row_stack((centroids, local_data))
                     # --> stack the data points with these indices to the DNDarray of centroids
                     # output format: tensor
-                    """print(f"idx={idx}")
-                    if idx.shape[0]!=0:
-                        print(f"idx={idx}, idx.shape={idx.shape}, x[idx]={x[idx]}")
-                        local_data= x[idx].resplit_(centroids.split) # make sure, that the data points we append to centroids are split in the same way
-                        centroids=ht.row_stack((centroids,local_data)) """
-                    # print(f"x[idx]={x[idx]}, x[idx].shape={x[idx].shape}, process= {ht.MPI_WORLD.rank}\n")
-                    # print(f"centroids.split={centroids.split}, process= {ht.MPI_WORLD.rank}\n")
-                    # if idx.shape[0]!=0:
-                    local_data = x[idx].resplit_(
-                        centroids.split
-                    )  # make sure, that the data points we append to centroids are split in the same way
-                    # local_data=x[idx]
-                    # print(f"x[1]={x[1]}, local_data={local_data}, process= {ht.MPI_WORLD.rank}\n")
-                    centroids = ht.row_stack((centroids, local_data))
                 # Evaluate distance between final centroids and data points
                 if centroids.shape[0] <= self.n_clusters:
                     raise ValueError(
-                        "The oversampling factor and/or the number of iterations are chosen two small for the initialization of cluster centers."
+                        "The oversampling factor and/or the number of iterations are chosen "
+                        "too small for the initialization of cluster centers."
                     )
+                # Evaluate the distance between data and the final set of centroids for the initialization
                 final_distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True)
                 # For each data point in x, find the index of the centroid that is closest
                 final_idx = ht.argmin(final_distance, axis=1)
@@ -199,12 +194,11 @@ def _initialize_cluster_centers(
                     weights[i] = ht.sum(final_idx == i)
                 # Recluster the oversampled centroids using standard k-means ++ (here we use the
                 # already implemented version in torch)
-                # --> first transform relevant arrays into torch tensors
                 centroids = centroids.resplit_(None)
                 centroids = centroids.larray
                 weights = weights.resplit_(None)
                 weights = weights.larray
-                # --> apply k-means ++
+                # --> first transform relevant arrays into torch tensors
                 if ht.MPI_WORLD.rank == 0:
                     batch_kmeans = _kmex(
                         centroids,
@@ -216,28 +210,27 @@ def _initialize_cluster_centers(
                         random_state=None,
                         weights=weights,
                     )
-                    reclustered_centroids = batch_kmeans[0]  # access the reclustered centroids
+                    # --> apply standard k-means ++
+                    # Note: as we only recluster the centroids for initialization with standard k-means ++,
+                    # this list of centroids can also be used to initialize k-medians and k-medoids
+                    reclustered_centroids = batch_kmeans[0]
+                    # --> access the reclustered centroids
                 else:
                     # ensure that all processes have the same data
-                    # tensor with zeros that has the same size as reclustered centroids, in order to to allocate memory with the correct type (necessary for broadcast)
                     reclustered_centroids = torch.zeros(
                         (self.n_clusters, centroids.shape[1]),
                         dtype=x.dtype.torch_type(),
                         device=centroids.device,
                     )
+                    # --> tensor with zeros that has the same size as reclustered centroids, in order to
+                    # allocate memory with the correct type in all processes (necessary for broadcast)
                 ht.MPI_WORLD.Bcast(
                     reclustered_centroids, root=0
                 )  # by default it is broadcasted from process 0
-                # -------------------------------------------------------------------------------
-                # print(f"reclustered centroids in initilialize_cluster_centers (after applying kmex)={reclustered_centroids}, process= {ht.MPI_WORLD.rank}\n")
-                # -------------------------------------------------------------------------------
-                # --> transform back to DNDarray
                 reclustered_centroids = ht.array(reclustered_centroids, split=x.split)
-                # final result
+                # --> transform back to DNDarray
                 self._cluster_centers = reclustered_centroids
-                # -------------------------------------------------------------------------------
-                # print(f"reclustered centroids in initilialize_cluster_centers (final result)={reclustered_centroids}, process= {ht.MPI_WORLD.rank}\n")
-                # -------------------------------------------------------------------------------
+                # --> final result for initialized cluster centers
             else:
                 raise NotImplementedError("Not implemented for other splitting-axes")
diff --git a/heat/cluster/mytest.py b/heat/cluster/mytest.py
deleted file mode 100644
index 6a5783fa0..000000000
--- a/heat/cluster/mytest.py
+++ /dev/null
@@ -1,180 +0,0 @@
-"""
-Some tests to check the funtionality of the k-means clustering algortihm
-"""
-
-import heat as ht
-import numpy as np
-import torch
-import time
-
-ht.use_device("gpu")
-# Convert data into DNDarrays
-# The shape of this data is (3,5), i.e.,
-# 3 data points, each consisting of 5 features
-x = [[1, 2, 3, 4, 5], [10, 20, 30, 40, 50], [0, 2, 3, 4, 4]]
-unit = ht.ones((3, 5), split=None)
-unitvector = ht.ones((1, 5), split=None)
-v = [[20, 30, 40, 5, 6], [11, 22, 33, 44, 55], [102, 204, 303, 406, 507], [30, 44, 53, 66, 77]]
-y = ht.array(x)
-w = ht.array(v)
-# Split the data along different axes
-y0 = ht.array(x, split=0)
-y1 = ht.array(x, split=1)
-# Convert data, labels, and centers from heat tensors to numpy arrays
-# larray
-y_as_np = y0.resplit_(None).larray.cpu().numpy()
-# output the shape
-y_shape0 = y0.shape
-# print the number of features in each data point
-n_features = y0.shape[1]
-# calculate Euclidean distance between each
-# row-vector in y and w
-# !!! Important !!!
-# ---> the arguments of cdist must be 2D tensors, i.e., ht.array([[1,2,3]]) instead of ht.array([1,2,3])
-dist = ht.spatial.distance.cdist(y, w)
-# pick the minimum value of a tensor along the axis=1
-min_dist = dist.min(axis=0)
-# define a tensor with the same dimension as y and fill it with zeros
-centroids = ht.zeros((y.shape[0], y.shape[1]))
-# replace the 0th row vector of "centroids" by a randomly chosen row vector of y
-sample = ht.random.randint(0, y.shape[0] - 1).item()
-centroids[0, :] = y[sample]
-# Useful for degubbing: keep track auf matrix shapes and the process (i.e., the gpu) the data is assigned to
-print(f"centroids.shape{centroids.shape}, process= {ht.MPI_WORLD.rank}\n")
-# stack two vectors together
-# a=ht.array([1,2,3,4])
-# b=ht.array([10,20,30,40])
-# a=ht.array(2)
-# b=ht.array(3)
-# stacked_ab=ht.stack((a,b),axis=0)
-# add dimensions
-a_vector = ht.array([1, 2, 3, 4])
-new_x = ht.expand_dims(a_vector, axis=0)  # output: [[1,2,3,4]]
-# stack two vectors together and flatten, so that the outcome is similar to the command "append"
-a = ht.array([[1, 2, 3, 4], [1, 5, 3, 4], [1, 2, 3, 42]])
-# b=ht.array([[10,20,30,40],[10,20,30,40],[1,2,3,4]])
-# stacked_ab=ht.stack((a,b),axis=0)
-# reshaped_stacked_ab=ht.reshape(stacked_ab,(stacked_ab.shape[0]*stacked_ab.shape[1],stacked_ab.shape[2]))
-b = ht.array([[10, 20, 30, 40], [10, 20, 30, 40]])
-stacked_ab = ht.row_stack((a, b))
-# create random numbers between 0 and 1
-random = ht.random.rand(y.shape[0])
-# translate into a uniform probability distribution
-random_prob = random / random.sum()
-# find the indices for which the condition test1
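
As a standalone illustration of the oversampling loop whose comments are reworked above, the following is a minimal sketch using only the Heat calls that appear in the patch (ht.spatial.distance.cdist, ht.where, ht.row_stack, ht.expand_dims). The toy data x, its size, and the oversampling value are placeholders, not the module's actual inputs:

    import heat as ht

    # Toy inputs (placeholders): 100 samples with 5 features, distributed along axis 0,
    # and an illustrative oversampling factor
    x = ht.random.rand(100, 5, split=0)
    oversampling = 2.0

    # Uniform random numbers later compared against the acceptance probabilities
    sample = ht.random.rand(x.shape[0], split=x.split)

    # First centroid: one randomly chosen data point, kept unsplit
    init_idx = ht.random.randint(0, x.shape[0] - 1).item()
    centroids = ht.expand_dims(x[init_idx, :].resplit_(None), axis=0)

    # One oversampling round
    distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True)
    min_distance = distance.min(axis=1)
    # Acceptance probability of each point, scaled by the oversampling factor
    prob = oversampling * min_distance / min_distance.sum()
    # Points whose random number falls below their probability become candidate centroids
    idx = ht.where(sample <= prob)
    local_data = x[idx].resplit_(centroids.split)
    centroids = ht.row_stack((centroids, local_data))
    print(centroids.shape)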
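The weighting step that precedes the torch-based reclustering can be sketched in the same spirit. Only the cdist/argmin/sum lines mirror the patch; allocating weights as an unsplit ht.zeros vector is an assumption made here for illustration:

    # Assign every point to its closest oversampled centroid and count the assignments;
    # the counts act as weights for the subsequent k-means ++ reclustering
    final_distance = ht.spatial.distance.cdist(x, centroids, quadratic_expansion=True)
    final_idx = ht.argmin(final_distance, axis=1)
    weights = ht.zeros(centroids.shape[0], split=None)  # assumed allocation, one weight per candidate
    for i in range(centroids.shape[0]):
        weights[i] = ht.sum(final_idx == i)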
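Finally, the broadcast pattern at the end of the hunk (compute on rank 0, allocate a matching buffer on the other ranks, then Bcast and wrap back into a DNDarray) can be isolated as follows; the random payload on rank 0 is a stand-in for the _kmex result, and the shapes are illustrative:

    import heat as ht
    import torch

    n_clusters, n_features = 4, 5
    if ht.MPI_WORLD.rank == 0:
        # Stand-in for the reclustered centroids computed on rank 0
        reclustered = torch.rand(n_clusters, n_features)
    else:
        # Zero buffer with matching shape and dtype so the broadcast has a target on every rank
        reclustered = torch.zeros(n_clusters, n_features)
    ht.MPI_WORLD.Bcast(reclustered, root=0)  # process 0 is the broadcast root
    reclustered = ht.array(reclustered, split=0)  # back to a DNDarray, split across processes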