Changes from all commits (24 commits)
4b378cb
Added e2e workflow
GlassOfWhiskey Mar 11, 2025
4513ec0
Use C++ vector instead of C array
bebora Mar 7, 2025
8642464
Add weights to diagonal terms
bebora Mar 20, 2025
99bbdff
Use annealer with floating point support
bebora Mar 24, 2025
69a0e4c
Compute Silhouette score from a separate executable
bebora Mar 24, 2025
e712242
Added score-based loop
GlassOfWhiskey Mar 28, 2025
a91c87f
Use pseudo-random parameters for clustering algorithms
bebora Mar 25, 2025
e5f2bb4
Added E4 scripts
GlassOfWhiskey Mar 31, 2025
f5e0c57
Fix overflow when using larger datasets (more than 2^16 points)
bebora Apr 8, 2025
33bd6dc
Handle edge cases in Silhouette score computation
bebora Apr 8, 2025
bba227b
Allow https url for SimulatedAnnealing submodule
bebora Apr 8, 2025
f30d92c
Generate medium size dataset
bebora Apr 9, 2025
87bfbe9
Update workflow inputs
bebora Apr 9, 2025
ebfacfb
Update default threshold
bebora Apr 10, 2025
2a58727
Launch and measure serial jobs using a bash script
bebora Apr 10, 2025
cd05777
Compute aggregate metrics for workflow approach
bebora Apr 11, 2025
eb7346b
Profile single workflow with annealing sleep
bebora Apr 12, 2025
848c475
Add option to run workflow with already compiled executables
bebora Apr 12, 2025
c7a354e
Coarse metrics for dual workflow jobs
bebora Apr 12, 2025
6d60e7d
Launch dual workflow runs
bebora Apr 12, 2025
6e5f265
Compute finer workflow metrics
bebora Apr 13, 2025
6e869fd
Update .gitignore
bebora Apr 13, 2025
ca7803d
Reduce resource requirements for compilation and Silhouette score com…
bebora Jul 13, 2025
4dca67d
Equalise compilation and execution behaviour between E4 and CINECA cl…
bebora Jul 30, 2025
11 changes: 11 additions & 0 deletions .gitignore
@@ -55,3 +55,14 @@ Thumbs.db

# Build directory
build/

# Virtual Python environment
.venv/

# Miscellaneous cache
.cache/

# StreamFlow
.streamflow
report.html
output.txt
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "SimulatedAnnealing"]
path = SimulatedAnnealing
url = ../SimulatedAnnealing.git
24 changes: 16 additions & 8 deletions Makefile
@@ -10,8 +10,10 @@ BIN_DIR = $(BUILD_DIR)/bin

LIB = -lmpi -lm
# Files
OBJS = $(OBJ_DIR)/points.o $(OBJ_DIR)/kmeans_cl.o $(OBJ_DIR)/kmeans.o $(OBJ_DIR)/dbscan.o $(OBJ_DIR)/clustering.o $(OBJ_DIR)/fastcluster.o $(OBJ_DIR)/hclust.o
EXE = $(BIN_DIR)/clustering
CLUS_OBJS = $(OBJ_DIR)/common.o $(OBJ_DIR)/kmeans_cl.o $(OBJ_DIR)/kmeans.o $(OBJ_DIR)/dbscan.o $(OBJ_DIR)/clustering.o $(OBJ_DIR)/fastcluster.o $(OBJ_DIR)/hclust.o
CLUS_EXE = $(BIN_DIR)/clustering
SIL_OBJS = $(OBJ_DIR)/common.o $(OBJ_DIR)/silhouette.o
SIL_EXE = $(BIN_DIR)/silhouette

# Compilation flags
CFLAGS = -O3 -I$(INC_DIR) -I$(MPI_INC) -L$(MPI_LIB) -I$(KMEANS_LIB_DIR) -I$(DBSCAN_LIB_DIR)
@@ -21,21 +23,24 @@ CXXFLAGS = -O3 -I$(INC_DIR) -I$(MPI_INC) -L$(MPI_LIB) -I$(KMEANS_LIB_DIR) -I$(DB
$(shell mkdir -p $(OBJ_DIR) $(BIN_DIR))

# Default target
all: $(EXE)
all: $(CLUS_EXE) $(SIL_EXE)

# Link executable
$(EXE): $(OBJS)
# Link executables
$(CLUS_EXE): $(CLUS_OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LIB)

$(SIL_EXE): $(SIL_OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LIB)

#Compile source files into build directory
$(OBJ_DIR)/points.o: $(SRC_DIR)/points.cpp
$(OBJ_DIR)/common.o: $(SRC_DIR)/common.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@

$(OBJ_DIR)/kmeans_cl.o: $(KMEANS_LIB_DIR)/kmeans_cl.c $(KMEANS_LIB_DIR)/kmeans_cl.h
$(CC) $(CFLAGS) -c $< -o $@
$(CXX) $(CXXFLAGS) -c $< -o $@

$(OBJ_DIR)/kmeans.o: $(KMEANS_LIB_DIR)/kmeans.c
$(CC) $(CFLAGS) -c $< -o $@
$(CXX) $(CXXFLAGS) -c $< -o $@

$(OBJ_DIR)/dbscan.o: $(DBSCAN_LIB_DIR)/dbscan.cpp $(DBSCAN_LIB_DIR)/dbscan.hpp
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -49,6 +54,9 @@ $(OBJ_DIR)/hclust.o: $(HIERARCHICAL_LIB_DIR)/hclust.cpp
$(OBJ_DIR)/clustering.o: $(SRC_DIR)/clustering.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@

$(OBJ_DIR)/silhouette.o: $(SRC_DIR)/silhouette.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@

# Clean up build files
clean:
rm -rf $(BUILD_DIR)
90 changes: 84 additions & 6 deletions README.md
@@ -3,24 +3,56 @@
This work is based upon the approach from [A clustering aggregation algorithm on neutral-atoms and annealing quantum processors](https://arxiv.org/pdf/2412.07558).

## How to run
Make sure to have a working MPI installation available. Its include path should either be added to $INCLUDE or $MPI_INC.

The code can be compiled using `make`. The newly built executable will be under the build/bin directory.
Make sure to have a working MPI installation available. Its include path should be added to either `$INCLUDE` or `$MPI_INC`.

The code can be compiled using `make`. Relevant executables will be under the build/bin directory.

### Clustering

The clustering executable can be run as follows:

The code can be run as follows:
```bash
mpirun -n 3 build/bin/clustering data/input/cluster_points_article.csv
```

You can optionally add another argument to save the output matrix to file:

```bash
mpirun -n 3 build/bin/clustering data/input/cluster_points_article.csv example_output.txt
```

You can add one more optional argument to save the indices of points that comprise each cluster:

```bash
mpirun -n 3 build/bin/clustering data/input/cluster_points_article.csv example_output.txt cluster_indices.txt
```
### Expected output
Running the clustering executable will create an overlap matrix in the following form:

You can add another argument (integer) to specify the seed for the PRNG used by each clustering algorithm:
```bash
mpirun -n 3 build/bin/clustering data/input/cluster_points_article.csv example_output.txt cluster_indices.txt 5
```

### Silhouette

The Silhouette score computation executable can be run as follows:

```bash
build/bin/silhouette data/input/cluster_points_article.csv cluster_indices.txt quantum_job_output.txt
```

You can add another argument if you want to save the score to a specific file, e.g.:

```bash
build/bin/silhouette data/input/cluster_points_article.csv cluster_indices.txt quantum_job_output.txt s-score.txt
```

## Expected output

### Clustering

Running the `clustering` executable will create an overlap matrix in the following form:

```
-1 8 8 8 0 0 0 0
0 -1 0 8 0 0 0 0
@@ -31,33 +63,79 @@ Running the clustering executable will create an overlap matrix in the following
0 0 0 0 0 0 -1 8
0 0 0 0 0 0 0 -1
```

Each column/row represents a possible cluster. The diagonal terms are equal to -1, while the off-diagonal ones are either 0 or a positive integer $\lambda$. Positive values denote overlaps between clusters. The value of $\lambda$ is set to the number of different clusters, 8 in this case, in order to prevent the selection of overlapping clusters.
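
To make the matrix's role concrete, the sketch below (an illustration, not code from this repository; `qubo_energy` is a hypothetical helper) shows how an annealer scores a candidate selection of clusters against it, assuming the matrix is stored as a `std::vector<std::vector<double>>` as in `clustering.cpp`:

```cpp
#include <cstddef>
#include <vector>

// Hypothetical helper: evaluates the QUBO energy of a candidate cluster
// selection against the overlap matrix q. selection[i] is 1 if cluster i
// is kept, 0 otherwise. The annealer searches for the selection that
// minimizes this energy: negative diagonal terms reward keeping clusters,
// while the positive off-diagonal penalties make it unprofitable to keep
// two clusters that share points.
double qubo_energy(const std::vector<std::vector<double>> &q,
                   const std::vector<int> &selection) {
  double energy = 0.0;
  for (std::size_t i = 0; i < q.size(); i++) {
    for (std::size_t j = 0; j < q[i].size(); j++) {
      energy += q[i][j] * selection[i] * selection[j];
    }
  }
  return energy;
}
```

For the 8x8 example above, keeping two overlapping clusters adds $\lambda = 8$ to the energy, which always outweighs the $-1$ gained from each kept cluster.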

If you choose to also save the points of each cluster, they will be in this form:

```
0,1,3,4
2,5,7
6,8,9
```

Each line corresponds to a different cluster. Each of its comma-separated values is the zero-based index of a point from the original input file.
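
As a rough illustration (this helper is not part of the repository), such a file could be parsed like this:

```cpp
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical reader: parses the cluster-indices file, one cluster per
// line, with comma-separated zero-based point indices.
std::vector<std::vector<int>> read_cluster_indices(const std::string &path) {
  std::vector<std::vector<int>> clusters;
  std::ifstream file(path);
  std::string line;
  while (std::getline(file, line)) {
    std::vector<int> cluster;
    std::istringstream ss(line);
    std::string token;
    while (std::getline(ss, token, ',')) {
      cluster.push_back(std::stoi(token));
    }
    clusters.push_back(cluster);
  }
  return clusters;
}
```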

### Silhouette

Running the `silhouette` executable will compute and output the Silhouette score of a given clustering as a decimal number between -1 and 1.
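
For reference, the textbook definition of the score is sketched below (a simplified illustration; the actual `silhouette.cpp` added by this PR may differ, e.g. in edge-case handling). For each point, cohesion $a$ is its mean distance to the other points of its own cluster, separation $b$ is the smallest mean distance to any other cluster, and the point's score is $(b - a) / \max(a, b)$; the overall score is the mean over all points.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

struct Pt {
  double x, y;
};

double dist(const Pt &a, const Pt &b) {
  return std::hypot(a.x - b.x, a.y - b.y);
}

// Illustrative Silhouette score. labels[i] is the cluster id of points[i],
// with ids in [0, num_clusters). Assumes at least two non-empty clusters.
double silhouette_score(const std::vector<Pt> &points,
                        const std::vector<int> &labels, int num_clusters) {
  const auto n = points.size();
  double total = 0.0;
  for (std::size_t i = 0; i < n; i++) {
    // Sum of distances from point i to each cluster, and cluster sizes
    // (excluding point i itself).
    std::vector<double> sum(num_clusters, 0.0);
    std::vector<int> count(num_clusters, 0);
    for (std::size_t j = 0; j < n; j++) {
      if (j == i)
        continue;
      sum[labels[j]] += dist(points[i], points[j]);
      count[labels[j]]++;
    }
    if (count[labels[i]] == 0)
      continue; // Singleton cluster: by convention s(i) = 0
    double a = sum[labels[i]] / count[labels[i]];  // Cohesion
    double b = std::numeric_limits<double>::max(); // Separation
    for (int c = 0; c < num_clusters; c++) {
      if (c != labels[i] && count[c] > 0)
        b = std::min(b, sum[c] / count[c]);
    }
    total += (b - a) / std::max(a, b);
  }
  return total / n;
}
```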

## Workflow run

It is also possible to run the whole Classical-Quantum pipeline (clustering + simulated annealing) as a workflow using the [StreamFlow](https://streamflow.di.unito.it) WMS. To do that, you need to clone this repository and all the included submodules, as follows:

```bash
git clone --recurse-submodules git@github.com:E4-Computer-Engineering/clustering-mis.git
```

The StreamFlow WMS requires Python 3.9 or newer. It can easily be installed as a Python package using the following commands:

```bash
python -m venv venv
source venv/bin/activate
pip install streamflow[report]==0.2.0.dev12
```

The workflow configuration is expressed in a declarative `streamflow.yml` file. An [example](workflow/streamflow.yml) targeting the [CINECA@Leonardo](https://leonardo-supercomputer.cineca.eu/) HPC facility is included in this repository. Modify it by adding your credentials (`username` and `sshKey`) and a path to a working directory in a shared portion of the Leonardo filesystem (e.g., in your `$HOME` folder).

At this point, simply run the workflow using this command:

```bash
streamflow run --name smart-hpc-qc workflow/streamflow.yml
```

When the workflow completes successfully, you should find an `output.txt` file containing the results of the simulated annealing phase. In addition, the following command generates a report of the workflow run:

```bash
streamflow report --file workflow/streamflow.yml smart-hpc-qc
```

## TODO

- [ ] Add brief description with images

## Suggested dev setup

It is recommended to use [VS Code](https://code.visualstudio.com/).

### Linting and autocompletion

IntelliSense from the Microsoft-provided C++ and Makefile extensions reports spurious errors even when the code compiles.
It is recommended to use the [clangd extension](https://marketplace.visualstudio.com/items?itemName=llvm-vs-code-extensions.vscode-clangd) instead.

Install the clangd extension and allow it to disable IntelliSense. Install the clangd language server if prompted.

Then install [bear](https://github.com/rizsotto/Bear) and, from the project root directory, run the following:

```bash
make clean; bear -- make
```

This will create a `compile_commands.json` file that is used by clangd to correctly inspect code.
Run "clangd: Restart language server" from the Command Palette (Ctrl+Shift+P) to read the newly created file.

You need to run the commands above again and restart the language server after each Makefile change.

### Formatting

The clangd extension from the previous section can also format C/C++ code. Invoke it from Command Palette -> Format Document.
1 change: 1 addition & 0 deletions SimulatedAnnealing
Submodule SimulatedAnnealing added at 7a1528
62 changes: 29 additions & 33 deletions clustering.cpp
@@ -1,20 +1,23 @@
#include <cmath>
#include <cstdlib>
#include <functional>
#include <mpi.h>
#include <numeric>
#include <ostream>
#include <set>
#include <span>
#include <stdlib.h>

#include "points.h"
#include "common.h"
#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

using ClusFuncType =
std::function<int(int, const char *, const point *, int, int *, int)>;

// Short-circuited sets intersection
bool have_shared_elem(const std::set<int> &x, const std::set<int> &y) {
auto i = x.begin();
@@ -30,13 +33,23 @@ bool have_shared_elem(const std::set<int> &x, const std::set<int> &y) {
return false;
}

std::vector<std::vector<int>>
std::vector<std::vector<double>>
create_overlap_matrix(const std::vector<std::set<int>> &clusters) {
// We want to give a different weight to each diagonal term. Otherwise, if
// all clusters are considered equally good, the annealer will choose a
// solution with many small clusters.
std::vector<size_t> sizes(clusters.size());
std::transform(clusters.begin(), clusters.end(), sizes.begin(),
[](const auto &cl) { return cl.size(); });
// The biggest cluster will have a weight of 1. The others will be
// normalized to be smaller, but in the range 0 < x < 1.
auto max_size = *std::max_element(sizes.begin(), sizes.end());

auto n = clusters.size();
auto penalty = n;

// Initialize empty matrix
std::vector<std::vector<int>> res(n, std::vector<int>(n, 0));
std::vector<std::vector<double>> res(n, std::vector<double>(n, 0.0));

// Add penalty to overlapping clusters
for (auto i = 0; i < n - 1; i++) {
@@ -49,13 +62,13 @@ create_overlap_matrix(const std::vector<std::set<int>> &clusters) {

// Set diagonal terms
for (auto i = 0; i < n; i++) {
res[i][i] = -1;
res[i][i] = -(double)sizes[i] / max_size;
}
return res;
}

void write_matrix(std::ostream &out_stream,
const std::vector<std::vector<int>> &m) {
const std::vector<std::vector<double>> &m) {
for (auto i : m) {
for (auto j = i.begin(); j != i.end(); j++) {
if (j != i.begin()) {
@@ -67,12 +80,12 @@ void write_matrix(std::ostream &out_stream,
}
}

void print_matrix(const std::vector<std::vector<int>> &m) {
void print_matrix(const std::vector<std::vector<double>> &m) {
write_matrix(std::cout, m);
}

int save_matrix(const std::string &file_name,
const std::vector<std::vector<int>> &m) {
const std::vector<std::vector<double>> &m) {
std::ofstream file(file_name);
if (!file) {
std::cerr << "Error opening file." << std::endl;
@@ -103,29 +116,12 @@ int save_clusters(const std::string &file_name,
}
}

void read_points(std::istream &file, std::vector<point> &points) {
std::string line;
std::getline(file, line); // Skip first line

while (std::getline(file, line)) {
std::istringstream ss(line);
point p;
char comma;

// Read x and y, assuming CSV format
if (ss >> p.x >> comma >> p.y) {
points.emplace_back(p);
}
}
}

// Return total number of clusters identified by each algorithm
int run_clustering_algorithms(int my_rank, int num_methods_proc,
const std::vector<point> &pts, int num_methods,
const std::vector<std::string> &methods,
int (*functions[])(int, const char *,
const point *, int, int *),
std::vector<int> &assigned_clusters) {
std::vector<ClusFuncType> &functions,
std::vector<int> &assigned_clusters, int seed) {
auto num_points = pts.size();

// Each process can run more than one clustering algorithm.
@@ -149,7 +145,7 @@

auto res = functions[global_method_idx](
my_rank, current_method_name.c_str(), pts.data(), num_points,
assigned_clusters.data() + local_method_idx * num_points);
assigned_clusters.data() + local_method_idx * num_points, seed);

auto begin = assigned_clusters.begin() + local_method_idx * num_points;
auto end = begin + num_points;
@@ -175,11 +171,9 @@ int main(int argc, char **argv) {
int my_rank, num_processes;

std::vector<std::string> methods = {"kmeans", "dbscan", "hclust"};
std::vector<ClusFuncType> functions = {kmeans, dbscan, hclust};
int num_methods = methods.size();

int (*functions[])(int, const char *, const point *, int,
int *) = {kmeans, dbscan, hclust};

MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
@@ -202,7 +196,7 @@
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
}

read_points(file, pts);
pts = read_points(file);
num_points = pts.size();
}

@@ -219,11 +213,13 @@
auto num_methods_proc =
std::ceil((float)num_methods / (float)num_processes);

int seed = argc > 4 ? std::stoi(argv[4]) : 0;

// Run the algorithms assigned to this process and flatten their results
std::vector<int> assigned_clusters(num_methods_proc * num_points, 0);
int ncl =
run_clustering_algorithms(my_rank, num_methods_proc, pts, num_methods,
methods, functions, assigned_clusters);
methods, functions, assigned_clusters, seed);

std::vector<int> all_res; // Aggregation of all clustering results
// across all processes