Changes from all commits (24 commits)
4b378cb
Added e2e workflow
GlassOfWhiskey Mar 11, 2025
4513ec0
Use C++ vector instead of C array
bebora Mar 7, 2025
8642464
Add weights to diagonal terms
bebora Mar 20, 2025
99bbdff
Use annealer with floating point support
bebora Mar 24, 2025
69a0e4c
Compute Silhouette score from a separate executable
bebora Mar 24, 2025
e712242
Added score-based loop
GlassOfWhiskey Mar 28, 2025
a91c87f
Use pseudo-random parameters for clustering algorithms
bebora Mar 25, 2025
e5f2bb4
Added E4 scripts
GlassOfWhiskey Mar 31, 2025
f5e0c57
Fix overflow when using larger datasets (more than 2^16 points)
bebora Apr 8, 2025
33bd6dc
Handle edge cases in Silhouette score computation
bebora Apr 8, 2025
bba227b
Allow https url for SimulatedAnnealing submodule
bebora Apr 8, 2025
f30d92c
Generate medium size dataset
bebora Apr 9, 2025
87bfbe9
Update workflow inputs
bebora Apr 9, 2025
ebfacfb
Update default threshold
bebora Apr 10, 2025
2a58727
Launch and measure serial jobs using a bash script
bebora Apr 10, 2025
cd05777
Compute aggregate metrics for workflow approach
bebora Apr 11, 2025
eb7346b
Profile single workflow with annealing sleep
bebora Apr 12, 2025
848c475
Add option to run workflow with already compiled executables
bebora Apr 12, 2025
c7a354e
Coarse metrics for dual workflow jobs
bebora Apr 12, 2025
6d60e7d
Launch dual workflow runs
bebora Apr 12, 2025
6e5f265
Compute finer workflow metrics
bebora Apr 13, 2025
6e869fd
Update .gitignore
bebora Apr 13, 2025
ca7803d
Reduce resource requirements for compilation and Silhouette score com…
bebora Jul 13, 2025
4dca67d
Equalise compilation and execution behaviour between E4 and CINECA cl…
bebora Jul 30, 2025
11 changes: 11 additions & 0 deletions .gitignore
@@ -55,3 +55,14 @@ Thumbs.db

# Build directory
build/

# Virtual Python environment
.venv/

# Miscellaneous cache
.cache/

# StreamFlow
.streamflow
report.html
output.txt
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "SimulatedAnnealing"]
path = SimulatedAnnealing
url = ../SimulatedAnnealing.git
24 changes: 16 additions & 8 deletions Makefile
@@ -10,8 +10,10 @@ BIN_DIR = $(BUILD_DIR)/bin

LIB = -lmpi -lm
# Files
OBJS = $(OBJ_DIR)/points.o $(OBJ_DIR)/kmeans_cl.o $(OBJ_DIR)/kmeans.o $(OBJ_DIR)/dbscan.o $(OBJ_DIR)/clustering.o $(OBJ_DIR)/fastcluster.o $(OBJ_DIR)/hclust.o
EXE = $(BIN_DIR)/clustering
CLUS_OBJS = $(OBJ_DIR)/common.o $(OBJ_DIR)/kmeans_cl.o $(OBJ_DIR)/kmeans.o $(OBJ_DIR)/dbscan.o $(OBJ_DIR)/clustering.o $(OBJ_DIR)/fastcluster.o $(OBJ_DIR)/hclust.o
CLUS_EXE = $(BIN_DIR)/clustering
SIL_OBJS = $(OBJ_DIR)/common.o $(OBJ_DIR)/silhouette.o
SIL_EXE = $(BIN_DIR)/silhouette

# Compilation flags
CFLAGS = -O3 -I$(INC_DIR) -I$(MPI_INC) -L$(MPI_LIB) -I$(KMEANS_LIB_DIR) -I$(DBSCAN_LIB_DIR)
@@ -21,21 +23,24 @@ CXXFLAGS = -O3 -I$(INC_DIR) -I$(MPI_INC) -L$(MPI_LIB) -I$(KMEANS_LIB_DIR) -I$(DB
$(shell mkdir -p $(OBJ_DIR) $(BIN_DIR))

# Default target
all: $(EXE)
all: $(CLUS_EXE) $(SIL_EXE)

# Link executable
$(EXE): $(OBJS)
# Link executables
$(CLUS_EXE): $(CLUS_OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LIB)

$(SIL_EXE): $(SIL_OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LIB)

#Compile source files into build directory
$(OBJ_DIR)/points.o: $(SRC_DIR)/points.cpp
$(OBJ_DIR)/common.o: $(SRC_DIR)/common.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@

$(OBJ_DIR)/kmeans_cl.o: $(KMEANS_LIB_DIR)/kmeans_cl.c $(KMEANS_LIB_DIR)/kmeans_cl.h
$(CC) $(CFLAGS) -c $< -o $@
$(CXX) $(CXXFLAGS) -c $< -o $@

$(OBJ_DIR)/kmeans.o: $(KMEANS_LIB_DIR)/kmeans.c
$(CC) $(CFLAGS) -c $< -o $@
$(CXX) $(CXXFLAGS) -c $< -o $@

$(OBJ_DIR)/dbscan.o: $(DBSCAN_LIB_DIR)/dbscan.cpp $(DBSCAN_LIB_DIR)/dbscan.hpp
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -49,6 +54,9 @@ $(OBJ_DIR)/hclust.o: $(HIERARCHICAL_LIB_DIR)/hclust.cpp
$(OBJ_DIR)/clustering.o: $(SRC_DIR)/clustering.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@

$(OBJ_DIR)/silhouette.o: $(SRC_DIR)/silhouette.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@

# Clean up build files
clean:
rm -rf $(BUILD_DIR)
90 changes: 84 additions & 6 deletions README.md
@@ -3,24 +3,56 @@
This work is based upon the approach from [A clustering aggregation algorithm on neutral-atoms and annealing quantum processors](https://arxiv.org/pdf/2412.07558).

## How to run
Make sure to have a working MPI installation available. Its include path should either be added to $INCLUDE or $MPI_INC.

The code can be compiled using `make`. The newly built executable will be under the build/bin directory.
Make sure to have a working MPI installation available. Its include path should be added to either `$INCLUDE` or `$MPI_INC`.

The code can be compiled using `make`. Relevant executables will be under the build/bin directory.

### Clustering

The clustering executable can be run as follows:

The code can be run as follows:
```bash
mpirun -n 3 build/bin/clustering data/input/cluster_points_article.csv
```

You can optionally add another argument to save the output matrix to file:

```bash
mpirun -n 3 build/bin/clustering data/input/cluster_points_article.csv example_output.txt
```

You can add one more optional argument to save the indices of points that comprise each cluster:

```bash
mpirun -n 3 build/bin/clustering data/input/cluster_points_article.csv example_output.txt cluster_indices.txt
```
### Expected output
Running the clustering executable will create an overlap matrix in the following form:

You can add another argument (integer) to specify the seed for the PRNG used by each clustering algorithm:
```bash
mpirun -n 3 build/bin/clustering data/input/cluster_points_article.csv example_output.txt cluster_indices.txt 5
```

### Silhouette

The Silhouette score computation executable can be run as follows:

```bash
build/bin/silhouette data/input/cluster_points_article.csv cluster_indices.txt quantum_job_output.txt
```

You can add another argument if you want to save the score to a specific file, e.g.:

```bash
build/bin/silhouette data/input/cluster_points_article.csv cluster_indices.txt quantum_job_output.txt s-score.txt
```

## Expected output

### Clustering

Running the `clustering` executable will create an overlap matrix in the following form:

```
-1 8 8 8 0 0 0 0
0 -1 0 8 0 0 0 0
@@ -31,33 +63,79 @@ Running the clustering executable will create an overlap matrix in the following
0 0 0 0 0 0 -1 8
0 0 0 0 0 0 0 -1
```

Each column/row represents a possible cluster. The diagonal terms are equal to -1, while the off-diagonal ones are either 0 or a positive integer $\lambda$. Positive values denote overlaps between clusters. The value of $\lambda$ is set to the number of different clusters, 8 in this case, in order to prevent the selection of overlapping clusters.
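
To make the matrix's role concrete, the sketch below (an illustration, not code from this repository; `qubo_energy` is a hypothetical helper) shows how an annealer scores a candidate selection of clusters against it, assuming the matrix is stored as a `std::vector<std::vector<double>>` as in `clustering.cpp`:

```cpp
#include <cstddef>
#include <vector>

// Hypothetical helper: evaluates the QUBO energy of a candidate cluster
// selection against the overlap matrix q. selection[i] is 1 if cluster i
// is kept, 0 otherwise. The annealer searches for the selection that
// minimizes this energy: negative diagonal terms reward keeping clusters,
// while the positive off-diagonal penalties make it unprofitable to keep
// two clusters that share points.
double qubo_energy(const std::vector<std::vector<double>> &q,
                   const std::vector<int> &selection) {
  double energy = 0.0;
  for (std::size_t i = 0; i < q.size(); i++) {
    for (std::size_t j = 0; j < q[i].size(); j++) {
      energy += q[i][j] * selection[i] * selection[j];
    }
  }
  return energy;
}
```

For the 8x8 example above, keeping two overlapping clusters adds $\lambda = 8$ to the energy, which always outweighs the $-1$ gained from each kept cluster.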

If you choose to also save the points of each cluster, they will be in this form:

```
0,1,3,4
2,5,7
6,8,9
```

Each line corresponds to a different cluster. Each of its comma-separated values is the zero-based index of a point from the original input file.
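
As a rough illustration (this helper is not part of the repository), such a file could be parsed like this:

```cpp
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical reader: parses the cluster-indices file, one cluster per
// line, with comma-separated zero-based point indices.
std::vector<std::vector<int>> read_cluster_indices(const std::string &path) {
  std::vector<std::vector<int>> clusters;
  std::ifstream file(path);
  std::string line;
  while (std::getline(file, line)) {
    std::vector<int> cluster;
    std::istringstream ss(line);
    std::string token;
    while (std::getline(ss, token, ',')) {
      cluster.push_back(std::stoi(token));
    }
    clusters.push_back(cluster);
  }
  return clusters;
}
```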

### Silhouette

Running the `silhouette` executable will compute and output the Silhouette score of a given clustering as a decimal number between -1 and 1.
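
For reference, the textbook definition of the score is sketched below (a simplified illustration; the actual `silhouette.cpp` added by this PR may differ, e.g. in edge-case handling). For each point, cohesion $a$ is its mean distance to the other points of its own cluster, separation $b$ is the smallest mean distance to any other cluster, and the point's score is $(b - a) / \max(a, b)$; the overall score is the mean over all points.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

struct Pt {
  double x, y;
};

double dist(const Pt &a, const Pt &b) {
  return std::hypot(a.x - b.x, a.y - b.y);
}

// Illustrative Silhouette score. labels[i] is the cluster id of points[i],
// with ids in [0, num_clusters). Assumes at least two non-empty clusters.
double silhouette_score(const std::vector<Pt> &points,
                        const std::vector<int> &labels, int num_clusters) {
  const auto n = points.size();
  double total = 0.0;
  for (std::size_t i = 0; i < n; i++) {
    // Sum of distances from point i to each cluster, and cluster sizes
    // (excluding point i itself).
    std::vector<double> sum(num_clusters, 0.0);
    std::vector<int> count(num_clusters, 0);
    for (std::size_t j = 0; j < n; j++) {
      if (j == i)
        continue;
      sum[labels[j]] += dist(points[i], points[j]);
      count[labels[j]]++;
    }
    if (count[labels[i]] == 0)
      continue; // Singleton cluster: by convention s(i) = 0
    double a = sum[labels[i]] / count[labels[i]];  // Cohesion
    double b = std::numeric_limits<double>::max(); // Separation
    for (int c = 0; c < num_clusters; c++) {
      if (c != labels[i] && count[c] > 0)
        b = std::min(b, sum[c] / count[c]);
    }
    total += (b - a) / std::max(a, b);
  }
  return total / n;
}
```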

## Workflow run

It is also possible to run the whole Classical-Quantum pipeline (clustering + simulated annealing) as a workflow using the [StreamFlow](https://streamflow.di.unito.it) WMS. To do that, you need to clone this repository and all the included submodules, as follows:

```bash
git clone --recurse-submodules git@github.com:E4-Computer-Engineering/clustering-mis.git
```

The StreamFlow WMS requires Python 3.9 or newer. It can easily be installed as a Python package using the following commands:

```bash
python -m venv venv
source venv/bin/activate
pip install streamflow[report]==0.2.0.dev12
```

The workflow configuration is expressed in a declarative `streamflow.yml` file. An [example](workflow/streamflow.yml) targeting the [CINECA@Leonardo](https://leonardo-supercomputer.cineca.eu/) HPC facility is included in this repository. Modify it by adding your credentials (`username` and `sshKey`) and a path to a working directory in a shared portion of the Leonardo filesystem (e.g., in your `$HOME` folder).

At this point, simply run the workflow using this command:

```bash
streamflow run --name smart-hpc-qc workflow/streamflow.yml
```

When the workflow completes successfully, you should find an `output.txt` file containing the results of the simulated annealing phase. In addition, the following command generates a report of the workflow run:

```bash
streamflow report --file workflow/streamflow.yml smart-hpc-qc
```

## TODO

- [ ] Add brief description with images

## Suggested dev setup

It is recommended to use [VS Code](https://code.visualstudio.com/).

### Linting and autocompletion

IntelliSense from the Microsoft-provided C++ and Makefile extensions reports spurious errors even when the code compiles.
It is recommended to use the [clangd extension](https://marketplace.visualstudio.com/items?itemName=llvm-vs-code-extensions.vscode-clangd) instead.

Install the clangd extension and allow it to disable IntelliSense. Install the clangd language server if prompted.

Then install [bear](https://github.com/rizsotto/Bear) and, from the project root directory, run the following:

```bash
make clean; bear -- make
```

This will create a `compile_commands.json` file that is used by clangd to correctly inspect code.
Run "clangd: Restart language server" from the Command Palette (Ctrl+Shift+P) to read the newly created file.

You need to run the commands above again and restart the language server after each Makefile change.

### Formatting

The clangd extension from the previous section can also format C/C++ code. Invoke it from Command Palette -> Format Document.
1 change: 1 addition & 0 deletions SimulatedAnnealing
Submodule SimulatedAnnealing added at 7a1528
62 changes: 29 additions & 33 deletions clustering.cpp
@@ -1,20 +1,23 @@
#include <cmath>
#include <cstdlib>
#include <functional>
#include <mpi.h>
#include <numeric>
#include <ostream>
#include <set>
#include <span>
#include <stdlib.h>

#include "points.h"
#include "common.h"
#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

using ClusFuncType =
std::function<int(int, const char *, const point *, int, int *, int)>;

// Short-circuited sets intersection
bool have_shared_elem(const std::set<int> &x, const std::set<int> &y) {
auto i = x.begin();
@@ -30,13 +33,23 @@ bool have_shared_elem(const std::set<int> &x, const std::set<int> &y) {
return false;
}

std::vector<std::vector<int>>
std::vector<std::vector<double>>
create_overlap_matrix(const std::vector<std::set<int>> &clusters) {
// We want to give a different weight to each diagonal term. Otherwise, if
// all clusters are considered equally good, the annealer will choose a
// solution with many small clusters.
std::vector<size_t> sizes(clusters.size());
std::transform(clusters.begin(), clusters.end(), sizes.begin(),
[](const auto &cl) { return cl.size(); });
// The biggest cluster will have a weight of 1. The others will be
// normalized to be smaller, but in the range 0 < x < 1.
auto max_size = *std::max_element(sizes.begin(), sizes.end());

auto n = clusters.size();
auto penalty = n;

// Initialize empty matrix
std::vector<std::vector<int>> res(n, std::vector<int>(n, 0));
std::vector<std::vector<double>> res(n, std::vector<double>(n, 0.0));

// Add penalty to overlapping clusters
for (auto i = 0; i < n - 1; i++) {
@@ -49,13 +62,13 @@ create_overlap_matrix(const std::vector<std::set<int>> &clusters) {

// Set diagonal terms
for (auto i = 0; i < n; i++) {
res[i][i] = -1;
res[i][i] = -(double)sizes[i] / max_size;
}
return res;
}

void write_matrix(std::ostream &out_stream,
const std::vector<std::vector<int>> &m) {
const std::vector<std::vector<double>> &m) {
for (auto i : m) {
for (auto j = i.begin(); j != i.end(); j++) {
if (j != i.begin()) {
@@ -67,12 +80,12 @@ void write_matrix(std::ostream &out_stream,
}
}

void print_matrix(const std::vector<std::vector<int>> &m) {
void print_matrix(const std::vector<std::vector<double>> &m) {
write_matrix(std::cout, m);
}

int save_matrix(const std::string &file_name,
const std::vector<std::vector<int>> &m) {
const std::vector<std::vector<double>> &m) {
std::ofstream file(file_name);
if (!file) {
std::cerr << "Error opening file." << std::endl;
@@ -103,29 +116,12 @@ int save_clusters(const std::string &file_name,
}
}

void read_points(std::istream &file, std::vector<point> &points) {
std::string line;
std::getline(file, line); // Skip first line

while (std::getline(file, line)) {
std::istringstream ss(line);
point p;
char comma;

// Read x and y, assuming CSV format
if (ss >> p.x >> comma >> p.y) {
points.emplace_back(p);
}
}
}

// Return total number of clusters identified by each algorithm
int run_clustering_algorithms(int my_rank, int num_methods_proc,
const std::vector<point> &pts, int num_methods,
const std::vector<std::string> &methods,
int (*functions[])(int, const char *,
const point *, int, int *),
std::vector<int> &assigned_clusters) {
std::vector<ClusFuncType> &functions,
std::vector<int> &assigned_clusters, int seed) {
auto num_points = pts.size();

// Each process can run more than one clustering algorithm.
@@ -149,7 +145,7 @@

auto res = functions[global_method_idx](
my_rank, current_method_name.c_str(), pts.data(), num_points,
assigned_clusters.data() + local_method_idx * num_points);
assigned_clusters.data() + local_method_idx * num_points, seed);

auto begin = assigned_clusters.begin() + local_method_idx * num_points;
auto end = begin + num_points;
@@ -175,11 +171,9 @@ int main(int argc, char **argv) {
int my_rank, num_processes;

std::vector<std::string> methods = {"kmeans", "dbscan", "hclust"};
std::vector<ClusFuncType> functions = {kmeans, dbscan, hclust};
int num_methods = methods.size();

int (*functions[])(int, const char *, const point *, int,
int *) = {kmeans, dbscan, hclust};

MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
@@ -202,7 +196,7 @@
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
}

read_points(file, pts);
pts = read_points(file);
num_points = pts.size();
}

@@ -219,11 +213,13 @@
auto num_methods_proc =
std::ceil((float)num_methods / (float)num_processes);

int seed = argc > 4 ? std::stoi(argv[4]) : 0;

// Run the algorithms assigned to this process and flatten their results
std::vector<int> assigned_clusters(num_methods_proc * num_points, 0);
int ncl =
run_clustering_algorithms(my_rank, num_methods_proc, pts, num_methods,
methods, functions, assigned_clusters);
methods, functions, assigned_clusters, seed);

std::vector<int> all_res; // Aggregation of all clustering results
// across all processes