Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
462 changes: 385 additions & 77 deletions .github/workflows/validate-samples.yml

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions ai/select-algorithm-dotnet/src/CompareAll.cs
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,13 @@ public static void Run()
{
var database = mongoClient.GetDatabase(databaseName);

// Drop collection for a clean comparison
database.DropCollection("hotels");
Console.WriteLine("Dropped existing 'hotels' collection (if any)");
// Drop collection if it already exists (clean start)
var collectionNames = database.ListCollectionNames().ToList();
if (collectionNames.Contains("hotels"))
{
database.DropCollection("hotels");
Console.WriteLine("Dropped existing 'hotels' collection.");
}

var collection = database.GetCollection<BsonDocument>("hotels");

Expand Down
15 changes: 9 additions & 6 deletions ai/select-algorithm-go/src/compare_all.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,18 @@ func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client,
fmt.Printf("Top-K: %d\n", topK)
fmt.Printf("Verbose: %v\n", verbose)

// 1. Drop collection for clean comparison, then load data
// 1. Drop collection if it exists for clean comparison, then load data
database := dbClient.Database(config.DatabaseName)
collection := database.Collection("hotels")

// Drop existing collection for a clean comparison
if err := collection.Drop(ctx); err != nil {
fmt.Printf("Note: could not drop collection (may not exist): %v\n", err)
} else {
fmt.Println("Dropped existing 'hotels' collection")
// Drop existing collection if it exists (clean start)
names, _ := database.ListCollectionNames(ctx, bson.M{"name": "hotels"})
if len(names) > 0 {
if err := collection.Drop(ctx); err != nil {
fmt.Printf("Note: could not drop collection: %v\n", err)
} else {
fmt.Println("Dropped existing 'hotels' collection")
}
}

// Ensure cleanup on exit
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,80 +49,85 @@ public static void run() {
MongoDatabase database = mongoClient.getDatabase(databaseName);
MongoCollection<Document> collection = database.getCollection(COLLECTION_NAME);

// Load data ONCE into the single collection
System.out.println(" Loading data from: " + dataFile);
List<Document> data = Utils.readJsonFile(dataFile);
System.out.printf(" Loaded %d documents%n", data.size());

collection.drop();
System.out.println(" Collection reset.");
Utils.insertData(collection, data, 100);

// Generate ONE embedding for the query (reused for all 9 searches)
OpenAIClient aiClient = Utils.getOpenAIClient();
System.out.printf("%n Generating embedding for: \"%s\"%n", queryText);
List<Float> queryVector = Utils.getEmbedding(aiClient, queryText, model);
System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size());

// Convert to doubles for BSON
List<Double> vectorAsDoubles = queryVector.stream()
.map(Float::doubleValue)
.toList();

// Create all 9 indexes idempotently
System.out.println(" Creating 9 vector indexes...");
for (String algo : ALGORITHMS) {
for (String metric : METRICS) {
createIndex(collection, vectorField, dimensions, algo, metric);
try {
// Load data ONCE into the single collection
System.out.println(" Loading data from: " + dataFile);
List<Document> data = Utils.readJsonFile(dataFile);
System.out.printf(" Loaded %d documents%n", data.size());

// Drop collection if it already exists (clean start)
if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) {
collection.drop();
System.out.println(" Dropped existing collection.");
}
}
System.out.println(" All indexes created.\n");

// Run searches sequentially for fair timing
System.out.println(" Running searches...");
for (String algo : ALGORITHMS) {
for (String metric : METRICS) {
String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase());

long startNs = System.nanoTime();
List<Document> searchResults = performSearch(
collection, vectorAsDoubles, vectorField, topK);
long elapsedNs = System.nanoTime() - startNs;
double elapsedMs = elapsedNs / 1_000_000.0;

// Extract top result info
String topHotel = "-";
double topScore = 0.0;
if (!searchResults.isEmpty()) {
Document top = searchResults.get(0);
topHotel = top.getString("HotelName") != null
? top.getString("HotelName") : "-";
topScore = top.getDouble("score") != null
? top.getDouble("score") : 0.0;
Utils.insertData(collection, data, 100);

// Generate ONE embedding for the query (reused for all 9 searches)
OpenAIClient aiClient = Utils.getOpenAIClient();
System.out.printf("%n Generating embedding for: \"%s\"%n", queryText);
List<Float> queryVector = Utils.getEmbedding(aiClient, queryText, model);
System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size());

// Convert to doubles for BSON
List<Double> vectorAsDoubles = queryVector.stream()
.map(Float::doubleValue)
.toList();

// Create all 9 indexes idempotently
System.out.println(" Creating 9 vector indexes...");
for (String algo : ALGORITHMS) {
for (String metric : METRICS) {
createIndex(collection, vectorField, dimensions, algo, metric);
}
}
System.out.println(" All indexes created.\n");

// Run searches sequentially for fair timing
System.out.println(" Running searches...");
for (String algo : ALGORITHMS) {
for (String metric : METRICS) {
String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase());

long startNs = System.nanoTime();
List<Document> searchResults = performSearch(
collection, vectorAsDoubles, vectorField, topK);
long elapsedNs = System.nanoTime() - startNs;
double elapsedMs = elapsedNs / 1_000_000.0;

// Extract top result info
String topHotel = "-";
double topScore = 0.0;
if (!searchResults.isEmpty()) {
Document top = searchResults.get(0);
topHotel = top.getString("HotelName") != null
? top.getString("HotelName") : "-";
topScore = top.getDouble("score") != null
? top.getDouble("score") : 0.0;
}

results.add(new SearchResult(
algo.toUpperCase(), metric, indexName,
elapsedMs, searchResults.size(), topHotel, topScore));

if (verbose) {
System.out.printf(" [%s] %d results in %.2f ms%n",
indexName, searchResults.size(), elapsedMs);
for (int i = 0; i < searchResults.size(); i++) {
Document doc = searchResults.get(i);
System.out.printf(" %d. %s (%.4f)%n",
i + 1,
doc.getString("HotelName"),
doc.getDouble("score"));
results.add(new SearchResult(
algo.toUpperCase(), metric, indexName,
elapsedMs, searchResults.size(), topHotel, topScore));

if (verbose) {
System.out.printf(" [%s] %d results in %.2f ms%n",
indexName, searchResults.size(), elapsedMs);
for (int i = 0; i < searchResults.size(); i++) {
Document doc = searchResults.get(i);
System.out.printf(" %d. %s (%.4f)%n",
i + 1,
doc.getString("HotelName"),
doc.getDouble("score"));
}
}
}
}
} finally {
// Cleanup: always drop the comparison collection
System.out.println("\n Cleanup: dropping comparison collection...");
collection.drop();
System.out.println(" Cleanup: dropped collection 'hotels'");
}

// Cleanup: drop the comparison collection
System.out.println("\n Cleanup: dropping comparison collection...");
collection.drop();
System.out.println(" Cleanup: dropped collection 'hotels'");
}

// Print comparison table
Expand Down
7 changes: 4 additions & 3 deletions ai/select-algorithm-python/src/compare_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,10 @@ def main():
try:
database = mongo_client[config["database_name"]]

# Drop collection for a clean comparison
database.drop_collection("hotels")
print("Dropped existing 'hotels' collection (if any)")
# Drop collection if it already exists (clean start)
if "hotels" in database.list_collection_names():
database.drop_collection("hotels")
print("Dropped existing 'hotels' collection")

# Create fresh collection and load data
collection = database["hotels"]
Expand Down
29 changes: 25 additions & 4 deletions ai/select-algorithm-typescript/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,21 +75,42 @@ npm run start:diskann

## Compare All Algorithms

Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation and view a formatted comparison table:
Run all 9 combinations (3 algorithms × 3 similarity metrics) across multiple diverse queries and view formatted comparison tables with a ranking divergence summary:

```bash
npm run start:compare-all
```

By default, the script runs **5 diverse queries** designed to stress different aspects of similarity ranking:

1. `outdoor adventure with family activities`
2. `quiet romantic getaway with ocean view`
3. `budget-friendly downtown hotel with free WiFi`
4. `historic building with fine dining and spa`
5. `ski resort with yoga and winter sports`

**Environment variables** (optional overrides):

| Variable | Default | Description |
|---|---|---|
| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text |
| `TOP_K` | `3` | Number of results per combination |
| `QUERY_TEXT` | *(5 built-in queries)* | Override with a single custom query |
| `TOP_K` | `5` | Number of results per combination |
| `VERBOSE` | `false` | When `true`, shows all k results per combo |

The script creates one collection per algorithm/metric pair (9 in total), loads the data into each, creates one vector index per collection, and runs searches sequentially for fair timing comparison.
### Architecture

> **DocumentDB limitation:** Only ONE vector index per field per collection is allowed. The script creates 9 separate collections (one per algorithm×metric pair), loads data into each, creates one index per collection, runs searches, and cleans up all collections on exit.

### Output

The script produces:
- **Per-query comparison table** — shows algorithm, metric, latency, top score, and #1 result for each of the 9 combinations
- **Ranking divergence summary** — highlights queries where algorithms/metrics disagreed on the #1 result
- **Score gap analysis** — shows the confidence margin between #1 and #2 results

### Small dataset caveat

With ~50 hotel documents, all algorithms typically return identical rankings. This is expected — the dataset is too small for algorithmic differences to surface. For meaningful differentiation, use 1000+ documents with varied embeddings. The diverse queries help by combining attributes that no single hotel perfectly satisfies, which can reveal metric-level differences (COS vs L2 vs IP) even on small data.

## Algorithm comparison

Expand Down
Loading
Loading