Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
ef831ca
[ai] feat: add a workflow to parse new datasets from HF
tellet-q May 4, 2026
ed113c8
fix: select proper column
tellet-q May 5, 2026
e1b9038
fix: use boto3
tellet-q May 5, 2026
c566971
fix: use boto3
tellet-q May 5, 2026
1cde3c1
fix: update dims
tellet-q May 5, 2026
9f7b1eb
fix: fix ssh
tellet-q May 5, 2026
1436b4b
update datasets
tellet-q May 5, 2026
3fb048b
use bigger server
tellet-q May 5, 2026
30fa22b
use bigger server
tellet-q May 5, 2026
8a8d888
turbo quant rest experiments
IvanPleshkov Apr 2, 2026
f1a792c
cascade tq experiments
IvanPleshkov Apr 5, 2026
d6a1883
one more test
IvanPleshkov Apr 5, 2026
68768d2
Fix config name and jupyter notebook
tellet-q Apr 7, 2026
17f29cc
no rescoring
IvanPleshkov Apr 7, 2026
cbda568
Set log_lvl to error
tellet-q Apr 7, 2026
ed187f7
qdrant_hybrid
IvanPleshkov Apr 7, 2026
70c2d82
Run server and client on the same machine
tellet-q Apr 8, 2026
24a9b6d
6bit experiments
IvanPleshkov Apr 8, 2026
136f31c
more search params
IvanPleshkov Apr 8, 2026
6b494d6
Change serach and add another viz
tellet-q Apr 8, 2026
ab65bd6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 8, 2026
954a13e
Add script to merge final results into one
tellet-q Apr 8, 2026
4811157
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 8, 2026
2e05af8
params to start
IvanPleshkov Apr 8, 2026
a8a8399
Add custom config
tellet-q Apr 8, 2026
b9d96ec
Allow 20 machines at once
tellet-q Apr 8, 2026
79b0d8e
Freeze dev version and fix engine name
tellet-q Apr 9, 2026
b99f9de
Add tq6 cascade set
tellet-q Apr 9, 2026
bfe81cd
Remove tq6 from turbo_quant
tellet-q Apr 9, 2026
0339900
line for orig vectors
IvanPleshkov Apr 9, 2026
c865c8c
Enable repeats and clean results
tellet-q Apr 9, 2026
3bca718
Use 12 machines
tellet-q Apr 9, 2026
edeb2b3
Fix cleanup
tellet-q Apr 9, 2026
c7e1372
less tests, fix line
IvanPleshkov Apr 10, 2026
5818c29
Shuffle
tellet-q Apr 10, 2026
45355be
Reduce number of machines
tellet-q Apr 10, 2026
fe80322
Fix jupyter results processing logic
tellet-q Apr 30, 2026
ebc9338
Fix jupyter results processing logic
tellet-q Apr 30, 2026
aa52035
new api
IvanPleshkov May 3, 2026
4c08647
update tq branch
IvanPleshkov May 3, 2026
b1be4f1
Drop sha256 digest from qdrant_version to fix CI artifact name
IvanPleshkov May 3, 2026
8a4523e
final test attempt
IvanPleshkov May 5, 2026
47e67ab
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 63 additions & 6 deletions .github/workflows/build-hf-dataset.yaml
Original file line number Diff line number Diff line change
@@ -1,24 +1,81 @@
name: Build HF Dataset

# Stub on master.
# Spins up a single Hetzner machine, downloads a HuggingFace embeddings dataset,
# splits train/test, computes brute-force kNN ground truth, packages
# vectors.npy + tests.jsonl into a tarball, and uploads it to GCS.

on:
workflow_dispatch:
inputs:
hf_dataset:
description: "HF dataset id, e.g. Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-100K"
required: true
default: "Qdrant/gte-multilingual-ads-1M"
output_name:
description: "Output tarball basename"
required: true
default: "ads-gte-multilingual-1M-768-angular"
vector_column:
description: "Embedding column name"
default: "embedding"
default: "gte"

concurrency:
group: hetzner-machines

env:
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
SERVER_NAME: build-hf-dataset-${{ github.run_id }}
GCS_PATH: gs://ann-filtered-benchmark/datasets/

jobs:
stub:
build:
runs-on: ubuntu-latest
steps:
- run: |
echo "This is the master stub for the Build HF Dataset workflow."
exit 0
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
- uses: webfactory/ssh-agent@d4b9b8ff72958532804b70bbe600ad43b36d5f2e # v0.8.0
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

- name: Setup CI
run: bash -x tools/setup_ci.sh

- name: Create server
uses: ./.github/workflows/actions/create-server-with-retry
with:
server_name: ${{ env.SERVER_NAME }}
server_type: ccx43
region: fsn1

- name: Build dataset on remote
env:
GCS_KEY: ${{ secrets.GCS_KEY }}
GCS_SECRET: ${{ secrets.GCS_SECRET }}
run: |
set -euo pipefail
source tools/ssh.sh
IP=$(bash tools/hetzner/get_public_ip.sh "$SERVER_NAME")

scp_with_retry -o StrictHostKeyChecking=no \
scripts/build_hf_dataset.py "root@$IP:/root/"

ssh_with_retry -o StrictHostKeyChecking=no \
-o ServerAliveInterval=30 -o ServerAliveCountMax=20 "root@$IP" \
GCS_KEY="$GCS_KEY" GCS_SECRET="$GCS_SECRET" bash -s <<EOF
set -euxo pipefail
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y python3-pip

pip3 install --no-cache-dir datasets numpy faiss-cpu boto3

python3 /root/build_hf_dataset.py \
--hf-dataset "${{ inputs.hf_dataset }}" \
--output-name "${{ inputs.output_name }}" \
--vector-column "${{ inputs.vector_column }}" \
--gcs-uri "$GCS_PATH"
EOF

- name: Teardown server
if: always()
continue-on-error: true
run: bash -x tools/hetzner/remove_server.sh "$SERVER_NAME"
10 changes: 5 additions & 5 deletions .github/workflows/manual-benchmarks-cascade.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ jobs:
CONFIG_FILE="benchmark_cascade/benchmark-configs.json"
BENCHMARK_SET="${{ inputs.benchmark_set }}"
CURRENT_BATCH=${{ inputs.current_batch || 0 }}
BATCH_SIZE=10
BATCH_SIZE=12

echo "benchmark_set=$BENCHMARK_SET" >> $GITHUB_OUTPUT

Expand Down Expand Up @@ -202,7 +202,7 @@ jobs:
MACHINES_FIRST=false

SERVER_NAME="benchmark-cascade-server-${i}"
CLIENT_NAME="benchmark-cascade-client-${i}"
CLIENT_NAME="benchmark-cascade-server-${i}"
MACHINES_INFO="${MACHINES_INFO}{\"server_name\":\"$SERVER_NAME\",\"client_name\":\"$CLIENT_NAME\"}"
done

Expand Down Expand Up @@ -252,7 +252,7 @@ jobs:
if: inputs.process_results == false
runs-on: ubuntu-latest
strategy:
max-parallel: 10
max-parallel: 12
fail-fast: false
matrix:
config: ${{ fromJSON(needs.prepareBenchmarks.outputs.matrix) }}
Expand Down Expand Up @@ -291,7 +291,7 @@ jobs:
max_retries: 5

- name: Create Client
if: ${{ inputs.machines_per_bench == true || inputs.current_batch == 0 }}
if: ${{ (inputs.machines_per_bench == true || inputs.current_batch == 0) && steps.extract_names.outputs.client_name != steps.extract_names.outputs.server_name }}
uses: ./.github/workflows/actions/create-server-with-retry
with:
server_name: ${{ steps.extract_names.outputs.client_name }}
Expand All @@ -307,7 +307,7 @@ jobs:
contents: read
packages: read
strategy:
max-parallel: 10
max-parallel: 12
fail-fast: false
matrix:
config: ${{ fromJSON(needs.prepareBenchmarks.outputs.matrix) }}
Expand Down
Loading
Loading