Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
interface:
display_name: "gstack-setup-browser-cookies"
short_description: "Import cookies from your real browser (Comet, Chrome, Arc, Brave, Edge) into the headless browse session. Opens an..."
short_description: "Import cookies from your real Chromium browser into the headless browse session. Opens an interactive picker UI..."
default_prompt: "Use gstack-setup-browser-cookies for this task."
policy:
allow_implicit_invocation: true
4 changes: 4 additions & 0 deletions .github/actionlint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
self-hosted-runner:
labels:
- ubicloud-standard-2
- ubicloud-standard-8
63 changes: 63 additions & 0 deletions .github/docker/Dockerfile.ci
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# gstack CI eval runner — pre-baked toolchain + deps
# Rebuild weekly via ci-image.yml, on Dockerfile changes, or on lockfile changes
FROM ubuntu:24.04

ENV DEBIAN_FRONTEND=noninteractive

# System deps
RUN apt-get update && apt-get install -y --no-install-recommends \
git curl unzip ca-certificates jq bc gpg \
&& rm -rf /var/lib/apt/lists/*

# GitHub CLI
RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \
| gpg --dearmor -o /usr/share/keyrings/githubcli-archive-keyring.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" \
| tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
&& apt-get update && apt-get install -y --no-install-recommends gh \
&& rm -rf /var/lib/apt/lists/*

# Node.js 22 LTS (needed for claude CLI)
RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
&& apt-get install -y --no-install-recommends nodejs \
&& rm -rf /var/lib/apt/lists/*

# Bun (install to /usr/local so non-root users can access it)
ENV BUN_INSTALL="/usr/local"
RUN curl -fsSL https://bun.sh/install | bash

# Claude CLI
RUN npm i -g @anthropic-ai/claude-code

# Playwright system deps (Chromium) — needed for browse E2E tests
RUN npx playwright install-deps chromium

# Pre-install dependencies (cached layer — only rebuilds when package.json changes)
COPY package.json /workspace/
WORKDIR /workspace
RUN bun install && rm -rf /tmp/*

# Install Playwright Chromium to a shared location accessible by all users
ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright-browsers
RUN npx playwright install chromium \
&& chmod -R a+rX /opt/playwright-browsers

# Verify everything works
RUN bun --version && node --version && claude --version && jq --version && gh --version \
&& npx playwright --version

# At runtime: checkout overwrites /workspace, but node_modules persists
# if we move it out of the way and symlink back
# Save node_modules + package.json snapshot for cache validation at runtime
RUN mv /workspace/node_modules /opt/node_modules_cache \
&& cp /workspace/package.json /opt/node_modules_cache/.package.json

# Claude CLI refuses --dangerously-skip-permissions as root.
# Create a non-root user for eval runs (GH Actions overrides USER, so
# the workflow must set options.user or use gosu/su-exec at runtime).
RUN useradd -m -s /bin/bash runner \
&& chmod -R a+rX /opt/node_modules_cache \
&& mkdir -p /home/runner/.gstack && chown -R runner:runner /home/runner/.gstack \
&& chmod 1777 /tmp \
&& mkdir -p /home/runner/.bun && chown -R runner:runner /home/runner/.bun \
&& chmod -R 1777 /tmp
8 changes: 8 additions & 0 deletions .github/workflows/actionlint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: Workflow Lint
on: [push, pull_request]
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: rhysd/actionlint@v1.7.11
40 changes: 40 additions & 0 deletions .github/workflows/ci-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
name: Build CI Image
on:
# Rebuild weekly (Monday 6am UTC) to pick up CLI updates
schedule:
- cron: '0 6 * * 1'
# Rebuild on Dockerfile or lockfile changes
push:
branches: [main]
paths:
- '.github/docker/Dockerfile.ci'
- 'package.json'
# Manual trigger
workflow_dispatch:

jobs:
build:
runs-on: ubicloud-standard-2
permissions:
contents: read
packages: write
steps:
- uses: actions/checkout@v4

# Copy lockfile + package.json into Docker build context
- run: cp package.json .github/docker/

- uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- uses: docker/build-push-action@v6
with:
context: .github/docker
file: .github/docker/Dockerfile.ci
push: true
tags: |
ghcr.io/${{ github.repository }}/ci:latest
ghcr.io/${{ github.repository }}/ci:${{ github.sha }}
242 changes: 242 additions & 0 deletions .github/workflows/evals.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
name: E2E Evals
on:
pull_request:
branches: [main]
workflow_dispatch:

concurrency:
group: evals-${{ github.head_ref }}
cancel-in-progress: true

env:
IMAGE: ghcr.io/${{ github.repository }}/ci

jobs:
# Build Docker image with pre-baked toolchain (cached — only rebuilds on Dockerfile/lockfile change)
build-image:
runs-on: ubicloud-standard-2
permissions:
contents: read
packages: write
outputs:
image-tag: ${{ steps.meta.outputs.tag }}
steps:
- uses: actions/checkout@v4

- id: meta
run: echo "tag=${{ env.IMAGE }}:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json') }}" >> "$GITHUB_OUTPUT"

- uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Check if image exists
id: check
run: |
if docker manifest inspect ${{ steps.meta.outputs.tag }} > /dev/null 2>&1; then
echo "exists=true" >> "$GITHUB_OUTPUT"
else
echo "exists=false" >> "$GITHUB_OUTPUT"
fi

- if: steps.check.outputs.exists == 'false'
run: cp package.json .github/docker/

- if: steps.check.outputs.exists == 'false'
uses: docker/build-push-action@v6
with:
context: .github/docker
file: .github/docker/Dockerfile.ci
push: true
tags: |
${{ steps.meta.outputs.tag }}
${{ env.IMAGE }}:latest

evals:
runs-on: ${{ matrix.suite.runner || 'ubicloud-standard-2' }}
needs: build-image
container:
image: ${{ needs.build-image.outputs.image-tag }}
credentials:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
options: --user runner
timeout-minutes: 25
strategy:
fail-fast: false
matrix:
suite:
- name: llm-judge
file: test/skill-llm-eval.test.ts
- name: e2e-browse
file: test/skill-e2e-bws.test.ts
runner: ubicloud-standard-8
- name: e2e-plan
file: test/skill-e2e-plan.test.ts
- name: e2e-deploy
file: test/skill-e2e-deploy.test.ts
- name: e2e-design
file: test/skill-e2e-design.test.ts
- name: e2e-qa-bugs
file: test/skill-e2e-qa-bugs.test.ts
- name: e2e-qa-workflow
file: test/skill-e2e-qa-workflow.test.ts
- name: e2e-review
file: test/skill-e2e-review.test.ts
- name: e2e-workflow
file: test/skill-e2e-workflow.test.ts
allow_failure: true # /ship + /setup-browser-cookies are env-dependent
- name: e2e-routing
file: test/skill-routing-e2e.test.ts
allow_failure: true # LLM routing is non-deterministic
- name: e2e-codex
file: test/codex-e2e.test.ts
- name: e2e-gemini
file: test/gemini-e2e.test.ts
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

# Bun creates root-owned temp dirs during Docker build. GH Actions runs as
# runner user with HOME=/github/home. Redirect bun's cache to a writable dir.
- name: Fix bun temp
run: |
mkdir -p /home/runner/.cache/bun
{
echo "BUN_INSTALL_CACHE_DIR=/home/runner/.cache/bun"
echo "BUN_TMPDIR=/home/runner/.cache/bun"
echo "TMPDIR=/home/runner/.cache"
} >> "$GITHUB_ENV"

# Restore pre-installed node_modules from Docker image via symlink (~0s vs ~15s install)
- name: Restore deps
run: |
if [ -d /opt/node_modules_cache ] && diff -q /opt/node_modules_cache/.package.json package.json >/dev/null 2>&1; then
ln -s /opt/node_modules_cache node_modules
else
bun install
fi

- run: bun run build

# Verify Playwright can launch Chromium (fails fast if sandbox/deps are broken)
- name: Verify Chromium
if: matrix.suite.name == 'e2e-browse'
run: |
echo "whoami=$(whoami) HOME=$HOME TMPDIR=${TMPDIR:-unset}"
touch /tmp/.bun-test && rm /tmp/.bun-test && echo "/tmp writable"
bun -e "import {chromium} from 'playwright';const b=await chromium.launch({args:['--no-sandbox']});console.log('Chromium OK');await b.close()"

- name: Run ${{ matrix.suite.name }}
continue-on-error: ${{ matrix.suite.allow_failure || false }}
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
EVALS_CONCURRENCY: "40"
PLAYWRIGHT_BROWSERS_PATH: /opt/playwright-browsers
run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency 40 ${{ matrix.suite.file }}

- name: Upload eval results
if: always()
uses: actions/upload-artifact@v4
with:
name: eval-${{ matrix.suite.name }}
path: ~/.gstack-dev/evals/*.json
retention-days: 90

report:
runs-on: ubicloud-standard-2
needs: evals
if: always() && github.event_name == 'pull_request'
timeout-minutes: 5
permissions:
contents: read
pull-requests: write
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1

- name: Download all eval artifacts
uses: actions/download-artifact@v4
with:
pattern: eval-*
path: /tmp/eval-results
merge-multiple: true

- name: Post PR comment
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# shellcheck disable=SC2086,SC2059
RESULTS=$(find /tmp/eval-results -name '*.json' 2>/dev/null | sort)
if [ -z "$RESULTS" ]; then
echo "No eval results found"
exit 0
fi

TOTAL=0; PASSED=0; FAILED=0; COST="0"
SUITE_LINES=""
for f in $RESULTS; do
if ! jq -e '.total_tests' "$f" >/dev/null 2>&1; then
echo "Skipping malformed JSON: $f"
continue
fi
T=$(jq -r '.total_tests // 0' "$f")
P=$(jq -r '.passed // 0' "$f")
F=$(jq -r '.failed // 0' "$f")
C=$(jq -r '.total_cost_usd // 0' "$f")
TIER=$(jq -r '.tier // "unknown"' "$f")
[ "$T" -eq 0 ] && continue
TOTAL=$((TOTAL + T))
PASSED=$((PASSED + P))
FAILED=$((FAILED + F))
COST=$(echo "$COST + $C" | bc)
STATUS_ICON="✅"
[ "$F" -gt 0 ] && STATUS_ICON="❌"
SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |\n"
done

STATUS="✅ PASS"
[ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"

BODY="## E2E Evals: ${STATUS}

**${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **12 parallel runners**

| Suite | Result | Status | Cost |
|-------|--------|--------|------|
$(echo -e "$SUITE_LINES")

---
*12x ubicloud-standard-2 (Docker: pre-baked toolchain + deps) | wall clock ≈ slowest suite*"

if [ "$FAILED" -gt 0 ]; then
FAILURES=""
for f in $RESULTS; do
if ! jq -e '.failed' "$f" >/dev/null 2>&1; then continue; fi
F=$(jq -r '.failed // 0' "$f")
[ "$F" -eq 0 ] && continue
FAILS=$(jq -r '.tests[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f" 2>/dev/null || echo "- ⚠️ $(basename "$f"): parse error")
FAILURES="${FAILURES}${FAILS}\n"
done
BODY="${BODY}

### Failures
$(echo -e "$FAILURES")"
fi

# Update existing comment or create new one
COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments \
--jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)

if [ -n "$COMMENT_ID" ]; then
gh api "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}" \
-X PATCH -f body="$BODY"
else
gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY"
fi
15 changes: 13 additions & 2 deletions .github/workflows/skill-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,17 @@ jobs:
- run: bun install
- name: Check Claude host freshness
run: bun run gen:skill-docs
- run: git diff --exit-code || (echo "Generated SKILL.md files are stale. Run: bun run gen:skill-docs" && exit 1)
- name: Check Codex host generation succeeds
- name: Verify Claude skill docs are fresh
run: |
git diff --exit-code || {
echo "Generated SKILL.md files are stale. Run: bun run gen:skill-docs"
exit 1
}
- name: Check Codex host freshness
run: bun run gen:skill-docs --host codex
- name: Verify Codex skill docs are fresh
run: |
git diff --exit-code -- .agents/ || {
echo "Generated Codex SKILL.md files are stale. Run: bun run gen:skill-docs --host codex"
exit 1
}
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ bin/gstack-global-discover
.claude/skills/
.agents/
.context/
.gstack-worktrees/
/tmp/
*.log
bun.lock
Expand Down
Loading