|
| 1 | +"""Docker verification for generated test tasks.""" |
| 2 | + |
| 3 | +import subprocess |
| 4 | +from logging import getLogger |
| 5 | +from pathlib import Path |
| 6 | + |
| 7 | +logger = getLogger(__name__) |
| 8 | + |
| 9 | + |
| 10 | +def generate_run_script(task_dir: Path) -> Path: |
| 11 | + """Generate run_tests.sh script for a task directory.""" |
| 12 | + script_path = task_dir / "run_tests.sh" |
| 13 | + |
| 14 | + script_content = '''#!/bin/bash |
| 15 | +set -e |
| 16 | +
|
| 17 | +# SWE-Forge Test Runner |
| 18 | +# Usage: ./run_tests.sh [--verify] |
| 19 | +
|
| 20 | +TASK_DIR="$(cd "$(dirname "$0")" && pwd)" |
| 21 | +WORKSPACE="$TASK_DIR/workspace.yaml" |
| 22 | +
|
| 23 | +# Parse workspace.yaml (simple grep-based parsing) |
| 24 | +get_value() { |
| 25 | + grep -A1 "$1:" "$WORKSPACE" | tail -1 | sed 's/^[[:space:]]*//' | sed 's/"//g' |
| 26 | +} |
| 27 | +
|
| 28 | +# Get repo info |
| 29 | +REPO_URL=$(grep -A2 "repo:" "$WORKSPACE" | grep "url:" | sed 's/.*url: *//' | sed 's/"//g') |
| 30 | +BASE_COMMIT=$(grep "base_commit:" "$WORKSPACE" | sed 's/.*base_commit: *//' | sed 's/"//g') |
| 31 | +MERGE_COMMIT=$(grep "merge_commit:" "$WORKSPACE" | sed 's/.*merge_commit: *//' | sed 's/"//g') |
| 32 | +
|
| 33 | +echo "=== SWE-Forge Test Runner ===" |
| 34 | +echo "Repo: $REPO_URL" |
| 35 | +echo "Base: $BASE_COMMIT" |
| 36 | +echo "Merge: $MERGE_COMMIT" |
| 37 | +
|
| 38 | +# Get install commands (multiline, until next key) |
| 39 | +get_install_commands() { |
| 40 | + sed -n '/install:/,/^[a-z]/p' "$WORKSPACE" | grep -E "^\\s+-" | sed 's/.*- *//' |
| 41 | +} |
| 42 | +
|
| 43 | +# Get fail_to_pass tests |
| 44 | +get_fail_to_pass() { |
| 45 | + sed -n '/fail_to_pass:/,/pass_to_pass:/p' "$WORKSPACE" | grep -E "^\\s+-" | sed 's/.*- *//' |
| 46 | +} |
| 47 | +
|
| 48 | +# Get pass_to_pass tests |
| 49 | +get_pass_to_pass() { |
| 50 | + sed -n '/pass_to_pass:/,/^[a-z]/p' "$WORKSPACE" | grep -E "^\\s+-" | sed 's/.*- *//' |
| 51 | +} |
| 52 | +
|
| 53 | +echo "" |
| 54 | +echo "=== Install Commands ===" |
| 55 | +get_install_commands |
| 56 | +
|
| 57 | +echo "" |
| 58 | +echo "=== Tests (fail_to_pass) ===" |
| 59 | +get_fail_to_pass |
| 60 | +
|
| 61 | +# Run in Docker container |
| 62 | +if [ "$1" == "--verify" ]; then |
| 63 | + echo "" |
| 64 | + echo "=== Running in Docker ===" |
| 65 | + |
| 66 | + IMAGE=$(grep "image:" "$WORKSPACE" | head -1 | sed 's/.*image: *//' | sed 's/"//g') |
| 67 | + if [ -z "$IMAGE" ]; then |
| 68 | + IMAGE="ubuntu:24.04" |
| 69 | + fi |
| 70 | + |
| 71 | + echo "Using image: $IMAGE" |
| 72 | + |
| 73 | + # Run Docker container with tests |
| 74 | + docker run --rm -v "$TASK_DIR:/task" -w /repo "$IMAGE" bash -c " |
| 75 | + # Install git if needed |
| 76 | + apt-get update && apt-get install -y git python3 python3-pip > /dev/null 2>&1 |
| 77 | + |
| 78 | + # Clone repo |
| 79 | + git clone $REPO_URL /repo 2>/dev/null || true |
| 80 | + cd /repo |
| 81 | + |
| 82 | + # Apply patch if exists |
| 83 | + if [ -f /task/patch.diff ]; then |
| 84 | + git checkout $BASE_COMMIT 2>/dev/null |
| 85 | + git apply /task/patch.diff || echo 'Patch may already be applied' |
| 86 | + fi |
| 87 | + |
| 88 | + # Run install commands |
| 89 | + get_install_commands | while read cmd; do |
| 90 | + echo 'Running: '\$cmd |
| 91 | + eval \$cmd |
| 92 | + done |
| 93 | + |
| 94 | + # Run fail_to_pass tests |
| 95 | + echo '' |
| 96 | + echo '=== Running fail_to_pass tests ===' |
| 97 | + get_fail_to_pass | while read test_cmd; do |
| 98 | + echo 'Test: '\$test_cmd |
| 99 | + done |
| 100 | + " |
| 101 | +fi |
| 102 | +
|
| 103 | +echo "" |
| 104 | +echo "Done. To verify in Docker, run: ./run_tests.sh --verify" |
| 105 | +''' |
| 106 | + |
| 107 | + with open(script_path, "w") as f: |
| 108 | + f.write(script_content) |
| 109 | + |
| 110 | + # Make executable |
| 111 | + script_path.chmod(0o755) |
| 112 | + return script_path |
| 113 | + |
| 114 | + |
| 115 | +def verify_task_in_docker(task_dir: Path, timeout: int = 300) -> dict: |
| 116 | + """Verify a task by running tests in Docker. |
| 117 | + |
| 118 | + Returns: |
| 119 | + dict with keys: success, output, error |
| 120 | + """ |
| 121 | + import tempfile |
| 122 | + import os |
| 123 | + |
| 124 | + workspace_path = task_dir / "workspace.yaml" |
| 125 | + if not workspace_path.exists(): |
| 126 | + return {"success": False, "error": "No workspace.yaml found"} |
| 127 | + |
| 128 | + # Generate run script |
| 129 | + script_path = generate_run_script(task_dir) |
| 130 | + |
| 131 | + # Read workspace for config |
| 132 | + import yaml |
| 133 | + with open(workspace_path) as f: |
| 134 | + config = yaml.safe_load(f) |
| 135 | + |
| 136 | + repo_url = config.get("repo", {}).get("url", "") |
| 137 | + base_commit = config.get("repo", {}).get("base_commit", "") |
| 138 | + install_commands = config.get("install", {}).get("commands", []) |
| 139 | + fail_to_pass = config.get("tests", {}).get("fail_to_pass", []) |
| 140 | + |
| 141 | + if not repo_url: |
| 142 | + return {"success": False, "error": "No repo URL in workspace.yaml"} |
| 143 | + |
| 144 | + # Build docker run command |
| 145 | + image = config.get("environment", {}).get("image", "ubuntu:24.04") |
| 146 | + |
| 147 | + # Create verification script |
| 148 | + verify_script = f'''#!/bin/bash |
| 149 | +set -e |
| 150 | +
|
| 151 | +echo "=== Cloning repo ===" |
| 152 | +apt-get update > /dev/null 2>&1 |
| 153 | +apt-get install -y git python3 python3-pip > /dev/null 2>&1 |
| 154 | +
|
| 155 | +git clone {repo_url} /repo 2>/dev/null |
| 156 | +cd /repo |
| 157 | +git checkout {base_commit} 2>/dev/null || true |
| 158 | +
|
| 159 | +echo "=== Applying patch ===" |
| 160 | +if [ -f /task/patch.diff ]; then |
| 161 | + git apply /task/patch.diff 2>/dev/null || echo "Patch applied or already present" |
| 162 | +fi |
| 163 | +
|
| 164 | +echo "=== Running install commands ===" |
| 165 | +''' |
| 166 | + |
| 167 | + for cmd in install_commands[:5]: # Limit to first 5 install commands |
| 168 | + verify_script += f''' |
| 169 | +echo "Running: {cmd}" |
| 170 | +{cmd} || echo "Install command may have failed (exit $?)" |
| 171 | +''' |
| 172 | + |
| 173 | + verify_script += ''' |
| 174 | +echo "=== Listing test files ===" |
| 175 | +ls -la /task/tests/ 2>/dev/null || echo "No tests directory" |
| 176 | +
|
| 177 | +echo "=== Running tests ===" |
| 178 | +''' |
| 179 | + |
| 180 | + for test_cmd in fail_to_pass[:3]: # Limit to first 3 tests |
| 181 | + verify_script += f''' |
| 182 | +echo "Running: {test_cmd}" |
| 183 | +{test_cmd} 2>&1 || echo "Test failed (expected on base commit)" |
| 184 | +''' |
| 185 | + |
| 186 | + # Write verify script |
| 187 | + script_path = task_dir / "verify_docker.sh" |
| 188 | + with open(script_path, "w") as f: |
| 189 | + f.write(verify_script) |
| 190 | + script_path.chmod(0o755) |
| 191 | + |
| 192 | + # Run docker |
| 193 | + try: |
| 194 | + result = subprocess.run( |
| 195 | + ["docker", "run", "--rm", |
| 196 | + "-v", f"{task_dir}:/task", |
| 197 | + "-w", "/repo", |
| 198 | + "--timeout", str(timeout), |
| 199 | + image, |
| 200 | + "bash", "/task/verify_docker.sh"], |
| 201 | + capture_output=True, |
| 202 | + text=True, |
| 203 | + timeout=timeout |
| 204 | + ) |
| 205 | + |
| 206 | + return { |
| 207 | + "success": result.returncode == 0, |
| 208 | + "output": result.stdout, |
| 209 | + "error": result.stderr |
| 210 | + } |
| 211 | + except subprocess.TimeoutExpired: |
| 212 | + return {"success": False, "error": "Timeout"} |
| 213 | + except Exception as e: |
| 214 | + return {"success": False, "error": str(e)} |
0 commit comments