Skip to content

Commit 7bb944b

Browse files
committed
Add mirror command and API for selective package mirroring
Add a `proxy mirror` CLI command and `/api/mirror` API endpoints that pre-populate the cache from various input sources: individual PURLs, SBOM files (CycloneDX and SPDX), or full registry enumeration. The mirror reuses the existing handler.Proxy.GetOrFetchArtifact() pipeline so cached artifacts are identical to those fetched on demand. A bounded worker pool controls download parallelism. Metadata caching is opt-in via `cache_metadata: true` in config (or PROXY_CACHE_METADATA=true). The mirror command always enables it. When enabled, upstream metadata responses are stored for offline fallback with ETag-based conditional revalidation. New internal/mirror package with Source interface, PURLSource, SBOMSource, RegistrySource, and async JobStore. New metadata_cache database table for offline metadata serving.
1 parent 4a78292 commit 7bb944b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+2398
-240
lines changed

README.md

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,47 @@ proxy serve [flags]
460460
proxy [flags] # same as 'proxy serve'
461461
```
462462

463+
### mirror
464+
465+
Pre-populate the cache from PURLs, SBOM files, or entire registries. Useful for ensuring offline availability or warming the cache before deployments.
466+
467+
```bash
468+
# Mirror specific package versions
469+
proxy mirror pkg:npm/lodash@4.17.21 pkg:cargo/serde@1.0.0
470+
471+
# Mirror all versions of a package
472+
proxy mirror pkg:npm/lodash
473+
474+
# Mirror from a CycloneDX or SPDX SBOM
475+
proxy mirror --sbom sbom.cdx.json
476+
477+
# Full registry mirror (npm, pypi, cargo supported)
478+
proxy mirror --registry npm
479+
480+
# Preview what would be mirrored
481+
proxy mirror --dry-run pkg:npm/lodash
482+
483+
# Control parallelism
484+
proxy mirror --concurrency 8 pkg:npm/lodash@4.17.21
485+
```
486+
487+
The mirror command accepts the same storage and database flags as `serve`. Already-cached artifacts are skipped.
488+
489+
A mirror API is also available when the server is running:
490+
491+
```bash
492+
# Start a mirror job
493+
curl -X POST http://localhost:8080/api/mirror \
494+
-H "Content-Type: application/json" \
495+
-d '{"purls": ["pkg:npm/lodash@4.17.21"]}'
496+
497+
# Check job status
498+
curl http://localhost:8080/api/mirror/mirror-1
499+
500+
# Cancel a running job
501+
curl -X DELETE http://localhost:8080/api/mirror/mirror-1
502+
```
503+
463504
### stats
464505

465506
Show cache statistics without running the server.
@@ -534,6 +575,14 @@ Recently cached:
534575
| `GET /debian/*` | Debian/APT repository protocol |
535576
| `GET /rpm/*` | RPM/Yum repository protocol |
536577

578+
### Mirror API
579+
580+
| Endpoint | Description |
581+
|----------|-------------|
582+
| `POST /api/mirror` | Start a mirror job (JSON body with `purls` or `registry`) |
583+
| `GET /api/mirror/{id}` | Get job status and progress |
584+
| `DELETE /api/mirror/{id}` | Cancel a running job |
585+
537586
### Enrichment API
538587

539588
The proxy provides REST endpoints for package metadata enrichment, vulnerability scanning, and outdated detection.

cmd/proxy/main.go

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
//
1717
// serve Start the proxy server (default if no command given)
1818
// stats Show cache statistics
19+
// mirror Pre-populate cache from PURLs, SBOMs, or registries
1920
//
2021
// Serve Flags:
2122
//
@@ -100,7 +101,11 @@ import (
100101

101102
"github.com/git-pkgs/proxy/internal/config"
102103
"github.com/git-pkgs/proxy/internal/database"
104+
"github.com/git-pkgs/proxy/internal/handler"
105+
"github.com/git-pkgs/proxy/internal/mirror"
103106
"github.com/git-pkgs/proxy/internal/server"
107+
"github.com/git-pkgs/proxy/internal/storage"
108+
"github.com/git-pkgs/registries/fetch"
104109
)
105110

106111
const defaultTopN = 10
@@ -124,6 +129,10 @@ func main() {
124129
os.Args = append(os.Args[:1], os.Args[2:]...)
125130
runStats()
126131
return
132+
case "mirror":
133+
os.Args = append(os.Args[:1], os.Args[2:]...)
134+
runMirror()
135+
return
127136
case "-version", "--version":
128137
fmt.Printf("proxy %s (%s)\n", Version, Commit)
129138
os.Exit(0)
@@ -145,6 +154,7 @@ Usage: proxy [command] [flags]
145154
Commands:
146155
serve Start the proxy server (default)
147156
stats Show cache statistics
157+
mirror Pre-populate cache from PURLs, SBOMs, or registries
148158
149159
Run 'proxy <command> -help' for more information on a command.
150160
@@ -340,6 +350,158 @@ func runStats() {
340350
}
341351
}
342352

353+
func runMirror() {
354+
fs := flag.NewFlagSet("mirror", flag.ExitOnError)
355+
configPath := fs.String("config", "", "Path to configuration file")
356+
storageURL := fs.String("storage-url", "", "Storage URL (file:// or s3://)")
357+
databaseDriver := fs.String("database-driver", "", "Database driver: sqlite or postgres")
358+
databasePath := fs.String("database-path", "", "Path to SQLite database file")
359+
databaseURL := fs.String("database-url", "", "PostgreSQL connection URL")
360+
sbomPath := fs.String("sbom", "", "Path to CycloneDX or SPDX SBOM file")
361+
registry := fs.String("registry", "", "Ecosystem name for full registry mirror")
362+
concurrency := fs.Int("concurrency", 4, "Number of parallel downloads") //nolint:mnd // default concurrency
363+
dryRun := fs.Bool("dry-run", false, "Show what would be mirrored without downloading")
364+
365+
fs.Usage = func() {
366+
fmt.Fprintf(os.Stderr, "git-pkgs proxy - Pre-populate cache\n\n")
367+
fmt.Fprintf(os.Stderr, "Usage: proxy mirror [flags] [purl...]\n\n")
368+
fmt.Fprintf(os.Stderr, "Examples:\n")
369+
fmt.Fprintf(os.Stderr, " proxy mirror pkg:npm/lodash@4.17.21\n")
370+
fmt.Fprintf(os.Stderr, " proxy mirror --sbom sbom.cdx.json\n")
371+
fmt.Fprintf(os.Stderr, " proxy mirror pkg:npm/lodash # all versions\n")
372+
fmt.Fprintf(os.Stderr, " proxy mirror --registry npm\n\n")
373+
fmt.Fprintf(os.Stderr, "Flags:\n")
374+
fs.PrintDefaults()
375+
}
376+
377+
_ = fs.Parse(os.Args[1:])
378+
purls := fs.Args()
379+
380+
// Determine source
381+
var source mirror.Source
382+
switch {
383+
case *sbomPath != "":
384+
source = &mirror.SBOMSource{Path: *sbomPath}
385+
case *registry != "":
386+
source = &mirror.RegistrySource{Ecosystem: *registry}
387+
case len(purls) > 0:
388+
source = &mirror.PURLSource{PURLs: purls}
389+
default:
390+
fmt.Fprintf(os.Stderr, "error: provide PURLs, --sbom, or --registry\n")
391+
fs.Usage()
392+
os.Exit(1)
393+
}
394+
395+
// Load config
396+
cfg, err := loadConfig(*configPath)
397+
if err != nil {
398+
fmt.Fprintf(os.Stderr, "error loading config: %v\n", err)
399+
os.Exit(1)
400+
}
401+
cfg.LoadFromEnv()
402+
403+
if *storageURL != "" {
404+
cfg.Storage.URL = *storageURL
405+
}
406+
if *databaseDriver != "" {
407+
cfg.Database.Driver = *databaseDriver
408+
}
409+
if *databasePath != "" {
410+
cfg.Database.Path = *databasePath
411+
}
412+
if *databaseURL != "" {
413+
cfg.Database.URL = *databaseURL
414+
}
415+
416+
if err := cfg.Validate(); err != nil {
417+
fmt.Fprintf(os.Stderr, "invalid configuration: %v\n", err)
418+
os.Exit(1)
419+
}
420+
421+
logger := setupLogger("info", "text")
422+
423+
// Open database
424+
var db *database.DB
425+
switch cfg.Database.Driver {
426+
case "postgres":
427+
db, err = database.OpenPostgresOrCreate(cfg.Database.URL)
428+
default:
429+
db, err = database.OpenOrCreate(cfg.Database.Path)
430+
}
431+
if err != nil {
432+
fmt.Fprintf(os.Stderr, "error opening database: %v\n", err)
433+
os.Exit(1)
434+
}
435+
436+
if err := db.MigrateSchema(); err != nil {
437+
_ = db.Close()
438+
fmt.Fprintf(os.Stderr, "error migrating schema: %v\n", err)
439+
os.Exit(1)
440+
}
441+
442+
// Open storage
443+
sURL := cfg.Storage.URL
444+
if sURL == "" {
445+
sURL = "file://" + cfg.Storage.Path //nolint:staticcheck // backwards compat
446+
}
447+
store, err := storage.OpenBucket(context.Background(), sURL)
448+
if err != nil {
449+
_ = db.Close()
450+
fmt.Fprintf(os.Stderr, "error opening storage: %v\n", err)
451+
os.Exit(1)
452+
}
453+
454+
// Build proxy (reuses same pipeline as serve)
455+
fetcher := fetch.NewFetcher()
456+
resolver := fetch.NewResolver()
457+
proxy := handler.NewProxy(db, store, fetcher, resolver, logger)
458+
proxy.CacheMetadata = true // mirror always caches metadata
459+
460+
m := mirror.New(proxy, db, store, logger, *concurrency)
461+
462+
ctx, cancel := context.WithCancel(context.Background())
463+
go func() {
464+
sigCh := make(chan os.Signal, 1)
465+
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
466+
<-sigCh
467+
cancel()
468+
}()
469+
470+
if *dryRun {
471+
items, err := m.RunDryRun(ctx, source)
472+
if err != nil {
473+
_ = db.Close()
474+
fmt.Fprintf(os.Stderr, "error: %v\n", err)
475+
os.Exit(1)
476+
}
477+
fmt.Printf("Would mirror %d package versions:\n", len(items))
478+
for _, item := range items {
479+
fmt.Printf(" %s\n", item)
480+
}
481+
_ = db.Close()
482+
return
483+
}
484+
485+
progress, err := m.Run(ctx, source)
486+
if err != nil {
487+
_ = db.Close()
488+
fmt.Fprintf(os.Stderr, "error: %v\n", err)
489+
os.Exit(1)
490+
}
491+
492+
_ = db.Close()
493+
494+
fmt.Printf("Mirror complete: %d downloaded, %d skipped (cached), %d failed, %s total\n",
495+
progress.Completed, progress.Skipped, progress.Failed, formatSize(progress.Bytes))
496+
497+
if len(progress.Errors) > 0 {
498+
fmt.Fprintf(os.Stderr, "\nErrors:\n")
499+
for _, e := range progress.Errors {
500+
fmt.Fprintf(os.Stderr, " %s/%s@%s: %s\n", e.Ecosystem, e.Name, e.Version, e.Error)
501+
}
502+
}
503+
}
504+
343505
func printStats(db *database.DB, popular, recent int, asJSON bool) error {
344506
defer func() { _ = db.Close() }()
345507

docs/architecture.md

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,20 @@ vulnerabilities (
161161
updated_at DATETIME
162162
)
163163
-- indexes: (vuln_id, ecosystem, package_name) unique, (ecosystem, package_name)
164+
165+
metadata_cache (
166+
id INTEGER PRIMARY KEY,
167+
ecosystem TEXT NOT NULL,
168+
name TEXT NOT NULL,
169+
storage_path TEXT NOT NULL,
170+
etag TEXT,
171+
content_type TEXT,
172+
size INTEGER, -- BIGINT on Postgres
173+
fetched_at DATETIME,
174+
created_at DATETIME,
175+
updated_at DATETIME
176+
)
177+
-- indexes: (ecosystem, name) unique
164178
```
165179

166180
On PostgreSQL, `INTEGER PRIMARY KEY` becomes `SERIAL`, `DATETIME` becomes `TIMESTAMP`, `INTEGER DEFAULT 0` booleans become `BOOLEAN DEFAULT FALSE`, and size/count columns use `BIGINT`.
@@ -277,6 +291,12 @@ Version age filtering for supply chain attack mitigation. Configurable at global
277291

278292
Package metadata enrichment. Fetches license, description, homepage, repository URL, and vulnerability data from upstream registries. Powers the `/api/` endpoints and the web UI's package detail pages.
279293

294+
### `internal/mirror`
295+
296+
Selective package mirroring for pre-populating the proxy cache. Supports multiple input sources: individual PURLs (versioned or unversioned), CycloneDX/SPDX SBOM files, and full registry enumeration. Uses a bounded worker pool backed by `errgroup` to download artifacts in parallel, reusing `handler.Proxy.GetOrFetchArtifact()` for the actual fetch-and-cache work.
297+
298+
The package also provides a `MetadataCache` for storing raw upstream metadata blobs so the proxy can serve metadata responses offline. The `JobStore` manages async mirror jobs exposed via the `/api/mirror` endpoints.
299+
280300
### `internal/config`
281301

282302
Configuration loading.
@@ -326,10 +346,11 @@ Eviction can be implemented as:
326346
- Ensures clients fetch artifacts through proxy
327347
- Alternative: Let clients fetch directly, miss cache opportunity
328348

329-
**Why not cache metadata?**
349+
**Why not cache metadata (by default)?**
330350
- Simplicity - no invalidation logic needed
331351
- Fresh data - new versions visible immediately
332352
- Metadata is small, upstream fetch is fast
353+
- Opt in with `cache_metadata: true` (or by running the mirror command) to store metadata in the `metadata_cache` table for offline use
333354

334355
**Why stream artifacts?**
335356
- Memory efficient - don't load large files into RAM

docs/configuration.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,40 @@ Resolution order: package override, then ecosystem override, then global default
211211

212212
Currently supported for npm, PyPI, pub.dev, and Composer. These ecosystems include publish timestamps in their metadata. Other ecosystems (Go, Cargo, RubyGems) would require extra API calls and are not yet supported.
213213

214+
## Metadata Caching
215+
216+
By default, the proxy fetches metadata fresh from upstream on every request. Enable `cache_metadata` to store metadata responses in the database and storage backend for offline fallback. When upstream is unreachable, the proxy serves the last cached copy. ETag-based revalidation avoids re-downloading unchanged metadata.
217+
218+
```yaml
219+
cache_metadata: true
220+
```
221+
222+
Or via environment variable: `PROXY_CACHE_METADATA=true`.
223+
224+
The `proxy mirror` command always enables metadata caching regardless of this setting.
225+
226+
## Mirror Command
227+
228+
The `proxy mirror` command pre-populates the cache from various sources. It accepts the same storage and database flags as `serve`.
229+
230+
| Flag | Default | Description |
231+
|------|---------|-------------|
232+
| `--sbom` | | Path to CycloneDX or SPDX SBOM file |
233+
| `--registry` | | Ecosystem name for full registry mirror |
234+
| `--concurrency` | `4` | Number of parallel downloads |
235+
| `--dry-run` | `false` | Show what would be mirrored without downloading |
236+
| `--config` | | Path to configuration file |
237+
| `--storage-url` | | Storage URL |
238+
| `--database-driver` | | Database driver |
239+
| `--database-path` | | SQLite database file |
240+
| `--database-url` | | PostgreSQL connection URL |
241+
242+
Positional arguments are treated as PURLs:
243+
244+
```bash
245+
proxy mirror pkg:npm/lodash@4.17.21 pkg:cargo/serde@1.0.0
246+
```
247+
214248
## Docker
215249

216250
### SQLite with Local Storage

go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ require (
3636
github.com/Antonboom/nilnil v1.1.1 // indirect
3737
github.com/Antonboom/testifylint v1.6.4 // indirect
3838
github.com/BurntSushi/toml v1.6.0 // indirect
39+
github.com/CycloneDX/cyclonedx-go v0.10.0 // indirect
3940
github.com/Djarvur/go-err113 v0.1.1 // indirect
4041
github.com/KyleBanks/depth v1.2.1 // indirect
4142
github.com/Masterminds/semver/v3 v3.4.0 // indirect
@@ -50,6 +51,7 @@ require (
5051
github.com/alfatraining/structtag v1.0.0 // indirect
5152
github.com/alingse/asasalint v0.0.11 // indirect
5253
github.com/alingse/nilnesserr v0.2.0 // indirect
54+
github.com/anchore/go-struct-converter v0.1.0 // indirect
5355
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
5456
github.com/ashanbrown/forbidigo/v2 v2.3.0 // indirect
5557
github.com/ashanbrown/makezero/v2 v2.1.0 // indirect
@@ -227,6 +229,7 @@ require (
227229
github.com/sivchari/containedctx v1.0.3 // indirect
228230
github.com/sonatard/noctx v0.4.0 // indirect
229231
github.com/sourcegraph/go-diff v0.7.0 // indirect
232+
github.com/spdx/tools-golang v0.5.7 // indirect
230233
github.com/spf13/afero v1.15.0 // indirect
231234
github.com/spf13/cast v1.5.0 // indirect
232235
github.com/spf13/cobra v1.10.2 // indirect

go.sum

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ github.com/Antonboom/testifylint v1.6.4 h1:gs9fUEy+egzxkEbq9P4cpcMB6/G0DYdMeiFS8
4545
github.com/Antonboom/testifylint v1.6.4/go.mod h1:YO33FROXX2OoUfwjz8g+gUxQXio5i9qpVy7nXGbxDD4=
4646
github.com/BurntSushi/toml v1.6.0 h1:dRaEfpa2VI55EwlIW72hMRHdWouJeRF7TPYhI+AUQjk=
4747
github.com/BurntSushi/toml v1.6.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
48+
github.com/CycloneDX/cyclonedx-go v0.10.0 h1:7xyklU7YD+CUyGzSFIARG18NYLsKVn4QFg04qSsu+7Y=
49+
github.com/CycloneDX/cyclonedx-go v0.10.0/go.mod h1:vUvbCXQsEm48OI6oOlanxstwNByXjCZ2wuleUlwGEO8=
4850
github.com/Djarvur/go-err113 v0.1.1 h1:eHfopDqXRwAi+YmCUas75ZE0+hoBHJ2GQNLYRSxao4g=
4951
github.com/Djarvur/go-err113 v0.1.1/go.mod h1:IaWJdYFLg76t2ihfflPZnM1LIQszWOsFDh2hhhAVF6k=
5052
github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0 h1:sBEjpZlNHzK1voKq9695PJSX2o5NEXl7/OL3coiIY0c=
@@ -84,6 +86,8 @@ github.com/alingse/asasalint v0.0.11 h1:SFwnQXJ49Kx/1GghOFz1XGqHYKp21Kq1nHad/0WQ
8486
github.com/alingse/asasalint v0.0.11/go.mod h1:nCaoMhw7a9kSJObvQyVzNTPBDbNpdocqrSP7t/cW5+I=
8587
github.com/alingse/nilnesserr v0.2.0 h1:raLem5KG7EFVb4UIDAXgrv3N2JIaffeKNtcEXkEWd/w=
8688
github.com/alingse/nilnesserr v0.2.0/go.mod h1:1xJPrXonEtX7wyTq8Dytns5P2hNzoWymVUIaKm4HNFg=
89+
github.com/anchore/go-struct-converter v0.1.0 h1:2rDRssAl6mgKBSLNiVCMADgZRhoqtw9dedlWa0OhD30=
90+
github.com/anchore/go-struct-converter v0.1.0/go.mod h1:rYqSE9HbjzpHTI74vwPvae4ZVYZd1lue2ta6xHPdblA=
8791
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
8892
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
8993
github.com/ashanbrown/forbidigo/v2 v2.3.0 h1:OZZDOchCgsX5gvToVtEBoV2UWbFfI6RKQTir2UZzSxo=
@@ -562,6 +566,8 @@ github.com/sonatard/noctx v0.4.0 h1:7MC/5Gg4SQ4lhLYR6mvOP6mQVSxCrdyiExo7atBs27o=
562566
github.com/sonatard/noctx v0.4.0/go.mod h1:64XdbzFb18XL4LporKXp8poqZtPKbCrqQ402CV+kJas=
563567
github.com/sourcegraph/go-diff v0.7.0 h1:9uLlrd5T46OXs5qpp8L/MTltk0zikUGi0sNNyCpA8G0=
564568
github.com/sourcegraph/go-diff v0.7.0/go.mod h1:iBszgVvyxdc8SFZ7gm69go2KDdt3ag071iBaWPF6cjs=
569+
github.com/spdx/tools-golang v0.5.7 h1:+sWcKGnhwp3vLdMqPcLdA6QK679vd86cK9hQWH3AwCg=
570+
github.com/spdx/tools-golang v0.5.7/go.mod h1:jg7w0LOpoNAw6OxKEzCoqPC2GCTj45LyTlVmXubDsYw=
565571
github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I=
566572
github.com/spf13/afero v1.15.0/go.mod h1:NC2ByUVxtQs4b3sIUphxK0NioZnmxgyCrfzeuq8lxMg=
567573
github.com/spf13/cast v1.5.0 h1:rj3WzYc11XZaIZMPKmwP96zkFEnnAmV8s6XbB2aY32w=

0 commit comments

Comments
 (0)