Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 113 additions & 31 deletions internal/detector/nodescan.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"sort"
"strconv"
"strings"
"sync"
"time"

"github.com/step-security/dev-machine-guard/internal/executor"
Expand Down Expand Up @@ -68,10 +69,16 @@ func (s *NodeScanner) runCmd(ctx context.Context, timeout time.Duration, name st
return s.exec.RunWithTimeout(ctx, timeout, name, args...)
}

// runShellCmd runs a shell command string, delegating to the logged-in user when running as root.
// Falls through to the platform-aware free function for the normal (non-delegation) path.
func (s *NodeScanner) runShellCmd(ctx context.Context, timeout time.Duration, shellCmd string) (string, string, int, error) {
// runCmdInDir runs a command from `dir`, delegating to the logged-in user when
// running as root. On Windows this bypasses cmd /c entirely (see runCmdInDir
// in shellcmd.go); RunAsUser delegation is Unix-only, so the sudo path always
// constructs a shell string.
func (s *NodeScanner) runCmdInDir(ctx context.Context, timeout time.Duration, dir, name string, args ...string) (string, string, int, error) {
if s.shouldRunAsUser() {
shellCmd := "cd " + platformShellQuote(s.exec, dir) + " && " + name
for _, a := range args {
shellCmd += " " + a
}
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
stdout, err := s.exec.RunAsUser(ctx, s.loggedInUser, shellCmd)
Expand All @@ -83,7 +90,7 @@ func (s *NodeScanner) runShellCmd(ctx context.Context, timeout time.Duration, sh
}
return stdout, "", 0, nil
}
return runShellCmd(ctx, s.exec, timeout, shellCmd)
return runCmdInDir(ctx, s.exec, timeout, dir, name, args...)
}

// checkPath checks if a binary is available, using the logged-in user's PATH when running as root.
Expand Down Expand Up @@ -166,8 +173,7 @@ func (s *NodeScanner) scanYarnGlobal(ctx context.Context) (model.NodeScanResult,
}

start := time.Now()
shellCmd := "cd " + platformShellQuote(s.exec, globalDir) + " && yarn list --json --depth=0"
stdout, stderr, exitCode, _ := s.runShellCmd(ctx, 60*time.Second, shellCmd)
stdout, stderr, exitCode, _ := s.runCmdInDir(ctx, 60*time.Second, globalDir, "yarn", "list", "--json", "--depth=0")
duration := time.Since(start).Milliseconds()

errMsg := ""
Expand Down Expand Up @@ -228,7 +234,11 @@ type projectEntry struct {
modTime int64
}

// ScanProjects finds package.json files, sorts by most recently modified, then scans.
// ScanProjects finds package.json files, sorts by most recently modified, then
// scans projects concurrently. Per-project results are cached locally; on the
// next run we skip `npm ls` for any project whose package.json and lockfile
// haven't been modified since the cached scan timestamp.
//
// Respects the size limit (default 500MB, override via STEPSEC_MAX_NODE_SCAN_BYTES).
func (s *NodeScanner) ScanProjects(ctx context.Context, searchDirs []string) []model.NodeScanResult {
// Phase 1: Discover all package.json files
Expand All @@ -254,7 +264,6 @@ func (s *NodeScanner) ScanProjects(ctx context.Context, searchDirs []string) []m
if isInsideNodeModules(projectDir) {
return nil
}
// Get modification time for sorting
modTime := int64(0)
if info, err := entry.Info(); err == nil {
modTime = info.ModTime().Unix()
Expand All @@ -269,40 +278,117 @@ func (s *NodeScanner) ScanProjects(ctx context.Context, searchDirs []string) []m
return projects[i].modTime > projects[j].modTime
})

// Phase 3: Scan in order, respecting limits
maxBytes := getMaxProjectScanBytes()
var results []model.NodeScanResult
totalSize := int64(0)

// Phase 3: Build the work plan. For each project decide whether the
// previous cached result is still valid (skip) or we need to re-scan.
cachePath := scanCachePath(s.exec)
cache := loadScanCache(cachePath)
nowUnix := time.Now().Unix()

type plan struct {
dir string
pm string
skip bool
cached model.NodeScanResult
}
plans := make([]plan, 0, len(projects))
for i, p := range projects {
if i >= maxNodeProjects {
s.log.Progress(" Reached maximum of %d projects, stopping search", maxNodeProjects)
break
}
if totalSize > maxBytes {
s.log.Progress(" Reached data size limit (%d bytes collected, limit: %d bytes)", totalSize, maxBytes)
s.log.Progress(" Skipping remaining projects (prioritized by most recently modified)")
break
pm := DetectProjectPM(s.exec, p.dir)
pl := plan{dir: p.dir, pm: pm}
if entry, ok := cache.Projects[p.dir]; ok && entry.PackageManager == pm {
lockPath := lockfileFor(s.exec, p.dir, pm)
// No lockfile means we can't trust mtime — always re-scan.
if lockPath != "" {
pkgMt := mtimeOr0(s.exec, filepath.Join(p.dir, "package.json"))
lockMt := mtimeOr0(s.exec, lockPath)
if pkgMt <= entry.LastScanUnix && lockMt <= entry.LastScanUnix {
pl.skip = true
pl.cached = entry.CachedResult
}
}
}
plans = append(plans, pl)
}

s.log.Progress(" Found project: %s", p.dir)
pm := DetectProjectPM(s.exec, p.dir)
s.log.Progress(" Package manager: %s", pm)
// Phase 4: Dispatch fresh scans concurrently. Skipped projects already
// have a result; only cache-miss/invalid entries hit the worker pool.
results := make([]model.NodeScanResult, len(plans))
for i, pl := range plans {
if pl.skip {
results[i] = pl.cached
s.log.Progress(" Skipping (unchanged): %s (%s)", pl.dir, pl.pm)
}
}

r := s.scanProject(ctx, p.dir)
resultSize := int64(len(r.RawStdoutBase64)) + int64(len(r.RawStderrBase64))
workers := scanWorkerCount(s.exec)
jobs := make(chan int, len(plans))
var wg sync.WaitGroup
for range workers {
wg.Add(1)
go func() {
defer wg.Done()
for idx := range jobs {
pl := plans[idx]
s.log.Progress(" Scanning project: %s (%s)", pl.dir, pl.pm)
results[idx] = s.scanProject(ctx, pl.dir)
}
}()
}
scanned := 0
for i, pl := range plans {
if !pl.skip {
jobs <- i
scanned++
}
}
close(jobs)
wg.Wait()
s.log.Progress(" Scanned %d projects (%d skipped via cache)", scanned, len(plans)-scanned)

if totalSize+resultSize > maxBytes {
// Phase 5: Apply the size cap in mtime-desc order (matches prior behavior)
// and update cache with freshly-scanned successful results.
maxBytes := getMaxProjectScanBytes()
final := make([]model.NodeScanResult, 0, len(plans))
totalSize := int64(0)
for i := range plans {
r := results[i]
size := int64(len(r.RawStdoutBase64)) + int64(len(r.RawStderrBase64))
if totalSize+size > maxBytes {
s.log.Progress(" Reached data size limit (%d bytes collected, limit: %d bytes)", totalSize, maxBytes)
s.log.Progress(" Skipping remaining projects (prioritized by most recently modified)")
break
}
totalSize += size
final = append(final, r)
// Only cache successful fresh scans. Failed scans should be retried.
if !plans[i].skip && r.ExitCode == 0 {
cache.Projects[plans[i].dir] = cacheEntry{
PackageManager: plans[i].pm,
LastScanUnix: nowUnix,
CachedResult: r,
}
}
}

totalSize += resultSize
results = append(results, r)
// Drop cache entries for projects no longer on disk so the cache file
// doesn't grow unboundedly across runs.
seen := make(map[string]struct{}, len(plans))
for _, pl := range plans {
seen[pl.dir] = struct{}{}
}
for dir := range cache.Projects {
if _, ok := seen[dir]; !ok {
delete(cache.Projects, dir)
}
}
if err := cache.save(cachePath); err != nil {
s.log.Progress(" Warning: failed to write scan cache: %v", err)
}

return results
return final
}

func (s *NodeScanner) scanProject(ctx context.Context, projectDir string) model.NodeScanResult {
Expand Down Expand Up @@ -343,11 +429,7 @@ func (s *NodeScanner) scanProject(ctx context.Context, projectDir string) model.
}

start := time.Now()
cmdStr := "cd " + platformShellQuote(s.exec, projectDir) + " && " + cmd
for _, a := range args {
cmdStr += " " + a
}
stdout, stderr, exitCode, _ := s.runShellCmd(ctx, 30*time.Second, cmdStr)
stdout, stderr, exitCode, _ := s.runCmdInDir(ctx, 30*time.Second, projectDir, cmd, args...)
duration := time.Since(start).Milliseconds()

errMsg := ""
Expand Down
143 changes: 143 additions & 0 deletions internal/detector/nodescan_cache.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
package detector

import (
"encoding/json"
"os"
"path/filepath"
"runtime"
"strconv"

"github.com/step-security/dev-machine-guard/internal/executor"
"github.com/step-security/dev-machine-guard/internal/model"
)

// scanCacheVersion is the schema version stamped into every cache file.
// loadScanCache throws away any file whose version differs, which forces a
// full rescan rather than trusting data written in another layout.
const scanCacheVersion = 1

// cacheEntry is one project's cached scan result, used to skip re-running
// `npm/yarn/pnpm/bun ls` when neither package.json nor the lockfile has been
// modified since LastScanUnix.
type cacheEntry struct {
	// PackageManager is the package manager recorded when the entry was
	// written. ScanProjects only reuses the entry when the currently detected
	// package manager is the same.
	PackageManager string `json:"package_manager"`
	// LastScanUnix is the Unix timestamp (seconds) recorded for the run that
	// produced CachedResult; package.json and lockfile mtimes are compared
	// against it to decide whether the entry is still valid.
	LastScanUnix int64 `json:"last_scan_unix"`
	// CachedResult is the scan result returned as-is while the entry is valid.
	CachedResult model.NodeScanResult `json:"cached_result"`
}

// scanCache is the document persisted to the cache file: a schema version
// plus one cacheEntry per scanned project.
type scanCache struct {
	// Version is the schema version; loadScanCache rejects a file whose value
	// is not scanCacheVersion.
	Version int `json:"version"`
	// Projects maps a project directory path to that project's cached entry.
	Projects map[string]cacheEntry `json:"projects"`
}

// newScanCache builds an empty cache stamped with the current schema version.
// The Projects map is allocated up front so callers can insert entries
// without a nil check.
func newScanCache() *scanCache {
	fresh := new(scanCache)
	fresh.Version = scanCacheVersion
	fresh.Projects = make(map[string]cacheEntry)
	return fresh
}

// scanCachePath resolves where the per-project scan cache lives on disk.
//
// A non-empty STEPSEC_NODE_SCAN_CACHE environment variable wins outright
// (intended for tests and non-root runs). Otherwise the location is a fixed
// system directory chosen by the executor's reported OS.
func scanCachePath(exec executor.Executor) string {
	override := exec.Getenv("STEPSEC_NODE_SCAN_CACHE")
	switch {
	case override != "":
		return override
	case exec.GOOS() == "windows":
		return filepath.Join(`C:\ProgramData\StepSecurity\dev-machine-guard`, "scan-cache.json")
	default:
		return "/var/lib/stepsecurity/dev-machine-guard/scan-cache.json"
	}
}

// loadScanCache reads and decodes the cache file at path.
//
// It never fails: an unreadable file, malformed JSON, or a schema version
// other than scanCacheVersion all yield a fresh empty cache, so a damaged
// cache can only cost a full rescan. A decoded cache always has a non-nil
// Projects map.
func loadScanCache(path string) *scanCache {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return newScanCache()
	}

	loaded := new(scanCache)
	switch decodeErr := json.Unmarshal(raw, loaded); {
	case decodeErr != nil, loaded.Version != scanCacheVersion:
		return newScanCache()
	case loaded.Projects == nil:
		// `"projects": null` (or a missing key) decodes to a nil map.
		loaded.Projects = map[string]cacheEntry{}
	}
	return loaded
}

// save writes the cache to path atomically: the JSON is written to a
// temporary file in the same directory (created if missing) and then renamed
// over path, so readers never observe a partially written cache.
//
// It returns the first error from directory creation, encoding, writing,
// closing, or renaming. On every failure after the temporary file has been
// created — including a failed rename — the temporary file is removed so
// repeated failures do not litter the cache directory.
func (c *scanCache) save(path string) error {
	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return err
	}
	data, err := json.Marshal(c)
	if err != nil {
		return err
	}
	// The temp file must live in dir: rename is only atomic within one filesystem.
	tmp, err := os.CreateTemp(dir, ".scan-cache-*.tmp")
	if err != nil {
		return err
	}
	tmpPath := tmp.Name()
	if _, err := tmp.Write(data); err != nil {
		// Best-effort cleanup; the write error is the one worth reporting.
		_ = tmp.Close()
		_ = os.Remove(tmpPath)
		return err
	}
	if err := tmp.Close(); err != nil {
		_ = os.Remove(tmpPath)
		return err
	}
	if err := os.Rename(tmpPath, path); err != nil {
		// Without this the orphaned temp file would survive the failed rename.
		_ = os.Remove(tmpPath)
		return err
	}
	return nil
}

// lockfileFor locates the lockfile that package manager pm would use inside
// projectDir. Candidates are probed in a fixed order and the first one that
// exists is returned; the result is "" when none exists or pm is unrecognized.
func lockfileFor(exec executor.Executor, projectDir, pm string) string {
	// firstExisting returns the first candidate present in projectDir, or "".
	firstExisting := func(candidates ...string) string {
		for i := range candidates {
			full := filepath.Join(projectDir, candidates[i])
			if exec.FileExists(full) {
				return full
			}
		}
		return ""
	}

	switch pm {
	case "npm":
		return firstExisting("package-lock.json")
	case "yarn", "yarn-berry":
		return firstExisting("yarn.lock")
	case "pnpm":
		return firstExisting("pnpm-lock.yaml")
	case "bun":
		return firstExisting("bun.lock", "bun.lockb")
	}
	return ""
}

// mtimeOr0 reports the modification time of path as Unix seconds. An empty
// path or a failed stat yields 0 instead of an error.
func mtimeOr0(exec executor.Executor, path string) int64 {
	if path != "" {
		if fi, statErr := exec.Stat(path); statErr == nil {
			return fi.ModTime().Unix()
		}
	}
	return 0
}

// scanWorkerCount decides how many project scans may run concurrently.
//
// A positive integer in STEPSEC_NODE_SCAN_WORKERS is used verbatim. Any other
// value (unset, non-numeric, zero, negative) falls back to the CPU count
// clamped to the range [1, 8].
func scanWorkerCount(exec executor.Executor) int {
	const maxDefaultWorkers = 8

	if raw := exec.Getenv("STEPSEC_NODE_SCAN_WORKERS"); raw != "" {
		requested, convErr := strconv.Atoi(raw)
		if convErr == nil && requested > 0 {
			return requested
		}
	}

	switch cpus := runtime.NumCPU(); {
	case cpus > maxDefaultWorkers:
		return maxDefaultWorkers
	case cpus < 1:
		return 1
	default:
		return cpus
	}
}
Loading