Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"os"
"os/exec"
Expand All @@ -13,23 +14,27 @@ import (
)

func main() {
response := processRepository()
numProcessors := flag.Int("num-processors", 0, "Number of parallel scc workers (0 = scc default, 1 = minimum for large repos)")
flag.Parse()

response := processRepository(*numProcessors)
outputJSON(response)

// Always exit with code 0 - status details are in JSON response
}

// processRepository handles the main logic and returns a StandardResponse
func processRepository() StandardResponse {
func processRepository(numProcessors int) StandardResponse {
ctx := context.Background()

// Get target path from command line argument
// Get target path from remaining non-flag arguments
args := flag.Args()
var targetPath string
if len(os.Args) > 1 {
targetPath = os.Args[1]
if len(args) > 0 {
targetPath = args[0]
} else {
errorCode := ErrorCodeInvalidArguments
errorMessage := fmt.Sprintf("Usage: %s <target-path>", os.Args[0])
errorMessage := fmt.Sprintf("Usage: %s [--num-processors N] <target-path>", os.Args[0])
return StandardResponse{
Status: StatusFailure,
ErrorCode: &errorCode,
Expand All @@ -51,17 +56,25 @@ func processRepository() StandardResponse {
// Process single repository (the target path argument)
repoDir := config.TargetPath

insightsDb, err := NewInsightsDB(ctx, config.InsightsDatabase)
if err != nil {
errorCode := ErrorCodeDatabaseConnection
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", err)
return StandardResponse{
Status: StatusFailure,
ErrorCode: &errorCode,
ErrorMessage: &errorMessage,
dryRun := os.Getenv("IS_PROD_ENV") != "true"

var insightsDb *InsightsDB
if !dryRun {
var dbErr error
insightsDb, dbErr = NewInsightsDB(ctx, config.InsightsDatabase)
if dbErr != nil {
errorCode := ErrorCodeDatabaseConnection
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", dbErr)
return StandardResponse{
Status: StatusFailure,
ErrorCode: &errorCode,
ErrorMessage: &errorMessage,
}
}
defer insightsDb.Close()
} else {
fmt.Println("[DRY RUN] Skipping database connection")
}
defer insightsDb.Close()

// Get git URL for the repository
gitUrl, err := getGitRepositoryURL(repoDir)
Expand All @@ -76,7 +89,7 @@ func processRepository() StandardResponse {
}

// Process the repository with SCC
report, err := getSCCReport(config.SCCPath, repoDir)
report, err := getSCCReport(config.SCCPath, repoDir, numProcessors)
if err != nil {
errorCode := getErrorCodeFromSCCError(err)
errorMessage := fmt.Sprintf("Error processing repository '%s': %v", repoDir, err)
Expand All @@ -88,24 +101,32 @@ func processRepository() StandardResponse {
}
report.Repository.URL = gitUrl

// Save to database
if err := insightsDb.saveProjectCost(ctx, report.Repository, report.Cocomo.CostInDollars); err != nil {
errorCode := ErrorCodeDatabaseOperation
errorMessage := fmt.Sprintf("Error saving project cost: %v", err)
return StandardResponse{
Status: StatusFailure,
ErrorCode: &errorCode,
ErrorMessage: &errorMessage,
if dryRun {
fmt.Printf("[DRY RUN] Would save project cost: repo=%s cost=$%.2f\n", report.Repository.URL, report.Cocomo.CostInDollars)
fmt.Printf("[DRY RUN] Would save %d language stats entries\n", len(report.LanguageStats))
for _, ls := range report.LanguageStats {
fmt.Printf("[DRY RUN] language=%s lines=%d code=%d\n", ls.LanguageName, ls.Lines, ls.Code)
}
} else {
// Save to database
if err := insightsDb.saveProjectCost(ctx, report.Repository, report.Cocomo.CostInDollars); err != nil {
errorCode := ErrorCodeDatabaseOperation
errorMessage := fmt.Sprintf("Error saving project cost: %v", err)
return StandardResponse{
Status: StatusFailure,
ErrorCode: &errorCode,
ErrorMessage: &errorMessage,
}
}
}

if err := insightsDb.saveLanguageStats(ctx, report.Repository, report.LanguageStats); err != nil {
errorCode := ErrorCodeDatabaseOperation
errorMessage := fmt.Sprintf("Error saving language stats: %v", err)
return StandardResponse{
Status: StatusFailure,
ErrorCode: &errorCode,
ErrorMessage: &errorMessage,
if err := insightsDb.saveLanguageStats(ctx, report.Repository, report.LanguageStats); err != nil {
errorCode := ErrorCodeDatabaseOperation
errorMessage := fmt.Sprintf("Error saving language stats: %v", err)
return StandardResponse{
Status: StatusFailure,
ErrorCode: &errorCode,
ErrorMessage: &errorMessage,
}
}
}

Expand All @@ -120,10 +141,10 @@ func processRepository() StandardResponse {


// getSCCReport analyzes a directory with scc and returns a report containing the estimated cost and language statistics.
func getSCCReport(sccPath, dirPath string) (SCCReport, error) {
cost, err := getCost(sccPath, dirPath)
func getSCCReport(sccPath, dirPath string, numProcessors int) (SCCReport, error) {
cost, err := getCost(sccPath, dirPath, numProcessors)
if err != nil {
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v\"", err)
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v", dirPath, err)
}

// Skip saving to database if cost is 0 - do we want to do this?
Expand All @@ -133,7 +154,7 @@ func getSCCReport(sccPath, dirPath string) (SCCReport, error) {

projectPath := filepath.Base(dirPath)

langStats, err := getLanguageStats(sccPath, dirPath)
langStats, err := getLanguageStats(sccPath, dirPath, numProcessors)
if err != nil {
return SCCReport{}, fmt.Errorf("error getting language stats for '%s': %v", dirPath, err)
}
Expand Down Expand Up @@ -177,8 +198,8 @@ func getGitRepositoryURL(dirPath string) (string, error) {
}

// getCost runs the scc command and parses the output to get the estimated cost.
func getCost(sccPathPath, repoPath string) (float64, error) {
output, err := runSCC(sccPathPath, "--format=short", repoPath)
func getCost(sccPathPath, repoPath string, numProcessors int) (float64, error) {
output, err := runSCC(sccPathPath, numProcessors, "--format=short", repoPath)
if err != nil {
return 0, fmt.Errorf("failed to run scc command: %w", err)
}
Expand All @@ -192,8 +213,8 @@ func getCost(sccPathPath, repoPath string) (float64, error) {
}

// getLanguageStats runs the scc command and parses the output to get language statistics.
func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
output, err := runSCC(sccPathPath, "--format=json", repoPath)
func getLanguageStats(sccPathPath, repoPath string, numProcessors int) ([]LanguageStats, error) {
output, err := runSCC(sccPathPath, numProcessors, "--format=json", repoPath)
if err != nil {
return nil, fmt.Errorf("failed to run scc command: %w", err)
}
Expand All @@ -207,8 +228,18 @@ func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
}

// runSCC executes the scc command with the given arguments and returns the output.
func runSCC(sccPathPath string, args ...string) (string, error) {
cmd := exec.Command(sccPathPath, args...)
// When numProcessors > 0, scc is run with reduced parallelism to limit memory usage on large repos.
func runSCC(sccPathPath string, numProcessors int, args ...string) (string, error) {
var cmdArgs []string
if numProcessors > 0 {
n := strconv.Itoa(numProcessors)
cmdArgs = append(cmdArgs,
"--directory-walker-job-workers", n,
"--file-process-job-workers", n,
)
}
cmdArgs = append(cmdArgs, args...)
cmd := exec.Command(sccPathPath, cmdArgs...)
output, err := cmd.Output()
if err != nil {
if exitErr, ok := err.(*exec.ExitError); ok {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import subprocess
import time
from decimal import Decimal

Expand All @@ -8,6 +9,21 @@
from crowdgit.services.base.base_service import BaseService
from crowdgit.services.utils import run_shell_command

_LARGE_REPO_THRESHOLD_BYTES = 10 * 1024 * 1024 * 1024 # 10 GB


def _get_repo_size_bytes(repo_path: str) -> int:
"""Return total disk usage of repo_path in bytes using du -sb."""
try:
result = subprocess.run(
["du", "-sb", repo_path], capture_output=True, text=True, timeout=120
)
if result.returncode == 0:
return int(result.stdout.split()[0])
except Exception:
pass
return 0


class SoftwareValueService(BaseService):
"""Service for calculating software value metrics"""
Expand All @@ -20,16 +36,30 @@ def __init__(self):
async def run(self, repo_id: str, repo_path: str) -> None:
"""
Triggers software value binary for given repo.
Results are saved into insights database directly
Results are saved into insights database directly.
For repos larger than 10 GB, scc is run with minimum parallelism (1 worker)
to avoid OOM; results are identical.
"""
start_time = time.time()
execution_status = ExecutionStatus.SUCCESS
error_code = None
error_message = None

try:
cmd = [self.software_value_executable]

repo_size = _get_repo_size_bytes(repo_path)
if repo_size >= _LARGE_REPO_THRESHOLD_BYTES:
self.logger.info(
f"Repo size {repo_size / (1024**3):.1f} GB exceeds threshold — "
"running scc with num-processors=1"
)
cmd += ["--num-processors", "1"]

cmd.append(repo_path)

self.logger.info("Running software value...")
output = await run_shell_command([self.software_value_executable, repo_path])
output = await run_shell_command(cmd)
self.logger.info(f"Software value output: {output}")

# Parse JSON output and extract fields from StandardResponse structure
Expand Down
1 change: 1 addition & 0 deletions services/apps/git_integration/src/crowdgit/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,4 @@ def load_env_var(key: str, required=True, default=None):
STUCK_RECURRENT_REPO_TIMEOUT_HOURS = int(
load_env_var("STUCK_RECURRENT_REPO_TIMEOUT_HOURS", default="4")
)
IS_PROD_ENV: bool = load_env_var("NODE_ENV", required=False) == "production"
Loading