Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed
- Changed `/api/source` api to support fetching source code for any revision, not just revisions that are indexed by zoekt. [#829](https://github.com/sourcebot-dev/sourcebot/pull/829)

## [4.10.20] - 2026-01-28

### Fixed
Expand Down
1 change: 1 addition & 0 deletions packages/backend/src/github.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export type OctokitRepository = {
stargazers_count?: number,
watchers_count?: number,
subscribers_count?: number,
default_branch?: string,
forks_count?: number,
archived?: boolean,
topics?: string[],
Expand Down
1 change: 1 addition & 0 deletions packages/backend/src/repoCompileUtils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ vi.mock('./git.js', () => ({
isPathAValidGitRepoRoot: vi.fn(),
getOriginUrl: vi.fn(),
isUrlAValidGitRepo: vi.fn(),
getLocalDefaultBranch: vi.fn(),
}));

// Mock the glob module
Expand Down
20 changes: 18 additions & 2 deletions packages/backend/src/repoCompileUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import { BitbucketConnectionConfig, GerritConnectionConfig, GiteaConnectionConfi
import { ProjectVisibility } from "azure-devops-node-api/interfaces/CoreInterfaces.js";
import path from 'path';
import { glob } from 'glob';
import { getOriginUrl, isPathAValidGitRepoRoot, isUrlAValidGitRepo } from './git.js';
import { getLocalDefaultBranch, getOriginUrl, isPathAValidGitRepoRoot, isUrlAValidGitRepo } from './git.js';
import assert from 'assert';
import GitUrlParse from 'git-url-parse';
import { RepoMetadata } from '@sourcebot/shared';
Expand Down Expand Up @@ -118,6 +118,7 @@ export const createGitHubRepoRecord = ({
cloneUrl: cloneUrl.toString(),
webUrl: repo.html_url,
name: repoName,
defaultBranch: repo.default_branch,
displayName: repoDisplayName,
imageUrl: repo.owner.avatar_url,
isFork: repo.fork,
Expand Down Expand Up @@ -185,6 +186,7 @@ export const compileGitlabConfig = async (
cloneUrl: cloneUrl.toString(),
webUrl: projectUrl,
name: repoName,
defaultBranch: project.default_branch,
displayName: repoDisplayName,
imageUrl: avatarUrl,
isFork: isFork,
Expand Down Expand Up @@ -257,6 +259,7 @@ export const compileGiteaConfig = async (
webUrl: repo.html_url,
name: repoName,
displayName: repoDisplayName,
defaultBranch: repo.default_branch,
imageUrl: repo.owner?.avatar_url,
isFork: repo.fork!,
isPublic: isPublic,
Expand Down Expand Up @@ -339,6 +342,10 @@ export const compileGerritConfig = async (
webUrl: webUrl,
name: repoName,
displayName: repoDisplayName,
// @note: the gerrit api doesn't return the default branch (without a separate query).
// Instead, the default branch will be set once the repo is cloned.
// @see: repoIndexManager.ts
defaultBranch: undefined,
isFork: false,
isArchived: false,
org: {
Expand Down Expand Up @@ -444,6 +451,7 @@ export const compileBitbucketConfig = async (
const repoName = path.join(repoNameRoot, displayName);
const cloneUrl = getCloneUrl(repo);
const webUrl = getWebUrl(repo);
const defaultBranch = isServer ? (repo as BitbucketServerRepository).defaultBranch : (repo as BitbucketCloudRepository).mainbranch?.name;

const record: RepoData = {
external_id: externalId,
Expand All @@ -453,6 +461,7 @@ export const compileBitbucketConfig = async (
webUrl: webUrl,
name: repoName,
displayName: displayName,
defaultBranch,
isFork: isFork,
isPublic: isPublic,
isArchived: isArchived,
Expand Down Expand Up @@ -557,6 +566,8 @@ export const compileGenericGitHostConfig_file = async (

const remoteUrl = GitUrlParse(origin);

const defaultBranch = await getLocalDefaultBranch({ path: repoPath });

// @note: matches the naming here:
// https://github.com/sourcebot-dev/zoekt/blob/main/gitindex/index.go#L293
// Go's url.URL.Host includes the port if present (even default ports like 443),
Expand All @@ -573,6 +584,7 @@ export const compileGenericGitHostConfig_file = async (
cloneUrl: `file://${repoPath}`,
name: repoName,
displayName: repoName,
defaultBranch,
isFork: false,
isArchived: false,
org: {
Expand Down Expand Up @@ -612,7 +624,6 @@ export const compileGenericGitHostConfig_file = async (
}
}


export const compileGenericGitHostConfig_url = async (
config: GenericGitHostConnectionConfig,
connectionId: number,
Expand Down Expand Up @@ -645,6 +656,10 @@ export const compileGenericGitHostConfig_url = async (
cloneUrl: remoteUrl.toString(),
name: repoName,
displayName: repoName,
// @note: we can't determine the default branch from the remote url.
// Instead, the default branch will be set once the repo is cloned.
// @see: repoIndexManager.ts
defaultBranch: undefined,
isFork: false,
isArchived: false,
org: {
Expand Down Expand Up @@ -719,6 +734,7 @@ export const compileAzureDevOpsConfig = async (
webUrl: webUrl,
name: repoName,
displayName: repoDisplayName,
defaultBranch: repo.defaultBranch,
imageUrl: null,
isFork: !!repo.isFork,
isArchived: false,
Expand Down
8 changes: 8 additions & 0 deletions packages/backend/src/repoIndexManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,7 @@ export class RepoIndexManager {
});

const pushedAt = await getLatestCommitTimestamp({ path: repoPath });
const defaultBranch = await getLocalDefaultBranch({ path: repoPath });

const jobMetadata = repoIndexingJobMetadataSchema.parse(jobData.metadata);

Expand All @@ -511,6 +512,13 @@ export class RepoIndexManager {
...(jobData.repo.metadata as RepoMetadata),
indexedRevisions: jobMetadata.indexedRevisions,
} satisfies RepoMetadata,
// @note: always update the default branch. While this field can be set
// during connection syncing, by setting it here we ensure that a) the
// default branch is as up to date as possible (since repo indexing happens
// more frequently than connection syncing) and b) for hosts where it is
// impossible to determine the default branch from the host's API
// (e.g., generic git url), we still set the default branch here.
defaultBranch: defaultBranch,
}
});

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- AlterTable
ALTER TABLE "Repo" ADD COLUMN "defaultBranch" TEXT;
1 change: 1 addition & 0 deletions packages/db/prisma/schema.prisma
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ model Repo {
webUrl String?
connections RepoToConnection[]
imageUrl String?
defaultBranch String?

permittedAccounts AccountToRepoPermission[]
permissionSyncJobs RepoPermissionSyncJob[]
Expand Down
1 change: 1 addition & 0 deletions packages/web/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@
"input-otp": "^1.4.2",
"langfuse": "^3.38.4",
"langfuse-vercel": "^3.38.4",
"linguist-languages": "^9.3.1",
"lucide-react": "^0.517.0",
"micromatch": "^4.0.8",
"next": "15.5.9",
Expand Down
119 changes: 52 additions & 67 deletions packages/web/src/features/search/fileSourceApi.ts
Original file line number Diff line number Diff line change
@@ -1,85 +1,70 @@
import 'server-only';
import { fileNotFound, ServiceError, unexpectedError } from "../../lib/serviceError";
import { fileNotFound, notFound, ServiceError, unexpectedError } from "../../lib/serviceError";
import { FileSourceRequest, FileSourceResponse } from "./types";
import { isServiceError } from "../../lib/utils";
import { search } from "./searchApi";
import { sew } from "@/actions";
import { withOptionalAuthV2 } from "@/withAuthV2";
import { QueryIR } from './ir';
import escapeStringRegexp from "escape-string-regexp";
import { getRepoPath } from '@sourcebot/shared';
import { simpleGit } from 'simple-git';
import { detectLanguageFromFilename } from "@/lib/languageDetection";
import { getBrowsePath } from "@/app/[domain]/browse/hooks/utils";
import { getCodeHostBrowseFileAtBranchUrl } from "@/lib/utils";
import { SINGLE_TENANT_ORG_DOMAIN } from "@/lib/constants";

// @todo (bkellam) #574 : We should really be using `git show <hash>:<path>` to fetch file contents here.
// This will allow us to support permalinks to files at a specific revision that may not be indexed
// by zoekt. We should also refactor this out of the /search folder.

export const getFileSource = async ({ path, repo, ref }: FileSourceRequest): Promise<FileSourceResponse | ServiceError> => sew(() =>
withOptionalAuthV2(async () => {
const query: QueryIR = {
and: {
children: [
{
repo: {
regexp: `^${escapeStringRegexp(repo)}$`,
},
},
{
substring: {
pattern: path,
case_sensitive: true,
file_name: true,
content: false,
}
},
...(ref ? [{
branch: {
pattern: ref,
exact: true,
},
}]: [])
]
}
}

const searchResponse = await search({
queryType: 'ir',
query,
options: {
matches: 1,
whole: true,
}
export const getFileSource = async ({ path: filePath, repo: repoName, ref }: FileSourceRequest): Promise<FileSourceResponse | ServiceError> => sew(() =>
withOptionalAuthV2(async ({ org, prisma }) => {
const repo = await prisma.repo.findFirst({
where: { name: repoName, orgId: org.id },
});

if (isServiceError(searchResponse)) {
return searchResponse;
if (!repo) {
return notFound(`Repository "${repoName}" not found.`);
}

const files = searchResponse.files;

if (!files || files.length === 0) {
return fileNotFound(path, repo);
}
const { path: repoPath } = getRepoPath(repo);
const git = simpleGit().cwd(repoPath);

const file = files[0];
const source = file.content ?? '';
const language = file.language;
const gitRef = ref ??
repo.defaultBranch ??
'HEAD';

const repoInfo = searchResponse.repositoryInfo.find((repo) => repo.id === file.repositoryId);
if (!repoInfo) {
// This should never happen.
return unexpectedError("Repository info not found");
let source: string;
try {
source = await git.raw(['show', `${gitRef}:${filePath}`]);
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : String(error);
if (errorMessage.includes('does not exist') || errorMessage.includes('fatal: path')) {
return fileNotFound(filePath, repoName);
}
if (errorMessage.includes('unknown revision') || errorMessage.includes('bad revision') || errorMessage.includes('invalid object name')) {
return unexpectedError(`Invalid git reference: ${gitRef}`);
}
throw error;
}

const language = detectLanguageFromFilename(filePath);
const webUrl = getBrowsePath({
repoName: repo.name,
revisionName: ref,
path: filePath,
pathType: 'blob',
domain: SINGLE_TENANT_ORG_DOMAIN,
});
const externalWebUrl = getCodeHostBrowseFileAtBranchUrl({
webUrl: repo.webUrl,
codeHostType: repo.external_codeHostType,
branchName: gitRef,
filePath,
});
Comment on lines +51 to +56
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

cat -n packages/web/src/features/search/fileSourceApi.ts | head -80

Repository: sourcebot-dev/sourcebot

Length of output: 3321


🏁 Script executed:

# Find where gitRef is defined and how it defaults to 'HEAD'
rg -n "gitRef" packages/web/src/features/search/fileSourceApi.ts -B5 -A2

Repository: sourcebot-dev/sourcebot

Length of output: 1251


🏁 Script executed:

# Find the getCodeHostBrowseFileAtBranchUrl function definition
fd . packages/web packages/backend -name "*.ts" -type f | xargs grep -l "getCodeHostBrowseFileAtBranchUrl" | head -5

Repository: sourcebot-dev/sourcebot

Length of output: 297


🏁 Script executed:

# Search for getCodeHostBrowseFileAtBranchUrl function definition
rg -n "getCodeHostBrowseFileAtBranchUrl" packages/web packages/backend -t ts -B2 -A15 | grep -A15 "function\|export\|const getCodeHostBrowseFileAtBranchUrl"

Repository: sourcebot-dev/sourcebot

Length of output: 2341


🏁 Script executed:

# Also find the exact file containing this function
fd . packages/web packages/backend -name "*.ts" -type f | xargs grep -l "getCodeHostBrowseFileAtBranchUrl"

Repository: sourcebot-dev/sourcebot

Length of output: 299


🏁 Script executed:

sed -n '376,450p' packages/web/src/lib/utils.ts

Repository: sourcebot-dev/sourcebot

Length of output: 2283


🏁 Script executed:

# Check for tests related to this function
rg -n "getCodeHostBrowseFileAtBranchUrl" packages/ --type ts -g "*.test.*"

Repository: sourcebot-dev/sourcebot

Length of output: 49


🏁 Script executed:

# Check if there's any handling of 'HEAD' in external URL generation
rg -n "'HEAD'" packages/web/src/lib/utils.ts -B2 -A2

Repository: sourcebot-dev/sourcebot

Length of output: 49


🏁 Script executed:

# Check if ref being undefined is actually a valid case (check fileSourceApi usage/tests)
fd . packages/ -name "*.test.ts" -o -name "*.spec.ts" | xargs grep -l "fileSourceApi\|getFileSource" 2>/dev/null | head -3

Repository: sourcebot-dev/sourcebot

Length of output: 297


🏁 Script executed:

# Search for test files containing fileSourceApi
find packages -type f \( -name "*.test.ts" -o -name "*.spec.ts" \)

Repository: sourcebot-dev/sourcebot

Length of output: 746


🏁 Script executed:

# Check how ref parameter is used in FileSourceRequest type
rg -n "FileSourceRequest" packages/web/src/features/search/types.ts -A10

Repository: sourcebot-dev/sourcebot

Length of output: 505


🏁 Script executed:

# Find the fileSourceRequestSchema definition
rg -n "fileSourceRequestSchema" packages/web/src/features/search/types.ts -B5 -A5

Repository: sourcebot-dev/sourcebot

Length of output: 585


External URL will be invalid when ref is undefined and gitRef defaults to 'HEAD'.

When ref is not provided, gitRef becomes 'HEAD', which is then passed directly to getCodeHostBrowseFileAtBranchUrl as branchName. However, major code hosts (GitHub, GitLab, Gitea, Azure DevOps, Bitbucket) do not support 'HEAD' as a valid reference in browse URLs—they require actual branch names or commit SHAs. This results in broken external URLs.

Consider either resolving 'HEAD' to the actual default branch name before passing it to the function, or returning undefined for externalWebUrl when ref is not explicitly provided.

🤖 Prompt for AI Agents
In `@packages/web/src/features/search/fileSourceApi.ts` around lines 49 - 54,
externalWebUrl is built using gitRef which defaults to 'HEAD' when ref is
undefined, producing invalid browse URLs; update the logic around
getCodeHostBrowseFileAtBranchUrl: if ref is undefined, do not pass 'HEAD' —
either resolve the repository's default branch (use repo.default_branch or
repo.defaultBranch if present) and use that as branchName, or explicitly set
externalWebUrl to undefined when no explicit ref is provided; modify the code
that computes gitRef/externalWebUrl to check for ref === undefined and branch
resolution via repo.default_branch before calling
getCodeHostBrowseFileAtBranchUrl (referencing gitRef,
repo.default_branch/repo.defaultBranch, and externalWebUrl).


return {
source,
language,
path,
repo,
repoCodeHostType: repoInfo.codeHostType,
repoDisplayName: repoInfo.displayName,
repoExternalWebUrl: repoInfo.webUrl,
path: filePath,
repo: repoName,
repoCodeHostType: repo.external_codeHostType,
repoDisplayName: repo.displayName ?? undefined,
repoExternalWebUrl: repo.webUrl ?? undefined,
branch: ref,
webUrl: file.webUrl,
externalWebUrl: file.externalWebUrl,
webUrl,
externalWebUrl,
} satisfies FileSourceResponse;

}));
39 changes: 39 additions & 0 deletions packages/web/src/lib/languageDetection.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import * as linguistLanguages from 'linguist-languages';
import path from 'path';

// Lookup table from either a lowercased file extension or an exact file name
// to the name of the linguist entry that declared it. Both kinds of key share
// the same map.
const extensionToLanguage = new Map<string, string>();

for (const [languageName, languageData] of Object.entries(linguistLanguages)) {
    // Not every entry declares both lists; treat a missing list as empty.
    const declaredExtensions: readonly string[] =
        'extensions' in languageData && languageData.extensions
            ? languageData.extensions
            : [];
    const declaredFilenames: readonly string[] =
        'filenames' in languageData && languageData.filenames
            ? languageData.filenames
            : [];

    // Extensions are normalized to lowercase; file names are kept verbatim.
    // Extensions are registered before file names, preserving the original
    // registration order.
    const lookupKeys = [
        ...declaredExtensions.map((extension) => extension.toLowerCase()),
        ...declaredFilenames,
    ];

    for (const lookupKey of lookupKeys) {
        // The first language to claim a key keeps it.
        if (extensionToLanguage.has(lookupKey)) {
            continue;
        }
        extensionToLanguage.set(lookupKey, languageName);
    }
}

/**
 * Resolves a language name for a file using the `extensionToLanguage` table.
 *
 * Lookup order:
 *  1. The exact basename, case-sensitive (e.g., Makefile, Dockerfile).
 *  2. Every dot-suffix of the basename, longest first and lowercased. A
 *     compound extension present in the table (for example `.blade.php`, if
 *     registered) therefore wins over its shorter tail (`.php`). The last
 *     suffix tried is the one `path.extname` would return, so any file that
 *     previously matched by extension still matches.
 *
 * @param filename - File name or path; only its final path segment is inspected.
 * @returns The matched language name, or `''` when nothing matches.
 */
export const detectLanguageFromFilename = (filename: string): string => {
    const basename = path.basename(filename);

    // Check for exact filename match (e.g., Makefile, Dockerfile)
    const languageByFilename = extensionToLanguage.get(basename);
    if (languageByFilename !== undefined) {
        return languageByFilename;
    }

    // Check for extension match, longest suffix first. The scan starts at
    // index 1 so the leading dot of a dotfile (e.g. `.bashrc`) is not treated
    // as an extension separator, mirroring `path.extname`.
    for (
        let dotIndex = basename.indexOf('.', 1);
        dotIndex !== -1;
        dotIndex = basename.indexOf('.', dotIndex + 1)
    ) {
        const candidate = basename.slice(dotIndex).toLowerCase();
        const languageByExtension = extensionToLanguage.get(candidate);
        if (languageByExtension !== undefined) {
            return languageByExtension;
        }
    }

    return '';
};
2 changes: 1 addition & 1 deletion packages/web/src/lib/serviceError.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ export const invalidZoektResponse = async (zoektResponse: Response): Promise<Ser
};
}

export const fileNotFound = async (fileName: string, repository: string): Promise<ServiceError> => {
export const fileNotFound = (fileName: string, repository: string): ServiceError => {
return {
statusCode: StatusCodes.NOT_FOUND,
errorCode: ErrorCode.FILE_NOT_FOUND,
Expand Down
8 changes: 8 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -8395,6 +8395,7 @@ __metadata:
jsdom: "npm:^25.0.1"
langfuse: "npm:^3.38.4"
langfuse-vercel: "npm:^3.38.4"
linguist-languages: "npm:^9.3.1"
lucide-react: "npm:^0.517.0"
micromatch: "npm:^4.0.8"
next: "npm:15.5.9"
Expand Down Expand Up @@ -15018,6 +15019,13 @@ __metadata:
languageName: node
linkType: hard

"linguist-languages@npm:^9.3.1":
version: 9.3.1
resolution: "linguist-languages@npm:9.3.1"
checksum: 10c0/41d5c16b9f7095310003598f4568254ac9736fc6f67daa1f62a11ae9aaf6acc847451675dbb8387b70ed8daaef75656dba8c8057ae93e07152304f3c27aa7440
languageName: node
linkType: hard

"linkify-it@npm:^5.0.0":
version: 5.0.0
resolution: "linkify-it@npm:5.0.0"
Expand Down