Skip to content
72 changes: 72 additions & 0 deletions src/bin/cmd-index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { Source } from "../sources/types.js";
import { FilesystemStore } from "../stores/filesystem.js";
import { getS3Config } from "../stores/s3-config.js";
import { buildClientUserAgent } from "../core/utils.js";
import { parseSourceUrl } from "../core/url-parser.js";

// Shared store options
interface StoreOptions {
Expand Down Expand Up @@ -206,9 +207,80 @@ websiteCommand.action(async (options) => {
}
});

// `url` subcommand: indexes whatever a URL points at, inferring the source type from the URL
const urlCommand = new Command("url")
  .description("Index from URL with auto-detection (used internally when URL is passed directly)")
  .argument("<url>", "URL of the repository or website to index")
  .option("--ref <ref>", "Branch, tag, or commit (overrides URL-detected ref)");
addStoreOptions(urlCommand);
urlCommand.action(async (url: string, options) => {
  try {
    // The parser reports the source type, its config, and a default index name
    const parsed = parseSourceUrl(url);

    // Each source module is loaded lazily, only for the type that was detected
    let source: Source;
    if (parsed.type === "github") {
      const { GitHubSource } = await import("../sources/github.js");
      const githubConfig = parsed.config as import("../sources/github.js").GitHubSourceConfig;
      // An explicit --ref takes priority over the ref carried by the URL
      const ref = options.ref || githubConfig.ref;
      source = new GitHubSource({ ...githubConfig, ref });
    } else if (parsed.type === "gitlab") {
      const { GitLabSource } = await import("../sources/gitlab.js");
      const gitlabConfig = parsed.config as import("../sources/gitlab.js").GitLabSourceConfig;
      const ref = options.ref || gitlabConfig.ref;
      source = new GitLabSource({ ...gitlabConfig, ref });
    } else if (parsed.type === "bitbucket") {
      const { BitBucketSource } = await import("../sources/bitbucket.js");
      const bitbucketConfig = parsed.config as import("../sources/bitbucket.js").BitBucketSourceConfig;
      const ref = options.ref || bitbucketConfig.ref;
      source = new BitBucketSource({ ...bitbucketConfig, ref });
    } else if (parsed.type === "website") {
      // Website sources receive the parsed config as-is; --ref is not applied here
      const { WebsiteSource } = await import("../sources/website.js");
      const websiteConfig = parsed.config as import("../sources/website.js").WebsiteSourceConfig;
      source = new WebsiteSource(websiteConfig);
    } else {
      throw new Error(`Unknown source type: ${parsed.type}`);
    }

    // An index name given in the options wins over the one derived from the URL
    const indexKey = options.index || parsed.defaultIndexName;
    const store = await createStore(options);
    await runIndex(source, store, indexKey, parsed.type);
  } catch (error) {
    // Only Error instances whose message mentions "Invalid" are reported as URL parse errors
    if (!(error instanceof Error) || !error.message.includes("Invalid")) {
      console.error("Indexing failed:", error);
    } else {
      console.error(`Error parsing URL: ${error.message}`);
    }
    process.exit(1);
  }
});

// Main index command
export const indexCommand = new Command("index")
.usage("<url | source> [options]")
.description("Index a data source")
.addHelpText('after', `
Examples:
ctxc index https://github.com/owner/repo
ctxc index https://github.com/owner/repo -i myindex
ctxc index github --owner x --repo y`)
.addCommand(urlCommand, { hidden: true })
.addCommand(githubCommand)
.addCommand(gitlabCommand)
.addCommand(bitbucketCommand)
Expand Down
12 changes: 11 additions & 1 deletion src/bin/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,15 @@ program.addCommand(searchCommand);
program.addCommand(mcpCommand);
program.addCommand(agentCommand);

program.parse();
// Auto-detect URL mode: rewrite `ctxc index <url>` to `ctxc index url <url>`.
// The URL must be the first argument after 'index' (like any subcommand).
// NOTE(review): indexOf returns the first argv element equal to "index", which is
// not necessarily the subcommand (e.g. an option value spelled "index") — confirm
// this cannot misfire for other commands.
const indexIdx = process.argv.indexOf("index");

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The URL auto-rewrite only triggers when the URL is the first argument after `index`, so `ctxc index -i myidx https://…` won’t be rewritten and will likely fail with an error. Is that limitation intentional, or should the rewrite scan forward for the first non-option argument?

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is intentional. The URL is treated like a positional argument (similar to how `git clone <url>` works), so it must come first after `index`.

The supported patterns are:

ctxc index https://github.com/owner/repo           # URL first
ctxc index https://github.com/owner/repo -i myidx  # Options after URL

Scanning forward for non-option args would add complexity and could lead to ambiguous parsing in edge cases. The current behavior is consistent with how most CLI tools handle positional arguments.

// Route `ctxc index <url>` to the `url` subcommand by inserting 'url' into argv.
// Only the argument immediately following 'index' is inspected.
if (indexIdx !== -1 && indexIdx + 1 < process.argv.length) {
  const nextArg = process.argv[indexIdx + 1];
  // URL schemes are case-insensitive, so "HTTPS://..." must be recognised too
  if (/^https?:\/\//i.test(nextArg)) {
    // Insert 'url' before the URL
    process.argv.splice(indexIdx + 1, 0, "url");
  }
}

// Parse only after the rewrite so the parser sees the adjusted argv
program.parse();
3 changes: 3 additions & 0 deletions src/core/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ export type { ClientProduct, MCPClientInfo } from "./utils.js";
export { Indexer } from "./indexer.js";
export type { IndexerConfig } from "./indexer.js";

export { parseSourceUrl } from "./url-parser.js";
export type { ParsedUrl } from "./url-parser.js";

220 changes: 220 additions & 0 deletions src/core/url-parser.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
import { describe, it, expect } from "vitest";
import { parseSourceUrl } from "./url-parser.js";

describe("parseSourceUrl", () => {
describe("GitHub URLs", () => {
  // No branch in the URL: the expected ref is HEAD
  it("parses basic github.com URL", () => {
    const { type, config, defaultIndexName } = parseSourceUrl("https://github.com/owner/repo");
    expect(type).toBe("github");
    expect(config).toEqual({ owner: "owner", repo: "repo", ref: "HEAD" });
    expect(defaultIndexName).toBe("repo");
  });

  it("parses GitHub URL with tree/branch", () => {
    const { type, config, defaultIndexName } = parseSourceUrl("https://github.com/owner/repo/tree/main");
    expect(type).toBe("github");
    expect(config).toEqual({ owner: "owner", repo: "repo", ref: "main" });
    expect(defaultIndexName).toBe("repo");
  });

  // Everything after /tree/ is expected to be kept as the ref, slashes included
  it("parses GitHub URL with tree/feature/branch (slashes in branch name)", () => {
    const { type, config, defaultIndexName } = parseSourceUrl(
      "https://github.com/owner/repo/tree/feature/branch"
    );
    expect(type).toBe("github");
    expect(config).toEqual({ owner: "owner", repo: "repo", ref: "feature/branch" });
    expect(defaultIndexName).toBe("repo");
  });

  it("parses GitHub URL with commit SHA", () => {
    const { type, config, defaultIndexName } = parseSourceUrl(
      "https://github.com/owner/repo/commit/abc123def456"
    );
    expect(type).toBe("github");
    expect(config).toEqual({ owner: "owner", repo: "repo", ref: "abc123def456" });
    expect(defaultIndexName).toBe("repo");
  });

  it("throws on invalid GitHub URL without repo", () => {
    const parseOwnerOnly = () => parseSourceUrl("https://github.com/owner");
    expect(parseOwnerOnly).toThrow("Invalid GitHub URL");
  });
});

describe("GitLab URLs", () => {
  it("parses basic gitlab.com URL", () => {
    const { type, config, defaultIndexName } = parseSourceUrl("https://gitlab.com/group/project");
    expect(type).toBe("gitlab");
    expect(config).toEqual({ projectId: "group/project", ref: "HEAD", baseUrl: undefined });
    expect(defaultIndexName).toBe("project");
  });

  // Nested groups: the whole path is the project id, the last segment names the index
  it("parses GitLab URL with subgroups", () => {
    const { type, config, defaultIndexName } = parseSourceUrl("https://gitlab.com/group/subgroup/project");
    const expectedConfig = {
      projectId: "group/subgroup/project",
      ref: "HEAD",
      baseUrl: undefined,
    };
    expect(type).toBe("gitlab");
    expect(config).toEqual(expectedConfig);
    expect(defaultIndexName).toBe("project");
  });

  it("parses GitLab URL with /-/tree/branch", () => {
    const { type, config, defaultIndexName } = parseSourceUrl("https://gitlab.com/group/project/-/tree/main");
    expect(type).toBe("gitlab");
    expect(config).toEqual({ projectId: "group/project", ref: "main", baseUrl: undefined });
    expect(defaultIndexName).toBe("project");
  });

  it("parses GitLab URL with /-/tree/feature/branch", () => {
    const { type, config } = parseSourceUrl("https://gitlab.com/group/project/-/tree/feature/branch");
    const expectedConfig = {
      projectId: "group/project",
      ref: "feature/branch",
      baseUrl: undefined,
    };
    expect(type).toBe("gitlab");
    expect(config).toEqual(expectedConfig);
  });

  // A non-gitlab.com host is expected to surface as baseUrl
  it("parses self-hosted GitLab URL", () => {
    const { type, config, defaultIndexName } = parseSourceUrl("https://gitlab.mycompany.com/team/project");
    const expectedConfig = {
      projectId: "team/project",
      ref: "HEAD",
      baseUrl: "https://gitlab.mycompany.com",
    };
    expect(type).toBe("gitlab");
    expect(config).toEqual(expectedConfig);
    expect(defaultIndexName).toBe("project");
  });

  it("throws on invalid GitLab URL", () => {
    const parseGroupOnly = () => parseSourceUrl("https://gitlab.com/group");
    expect(parseGroupOnly).toThrow("Invalid GitLab URL");
  });
});

describe("Bitbucket URLs", () => {
  it("parses basic bitbucket.org URL", () => {
    const { type, config, defaultIndexName } = parseSourceUrl("https://bitbucket.org/workspace/repo");
    const expectedConfig = {
      workspace: "workspace",
      repo: "repo",
      ref: "HEAD",
      baseUrl: undefined,
    };
    expect(type).toBe("bitbucket");
    expect(config).toEqual(expectedConfig);
    expect(defaultIndexName).toBe("repo");
  });

  it("parses Bitbucket URL with /src/branch", () => {
    const { type, config } = parseSourceUrl("https://bitbucket.org/workspace/repo/src/main");
    const expectedConfig = {
      workspace: "workspace",
      repo: "repo",
      ref: "main",
      baseUrl: undefined,
    };
    expect(type).toBe("bitbucket");
    expect(config).toEqual(expectedConfig);
  });

  it("parses Bitbucket URL with /branch/feature", () => {
    const { type, config } = parseSourceUrl("https://bitbucket.org/workspace/repo/branch/feature");
    const expectedConfig = {
      workspace: "workspace",
      repo: "repo",
      ref: "feature",
      baseUrl: undefined,
    };
    expect(type).toBe("bitbucket");
    expect(config).toEqual(expectedConfig);
  });

  // A non-bitbucket.org host is expected to surface as baseUrl
  it("parses self-hosted Bitbucket URL", () => {
    const { type, config } = parseSourceUrl("https://bitbucket.mycompany.com/workspace/repo");
    const expectedConfig = {
      workspace: "workspace",
      repo: "repo",
      ref: "HEAD",
      baseUrl: "https://bitbucket.mycompany.com",
    };
    expect(type).toBe("bitbucket");
    expect(config).toEqual(expectedConfig);
  });

  it("throws on invalid Bitbucket URL", () => {
    const parseWorkspaceOnly = () => parseSourceUrl("https://bitbucket.org/workspace");
    expect(parseWorkspaceOnly).toThrow("Invalid Bitbucket URL");
  });
});

describe("Website URLs (fallback)", () => {
  // Hosts that match no known provider are expected to be treated as plain websites
  it("parses unknown URL as website", () => {
    const pageUrl = "https://docs.example.com/api/v2";
    const { type, config, defaultIndexName } = parseSourceUrl(pageUrl);
    expect(type).toBe("website");
    expect(config).toEqual({ url: "https://docs.example.com/api/v2" });
    expect(defaultIndexName).toBe("docs.example.com");
  });

  it("uses hostname as default index name for website", () => {
    const { type, defaultIndexName } = parseSourceUrl("https://react.dev/learn/thinking-in-react");
    expect(type).toBe("website");
    expect(defaultIndexName).toBe("react.dev");
  });
});

describe("Invalid URLs", () => {
  // Any thrown error is accepted here; no particular message is required
  it("throws on invalid URL format", () => {
    const parseGarbage = () => parseSourceUrl("not-a-url");
    expect(parseGarbage).toThrow();
  });
});
});


describe("Edge cases", () => {
describe(".git suffix handling", () => {
  // A trailing ".git" is expected to be dropped from the repo/project name for every provider
  it("strips .git suffix from GitHub URLs", () => {
    const { type, config, defaultIndexName } = parseSourceUrl("https://github.com/owner/repo.git");
    expect(type).toBe("github");
    expect(config).toEqual({ owner: "owner", repo: "repo", ref: "HEAD" });
    expect(defaultIndexName).toBe("repo");
  });

  it("strips .git suffix from GitLab URLs", () => {
    const { type, config, defaultIndexName } = parseSourceUrl("https://gitlab.com/group/project.git");
    expect(type).toBe("gitlab");
    expect(config).toEqual({ projectId: "group/project", ref: "HEAD", baseUrl: undefined });
    expect(defaultIndexName).toBe("project");
  });

  it("strips .git suffix from Bitbucket URLs", () => {
    const { type, config, defaultIndexName } = parseSourceUrl("https://bitbucket.org/workspace/repo.git");
    const expectedConfig = {
      workspace: "workspace",
      repo: "repo",
      ref: "HEAD",
      baseUrl: undefined,
    };
    expect(type).toBe("bitbucket");
    expect(config).toEqual(expectedConfig);
    expect(defaultIndexName).toBe("repo");
  });
});

describe("Conservative self-hosted detection", () => {
  // Hosts that merely contain a provider name are expected to fall back to "website"
  it("detects gitlab.company.com as GitLab", () => {
    const { type } = parseSourceUrl("https://gitlab.company.com/team/project");
    expect(type).toBe("gitlab");
  });

  it("does NOT match notgitlab.com as GitLab", () => {
    const { type } = parseSourceUrl("https://notgitlab.com/some/path");
    expect(type).toBe("website");
  });

  it("does NOT match mygitlabserver.com as GitLab", () => {
    const { type } = parseSourceUrl("https://mygitlabserver.com/some/path");
    expect(type).toBe("website");
  });

  it("detects bitbucket.company.com as Bitbucket", () => {
    const { type } = parseSourceUrl("https://bitbucket.company.com/workspace/repo");
    expect(type).toBe("bitbucket");
  });

  it("does NOT match notbitbucket.org as Bitbucket", () => {
    const { type } = parseSourceUrl("https://notbitbucket.org/some/path");
    expect(type).toBe("website");
  });
});
});
Loading