Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/helpers/get-page-urls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -328,14 +328,21 @@ async function discoverSitemapUrls(ctx: CheckContext, originOverride?: string):

// Build fallback candidates: origin-level sitemap first, then subpath sitemaps
// when the base URL has a non-root path (e.g. swagger.io/docs/).
// Both `sitemap-index.xml` (hyphen) and `sitemap_index.xml` (underscore) are
// observed in the wild; e.g. Document360's CMS emits the underscore form.
const fallbackOrigin = originOverride ?? ctx.origin;
- const candidates = [`${fallbackOrigin}/sitemap.xml`];
const candidates = [
`${fallbackOrigin}/sitemap.xml`,
`${fallbackOrigin}/sitemap-index.xml`,
`${fallbackOrigin}/sitemap_index.xml`,
];

const baseUrlPath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
if (baseUrlPath && baseUrlPath !== '') {
const subpathBase = `${fallbackOrigin}${baseUrlPath}`;
candidates.push(`${subpathBase}/sitemap.xml`);
candidates.push(`${subpathBase}/sitemap-index.xml`);
candidates.push(`${subpathBase}/sitemap_index.xml`);
}

return candidates;
Expand Down
6 changes: 6 additions & 0 deletions test/helpers/mock-sitemap-not-found.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ export function mockSitemapNotFound(server: SetupServerApi, baseUrl: string): vo
const handlers = [
http.get(`${parsed.origin}/robots.txt`, () => new HttpResponse('', { status: 404 })),
http.get(`${parsed.origin}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
http.get(`${parsed.origin}/sitemap-index.xml`, () => new HttpResponse('', { status: 404 })),
http.get(`${parsed.origin}/sitemap_index.xml`, () => new HttpResponse('', { status: 404 })),
];
const subpath = parsed.pathname.replace(/\/$/, '');
if (subpath && subpath !== '') {
Expand All @@ -26,6 +28,10 @@ export function mockSitemapNotFound(server: SetupServerApi, baseUrl: string): vo
`${parsed.origin}${subpath}/sitemap-index.xml`,
() => new HttpResponse('', { status: 404 }),
),
http.get(
`${parsed.origin}${subpath}/sitemap_index.xml`,
() => new HttpResponse('', { status: 404 }),
),
);
}
server.use(...handlers);
Expand Down
2 changes: 2 additions & 0 deletions test/integration/check-pipeline.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ function setupSite(
handlers.push(
http.get(`http://${host}/robots.txt`, () => new HttpResponse('', { status: 404 })),
http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
http.get(`http://${host}/sitemap-index.xml`, () => new HttpResponse('', { status: 404 })),
http.get(`http://${host}/sitemap_index.xml`, () => new HttpResponse('', { status: 404 })),
);

const defaultCacheHeaders = opts.cacheControl ? { 'Cache-Control': opts.cacheControl } : {};
Expand Down
2 changes: 2 additions & 0 deletions test/integration/cross-check-contracts.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ describe('previousResults safety: checks handle missing dependencies gracefully'
const ctx = createContext(`http://${host}`, { requestDelay: 0 });
// No llms-txt-exists in previousResults, no llms.txt

mockSitemapNotFound(server, `http://${host}`);
server.use(
http.get(`http://${host}/llms.txt`, () => new HttpResponse(null, { status: 404 })),
http.get(`http://${host}/docs/llms.txt`, () => new HttpResponse(null, { status: 404 })),
Expand Down Expand Up @@ -368,6 +369,7 @@ describe('cross-check field contracts: empty/missing upstream details', () => {
details: { discoveredFiles: [] },
});

mockSitemapNotFound(server, `http://${host}`);
server.use(
http.get(`http://${host}/llms.txt`, () => new HttpResponse(null, { status: 404 })),
http.get(`http://${host}/docs/llms.txt`, () => new HttpResponse(null, { status: 404 })),
Expand Down
2 changes: 2 additions & 0 deletions test/integration/dependency-chains.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ function setupSite(
http.get(`http://${host}/docs/llms.txt`, () => new HttpResponse(null, { status: 404 })),
http.get(`http://${host}/robots.txt`, () => new HttpResponse('', { status: 404 })),
http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
http.get(`http://${host}/sitemap-index.xml`, () => new HttpResponse('', { status: 404 })),
http.get(`http://${host}/sitemap_index.xml`, () => new HttpResponse('', { status: 404 })),
);

for (const page of opts.pages) {
Expand Down
10 changes: 10 additions & 0 deletions test/integration/scoring-pipeline.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ function setupSite(
http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
);
}
handlers.push(
http.get(`http://${host}/sitemap-index.xml`, () => new HttpResponse('', { status: 404 })),
http.get(`http://${host}/sitemap_index.xml`, () => new HttpResponse('', { status: 404 })),
);

// Root URL for homepage-based discovery
const pageLinks = opts.pages
Expand Down Expand Up @@ -415,6 +419,12 @@ describe('scoring pipeline: resolutions populated for real check failures', () =
it('each failing check produces a resolution string', async () => {
const { pages } = makePages(host, 6);
setupSite(host, { pages, cacheControl: 'max-age=300' });
// No llms.txt or sitemap → discovery falls back to baseUrl, and
// markdown-url-support probes baseUrl's .md candidates.
server.use(
http.get(`http://${host}/.md`, () => new HttpResponse(null, { status: 404 })),
http.get(`http://${host}/index.md`, () => new HttpResponse(null, { status: 404 })),
);

const report = await runChecks(`http://${host}`, {
requestDelay: 0,
Expand Down
1 change: 1 addition & 0 deletions test/unit/checks/content-start-position.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ describe('content-start-position', () => {
// ── Fallback to baseUrl ──

it('falls back to baseUrl when no llms.txt', async () => {
mockSitemapNotFound(server, 'http://csp-fb.local');
server.use(
http.get(
'http://csp-fb.local/llms.txt',
Expand Down
19 changes: 3 additions & 16 deletions test/unit/checks/llms-txt-coverage.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { setupServer } from 'msw/node';
import { getCheck } from '../../../src/checks/registry.js';
import { createContext } from '../../../src/runner.js';
import type { DiscoveredFile } from '../../../src/types.js';
import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js';
import {
hasLocaleCodeAt,
filterToUnprefixedLocale,
Expand Down Expand Up @@ -288,15 +289,7 @@ describe('llms-txt-coverage', () => {
const host = 'cov-no-sitemap.local';
const ctx = makeCtx(host, [`http://${host}/docs/page`], '/docs');

- server.use(
- http.get(`http://${host}/robots.txt`, () => new HttpResponse('', { status: 404 })),
- http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
- http.get(`http://${host}/docs/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
- http.get(
- `http://${host}/docs/sitemap-index.xml`,
- () => new HttpResponse('', { status: 404 }),
- ),
- );
mockSitemapNotFound(server, `http://${host}/docs`);

const result = await check.run(ctx);
expect(result.status).toBe('skip');
Expand Down Expand Up @@ -527,10 +520,8 @@ describe('llms-txt-coverage', () => {

const ctx = makeCtx(host, docPages, '/docs');

mockSitemapNotFound(server, `http://${host}/docs`);
server.use(
- // No main sitemap
- http.get(`http://${host}/robots.txt`, () => new HttpResponse('', { status: 404 })),
- http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
// Docs sitemap is an index
http.get(
`http://${host}/docs/sitemap.xml`,
Expand All @@ -548,10 +539,6 @@ describe('llms-txt-coverage', () => {
headers: { 'content-type': 'application/xml' },
}),
),
- http.get(
- `http://${host}/docs/sitemap-index.xml`,
- () => new HttpResponse('', { status: 404 }),
- ),
);

const result = await check.run(ctx);
Expand Down
1 change: 1 addition & 0 deletions test/unit/checks/markdown-url-support.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,7 @@ describe('markdown-url-support', () => {
// false-positive the check for a /auth/index.html page.
it('does not test /foo.md when /foo/index.html came from sitemap (issue #77 isolation)', async () => {
const requestLog: string[] = [];
mockSitemapNotFound(server, 'http://parentclean.local');
server.use(
http.get('http://parentclean.local/robots.txt', () => new HttpResponse('', { status: 404 })),
http.get(
Expand Down
1 change: 1 addition & 0 deletions test/unit/checks/page-size-html.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ describe('page-size-html', () => {
});

it('falls back to baseUrl when no llms.txt', async () => {
mockSitemapNotFound(server, 'http://ps-html-fb.local');
server.use(
http.get(
'http://ps-html-fb.local/llms.txt',
Expand Down
2 changes: 2 additions & 0 deletions test/unit/checks/page-size-markdown.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ describe('page-size-markdown', () => {

it('works in standalone mode when dependencies never ran', async () => {
mockNoLlmsTxt('ps-md-standalone.local');
mockSitemapNotFound(server, 'http://ps-md-standalone.local');
server.use(
http.get(
'http://ps-md-standalone.local/robots.txt',
Expand Down Expand Up @@ -187,6 +188,7 @@ describe('page-size-markdown', () => {

it('skips in standalone mode when no markdown found', async () => {
mockNoLlmsTxt('ps-md-nomd.local');
mockSitemapNotFound(server, 'http://ps-md-nomd.local');
server.use(
http.get('http://ps-md-nomd.local/robots.txt', () => new HttpResponse('', { status: 404 })),
http.get('http://ps-md-nomd.local/sitemap.xml', () => new HttpResponse('', { status: 404 })),
Expand Down
Loading