Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions apps/web/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@
"@tailwindcss/vite": "4.1.18",
"astro": "^6.3.1",
"pagefind": "^1.5.2",
"sanitize-html": "^2.17.3",
"svelte": "^5.55.5",
"tailwindcss": "4.1.18"
},
"devDependencies": {
"@astrojs/check": "^0.9.9",
"@types/sanitize-html": "^2.16.1",
"typescript": "^5.8.0",
"vitest": "^4.1.6"
}
Expand Down
8 changes: 6 additions & 2 deletions apps/web/src/__tests__/github.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,12 @@ describe('sanitizeContent', () => {
expect(sanitizeContent('before<!-- hidden instruction -->after')).toBe('beforeafter');
});

it('decodes HTML entities and re-strips', () => {
expect(sanitizeContent('&lt;script&gt;alert(1)&lt;/script&gt;')).toBe('alert(1)');
it('keeps encoded entities encoded (cannot bypass the parser)', () => {
// sanitize-html parses as HTML and preserves `&lt;` etc. as entities — so
// the rendered output is literal text, never a re-parsed <script> element.
const out = sanitizeContent('&lt;script&gt;alert(1)&lt;/script&gt;');
expect(out).not.toContain('<');
expect(out).not.toContain('>');
});

it('handles nested/malformed tags', () => {
Expand Down
42 changes: 18 additions & 24 deletions apps/web/src/lib/github.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { Octokit } from "@octokit/rest";
import sanitizeHtml from "sanitize-html";

export interface CommitInfo {
sha: string;
Expand Down Expand Up @@ -123,35 +124,28 @@ export async function getFileAtRef(
}

/**
 * Strip every HTML tag from untrusted content so it can be displayed as text.
 *
 * The work is delegated to sanitize-html, which parses the input as HTML
 * instead of pattern-matching it with regular expressions. Encoded entities
 * such as `&lt;` are therefore left encoded rather than decoded back into
 * markup, and malformed or nested tags cannot slip past a tag-shaped regex.
 *
 * @param raw - Untrusted string that may contain HTML markup.
 * @returns The input with all tags and attributes removed.
 */
export function sanitizeContent(raw: string): string {
  return sanitizeHtml(raw, {
    // Empty allow-lists: no element and no attribute survives.
    allowedTags: [],
    allowedAttributes: {},
    // Drop disallowed tags instead of escaping them into visible text.
    disallowedTagsMode: "discard",
  });
}

/**
 * Sanitize Pagefind excerpt HTML, preserving only `<mark>` highlight tags.
 *
 * Used for client-side search results rendered via `{@html ...}`, so anything
 * returned here is inserted into the page as live markup. Only the bare
 * `<mark>` element is allowed through; attributes on it are removed as well,
 * which keeps inline event handlers from riding along on an allowed tag.
 *
 * @param html - Excerpt HTML produced by the search index.
 * @returns The excerpt with every tag except attribute-less `<mark>` removed.
 */
export function sanitizeExcerpt(html: string): string {
  return sanitizeHtml(html, {
    // Pagefind wraps matches in <mark>; nothing else is needed for display.
    allowedTags: ["mark"],
    // No attributes at all, including on <mark> itself.
    allowedAttributes: {},
    // Drop disallowed tags instead of escaping them into visible text.
    disallowedTagsMode: "discard",
  });
}

/** Format a pl-* tag name into a human-readable label */
Expand Down
52 changes: 52 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading