Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
3,000 changes: 3,000 additions & 0 deletions application/tests/noise_filter/fixtures/candidate_commits.json

Large diffs are not rendered by default.

3,400 changes: 3,400 additions & 0 deletions application/tests/noise_filter/fixtures/labeled_data.json

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions application/tests/noise_filter/fixtures/module_a_mock.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:0", "artifact_id": "art:OWASP/ASVS:4.0/en/0x12-V3-Authentication.md", "pipeline_run_id": "20260201T020000Z", "text": "Authentication should use MFA", "span": {"index": 0, "total": 3, "heading_path": ["Authentication", "JWT"], "start_char_idx": 0, "end_char_idx": 98, "start_line": 10, "end_line": 12}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc123", "committed_at": "2026-02-01T01:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x12-V3-Authentication.md", "path": "4.0/en/0x12-V3-Authentication.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:1", "artifact_id": "art:OWASP/ASVS:4.0/en/0x13-V4-Access-Control.md", "pipeline_run_id": "20260201T020000Z", "text": "Access control should enforce principle of least privilege", "span": {"index": 1, "total": 5, "heading_path": ["Access Control", "Authorization"], "start_char_idx": 120, "end_char_idx": 198, "start_line": 15, "end_line": 18}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc124", "committed_at": "2026-02-01T02:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x13-V4-Access-Control.md", "path": "4.0/en/0x13-V4-Access-Control.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:2", "artifact_id": "art:OWASP/ASVS:4.0/en/0x14-V5-Validation.md", "pipeline_run_id": "20260201T020000Z", "text": "Input validation must be performed on all user-supplied data", "span": {"index": 2, "total": 4, "heading_path": ["Input Validation", "Server-Side Validation"], "start_char_idx": 200, "end_char_idx": 276, "start_line": 22, "end_line": 25}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc125", "committed_at": "2026-02-01T03:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x14-V5-Validation.md", "path": "4.0/en/0x14-V5-Validation.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:3", "artifact_id": "art:OWASP/ASVS:4.0/en/0x15-V6-Encoding.md", "pipeline_run_id": "20260201T020000Z", "text": "Output encoding should be context-aware and properly applied", "span": {"index": 3, "total": 3, "heading_path": ["Output Encoding", "HTML Encoding"], "start_char_idx": 300, "end_char_idx": 375, "start_line": 30, "end_line": 33}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc126", "committed_at": "2026-02-01T04:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x15-V6-Encoding.md", "path": "4.0/en/0x15-V6-Encoding.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:4", "artifact_id": "art:OWASP/ASVS:4.0/en/0x16-V7-Cryptography.md", "pipeline_run_id": "20260201T020000Z", "text": "Use only strong cryptographic algorithms and adequate key lengths", "span": {"index": 4, "total": 6, "heading_path": ["Cryptography", "Algorithm Selection"], "start_char_idx": 400, "end_char_idx": 487, "start_line": 40, "end_line": 44}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc127", "committed_at": "2026-02-01T05:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x16-V7-Cryptography.md", "path": "4.0/en/0x16-V7-Cryptography.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:5", "artifact_id": "art:OWASP/ASVS:4.0/en/0x17-V8-Errors.md", "pipeline_run_id": "20260201T020000Z", "text": "Error handling should not expose sensitive information", "span": {"index": 5, "total": 4, "heading_path": ["Error Handling", "Information Disclosure"], "start_char_idx": 500, "end_char_idx": 568, "start_line": 50, "end_line": 53}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc128", "committed_at": "2026-02-01T06:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x17-V8-Errors.md", "path": "4.0/en/0x17-V8-Errors.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:6", "artifact_id": "art:OWASP/ASVS:4.0/en/0x18-V9-Communications.md", "pipeline_run_id": "20260201T020000Z", "text": "All communications must be encrypted using TLS 1.2 or higher", "span": {"index": 6, "total": 5, "heading_path": ["Communications Security", "Transport Layer"], "start_char_idx": 600, "end_char_idx": 682, "start_line": 60, "end_line": 64}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc129", "committed_at": "2026-02-01T07:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x18-V9-Communications.md", "path": "4.0/en/0x18-V9-Communications.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:7", "artifact_id": "art:OWASP/ASVS:4.0/en/0x19-V10-Malicious.md", "pipeline_run_id": "20260201T020000Z", "text": "Implement protections against malicious code execution", "span": {"index": 7, "total": 3, "heading_path": ["Malicious Code", "Code Injection"], "start_char_idx": 700, "end_char_idx": 768, "start_line": 70, "end_line": 73}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc130", "committed_at": "2026-02-01T08:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x19-V10-Malicious.md", "path": "4.0/en/0x19-V10-Malicious.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:8", "artifact_id": "art:OWASP/ASVS:4.0/en/0x20-V11-Logic.md", "pipeline_run_id": "20260201T020000Z", "text": "Business logic flaws should be identified through security testing", "span": {"index": 8, "total": 4, "heading_path": ["Business Logic", "Workflow Validation"], "start_char_idx": 800, "end_char_idx": 885, "start_line": 80, "end_line": 84}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc131", "committed_at": "2026-02-01T09:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x20-V11-Logic.md", "path": "4.0/en/0x20-V11-Logic.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:9", "artifact_id": "art:OWASP/ASVS:4.0/en/0x21-V12-Files.md", "pipeline_run_id": "20260201T020000Z", "text": "File uploads should be validated and stored securely", "span": {"index": 9, "total": 3, "heading_path": ["File Upload", "Storage Security"], "start_char_idx": 900, "end_char_idx": 967, "start_line": 90, "end_line": 93}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc132", "committed_at": "2026-02-01T10:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x21-V12-Files.md", "path": "4.0/en/0x21-V12-Files.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:10", "artifact_id": "art:OWASP/ASVS:4.0/en/0x22-V13-API.md", "pipeline_run_id": "20260201T020000Z", "text": "API endpoints must enforce authentication and rate limiting", "span": {"index": 10, "total": 4, "heading_path": ["API Security", "Authentication"], "start_char_idx": 1000, "end_char_idx": 1084, "start_line": 100, "end_line": 104}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc133", "committed_at": "2026-02-01T11:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x22-V13-API.md", "path": "4.0/en/0x22-V13-API.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:11", "artifact_id": "art:OWASP/ASVS:4.0/en/0x23-V14-Configuration.md", "pipeline_run_id": "20260201T020000Z", "text": "Configuration management should follow security best practices", "span": {"index": 11, "total": 5, "heading_path": ["Configuration", "Secrets Management"], "start_char_idx": 1100, "end_char_idx": 1181, "start_line": 110, "end_line": 115}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc134", "committed_at": "2026-02-01T12:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x23-V14-Configuration.md", "path": "4.0/en/0x23-V14-Configuration.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:12", "artifact_id": "art:OWASP/ASVS:4.0/en/0x24-V15-Authentication-Advanced.md", "pipeline_run_id": "20260201T020000Z", "text": "Password policies should enforce complexity and history requirements", "span": {"index": 12, "total": 4, "heading_path": ["Advanced Authentication", "Password Management"], "start_char_idx": 1200, "end_char_idx": 1289, "start_line": 120, "end_line": 124}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc135", "committed_at": "2026-02-01T13:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x24-V15-Authentication-Advanced.md", "path": "4.0/en/0x24-V15-Authentication-Advanced.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:13", "artifact_id": "art:OWASP/ASVS:4.0/en/0x25-V16-CSRF.md", "pipeline_run_id": "20260201T020000Z", "text": "CSRF tokens should be generated and validated for all state-changing requests", "span": {"index": 13, "total": 3, "heading_path": ["CSRF Protection", "Token Implementation"], "start_char_idx": 1300, "end_char_idx": 1394, "start_line": 130, "end_line": 133}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc136", "committed_at": "2026-02-01T14:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x25-V16-CSRF.md", "path": "4.0/en/0x25-V16-CSRF.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:14", "artifact_id": "art:OWASP/ASVS:4.0/en/0x26-V17-Session.md", "pipeline_run_id": "20260201T020000Z", "text": "Session management should use secure session tokens and HttpOnly cookies", "span": {"index": 14, "total": 4, "heading_path": ["Session Management", "Cookie Security"], "start_char_idx": 1400, "end_char_idx": 1489, "start_line": 140, "end_line": 144}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc137", "committed_at": "2026-02-01T15:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x26-V17-Session.md", "path": "4.0/en/0x26-V17-Session.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:15", "artifact_id": "art:OWASP/ASVS:4.0/en/0x27-V18-SQL-Injection.md", "pipeline_run_id": "20260201T020000Z", "text": "Parameterized queries must be used to prevent SQL injection attacks", "span": {"index": 15, "total": 3, "heading_path": ["SQL Injection Prevention", "Query Parameterization"], "start_char_idx": 1500, "end_char_idx": 1583, "start_line": 150, "end_line": 153}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc138", "committed_at": "2026-02-01T16:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x27-V18-SQL-Injection.md", "path": "4.0/en/0x27-V18-SQL-Injection.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:16", "artifact_id": "art:OWASP/ASVS:4.0/en/0x28-V19-Deserialization.md", "pipeline_run_id": "20260201T020000Z", "text": "Deserialization should use safe methods and validate all input data", "span": {"index": 16, "total": 3, "heading_path": ["Deserialization", "Object Deserialization"], "start_char_idx": 1600, "end_char_idx": 1680, "start_line": 160, "end_line": 163}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc139", "committed_at": "2026-02-01T17:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x28-V19-Deserialization.md", "path": "4.0/en/0x28-V19-Deserialization.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:17", "artifact_id": "art:OWASP/ASVS:4.0/en/0x29-V20-Dependency.md", "pipeline_run_id": "20260201T020000Z", "text": "Dependencies should be kept up to date and regularly scanned for vulnerabilities", "span": {"index": 17, "total": 4, "heading_path": ["Dependency Management", "Vulnerability Scanning"], "start_char_idx": 1700, "end_char_idx": 1795, "start_line": 170, "end_line": 174}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc140", "committed_at": "2026-02-01T18:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x29-V20-Dependency.md", "path": "4.0/en/0x29-V20-Dependency.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:18", "artifact_id": "art:OWASP/ASVS:4.0/en/0x30-V21-Logging.md", "pipeline_run_id": "20260201T020000Z", "text": "Security events must be logged and monitored for suspicious activity", "span": {"index": 18, "total": 4, "heading_path": ["Logging and Monitoring", "Event Logging"], "start_char_idx": 1800, "end_char_idx": 1885, "start_line": 180, "end_line": 184}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc141", "committed_at": "2026-02-01T19:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x30-V21-Logging.md", "path": "4.0/en/0x30-V21-Logging.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:19", "artifact_id": "art:OWASP/ASVS:4.0/en/0x31-V22-Mobile.md", "pipeline_run_id": "20260201T020000Z", "text": "Mobile applications must implement platform-specific security controls", "span": {"index": 19, "total": 4, "heading_path": ["Mobile Security", "Platform Controls"], "start_char_idx": 1900, "end_char_idx": 1982, "start_line": 190, "end_line": 194}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc142", "committed_at": "2026-02-01T20:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x31-V22-Mobile.md", "path": "4.0/en/0x31-V22-Mobile.md"}}
Loading
Loading