OWASP · Abhijeet2409 · May 31, 2026 · May 31, 2026
diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py
@@ -0,0 +1,72 @@
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+SUMMARY_MAX_LENGTH = 500  # max characters kept in CheatsheetRecord.summary
+
+
+@dataclass
+class CheatsheetRecord:
+    """Validated metadata record for a single OWASP cheat sheet.
+
+    ``source`` is always "owasp_cheatsheets" (not an ``__init__`` argument) and
+    ``summary`` is stored stripped and cut to SUMMARY_MAX_LENGTH characters.
+    Construction raises ValueError when any field fails validation.
+    """
+
+    source: str = field(default="owasp_cheatsheets", init=False)
+    source_id: str
+    title: str
+    hyperlink: str
+    summary: str
+    headings: List[str]
+    raw_markdown_path: str
+    category_hints: List[str] = field(default_factory=list)
+    metadata: Dict[str, str] = field(default_factory=dict)
+
+    def __post_init__(self):
+        """Normalize ``summary``, then validate the type of every field."""
+        # Guard the normalization: a non-string summary (e.g. None) used to
+        # crash here with AttributeError before validation could reject it.
+        if isinstance(self.summary, str):
+            self.summary = self.summary.strip()[:SUMMARY_MAX_LENGTH]
+
+        required_str_fields = {
+            "source_id": self.source_id,
+            "title": self.title,
+            "hyperlink": self.hyperlink,
+            "summary": self.summary,
+            "raw_markdown_path": self.raw_markdown_path,
+        }
+        for field_name, value in required_str_fields.items():
+            if not isinstance(value, str) or not value.strip():
+                raise ValueError(
+                    f"CheatsheetRecord: field '{field_name}' "
+                    f"must be a non-empty string, got {value!r}"
+                )
+
+        list_str_fields = {
+            "headings": self.headings,
+            "category_hints": self.category_hints,
+        }
+        for field_name, value in list_str_fields.items():
+            if not isinstance(value, list):
+                raise ValueError(
+                    f"CheatsheetRecord: field '{field_name}' "
+                    f"must be a list[str], got {type(value)!r}"
+                )
+            for item in value:
+                if not isinstance(item, str):
+                    raise ValueError(
+                        f"CheatsheetRecord : value of '{field_name}' must be a string, got {item!r}"
+                    )
+
+        if not isinstance(self.metadata, dict):
+            raise ValueError(
+                "CheatsheetRecord: field 'metadata' must be a dict[str, str]"
+            )
+        for key, value in self.metadata.items():
+            if not isinstance(key, str) or not isinstance(value, str):
+                raise ValueError(
+                    "CheatsheetRecord: metadata keys and values must be strings, "
+                    f"got {key!r}: {value!r}"
+                )
diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
@@ -0,0 +1,80 @@
+import os
+import re
+from application.defs.cheatsheet_defs import CheatsheetRecord
+
+PARSER_VERSION = "v1"  # stored as metadata["parser_version"]
+FALLBACK_USED = "false"  # stored as metadata["fallback_used"]; a string, not a bool
+CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/"  # URL prefix
+
+_TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE)  # level-1 headings
+_HEADING_RE = re.compile(r"^##\s+(?P<heading>.+)$", re.MULTILINE)  # level-2 headings
+_ANY_HEADING_RE = re.compile(r"^#{1,6}\s+.+$", re.MULTILINE)  # levels 1 through 6
+
+
+def _derive_source_id(source_path: str) -> str:
+    """Return the base file name of ``source_path`` without its last extension."""
+    stem, _extension = os.path.splitext(os.path.basename(source_path))
+    return stem
+
+
+def _derive_hyperlink(source_path: str) -> str:
+    """Return CANONICAL_BASE_URL + the source id of ``source_path`` + ``.html``."""
+    return CANONICAL_BASE_URL + _derive_source_id(source_path) + ".html"
+
+
+def _extract_body_after_heading(markdown: str, heading_match: re.Match) -> str:
+    """Return the stripped text from this heading up to the next heading (or EOF)."""
+    following = _ANY_HEADING_RE.search(markdown, heading_match.end())
+    stop = len(markdown) if following is None else following.start()
+    return markdown[heading_match.end() : stop].strip()
+
+
+def _extract_summary(markdown: str) -> str:
+    """Pick the text used as a cheat sheet's summary.
+
+    The body under the first heading whose text is "introduction" (any level,
+    case-insensitive) wins. If there is no such heading, or its body is empty,
+    the first non-empty body under any heading in document order is used.
+
+    Raises:
+        ValueError: if no heading in ``markdown`` has text beneath it.
+    """
+    heads = list(_ANY_HEADING_RE.finditer(markdown))
+    names = (m.group().lstrip("#").strip().lower() for m in heads)
+    intro = next((m for m, name in zip(heads, names) if name == "introduction"), None)
+    # Try the Introduction first (when present), then every heading from the
+    # top; retrying an empty Introduction in that pass is harmless.
+    for candidate in ([] if intro is None else [intro]) + heads:
+        section_text = _extract_body_after_heading(markdown, candidate)
+        if section_text:
+            return section_text
+    raise ValueError("_extract_summary: no summary could be extracted from markdown.")
+
+
+def extract_cheatsheet_record(
+    markdown: str,
+    source_path: str,
+) -> CheatsheetRecord:
+    """Build a CheatsheetRecord from one cheat sheet's markdown text.
+
+    The title is the first level-1 heading, ``headings`` lists every level-2
+    heading, and the id and hyperlink are derived from ``source_path``.
+    Raises ValueError if there is no level-1 heading or no summary text.
+    """
+    title_match = _TITLE_RE.search(markdown)
+    if title_match is None:
+        # Report the missing title explicitly rather than failing on None.group().
+        raise ValueError(
+            f"extract_cheatsheet_record: no level-1 title found in {source_path!r}"
+        )
+    return CheatsheetRecord(
+        source_id=_derive_source_id(source_path),
+        title=title_match.group("title").strip(),
+        hyperlink=_derive_hyperlink(source_path),
+        summary=_extract_summary(markdown),
+        headings=[m.group("heading").strip() for m in _HEADING_RE.finditer(markdown)],
+        raw_markdown_path=source_path,
+        category_hints=[],
+        # NOTE(review): FALLBACK_USED is constant, so this never reflects a fallback.
+        metadata={"parser_version": PARSER_VERSION, "fallback_used": FALLBACK_USED},
+    )