Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions application/defs/cheatsheet_defs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from dataclasses import dataclass, field
from typing import Dict, List

# Maximum number of characters kept in CheatsheetRecord.summary.
SUMMARY_MAX_LENGTH = 500


@dataclass
class CheatsheetRecord:
    """Validated description of a single cheatsheet.

    ``source`` is fixed to ``"owasp_cheatsheets"`` and is not accepted by the
    constructor. On construction ``summary`` is stripped of surrounding
    whitespace and truncated to ``SUMMARY_MAX_LENGTH`` characters, after which
    every field is validated.

    Raises:
        ValueError: If ``source_id``, ``title``, ``hyperlink``, ``summary`` or
            ``raw_markdown_path`` is not a non-empty string; if ``headings``
            or ``category_hints`` is not a list of strings; or if
            ``metadata`` is not a dict whose keys and values are strings.
    """

    source: str = field(default="owasp_cheatsheets", init=False)
    source_id: str
    title: str
    hyperlink: str
    summary: str
    headings: List[str]
    raw_markdown_path: str
    category_hints: List[str] = field(default_factory=list)
    metadata: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Normalise ``summary`` and validate the type of every field."""
        # Only normalise a real string. A non-string summary is left untouched
        # so that the validation loop below reports it as a ValueError rather
        # than crashing here with an AttributeError on ``.strip()``.
        if isinstance(self.summary, str):
            self.summary = self.summary.strip()[:SUMMARY_MAX_LENGTH]

        required_str_fields = {
            "source_id": self.source_id,
            "title": self.title,
            "hyperlink": self.hyperlink,
            "summary": self.summary,
            "raw_markdown_path": self.raw_markdown_path,
        }

        list_str_fields = {
            "headings": self.headings,
            "category_hints": self.category_hints,
        }

        # Required fields must be strings with non-whitespace content.
        for field_name, value in required_str_fields.items():
            if not isinstance(value, str) or not value.strip():
                raise ValueError(
                    f"CheatsheetRecord: field '{field_name}' "
                    f"must be a non-empty string, got {value!r}"
                )

        # List fields must be lists (possibly empty) containing only strings.
        for field_name, value in list_str_fields.items():
            if not isinstance(value, list):
                raise ValueError(
                    f"CheatsheetRecord: field '{field_name}' "
                    f"must be a list[str], got {type(value)!r}"
                )

            for item in value:
                if not isinstance(item, str):
                    raise ValueError(
                        f"CheatsheetRecord : value of '{field_name}' must be a string, got {item!r}"
                    )

        # Metadata must be a dict mapping strings to strings.
        if not isinstance(self.metadata, dict):
            raise ValueError(
                "CheatsheetRecord: field 'metadata' must be a dict[str, str]"
            )

        for key, value in self.metadata.items():
            if not isinstance(key, str) or not isinstance(value, str):
                raise ValueError(
                    "CheatsheetRecord: metadata keys and values must be strings, "
                    f"got {key!r}: {value!r}"
                )
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import os
import re
from application.defs.cheatsheet_defs import CheatsheetRecord

# Stored under the "parser_version" key of each extracted record's metadata.
PARSER_VERSION = "v1"
# Stored under the "fallback_used" key of each record's metadata. It is a
# string because record metadata values must be strings.
# NOTE(review): this is a constant, so the metadata always reports "false";
# confirm which fallback it is meant to describe.
FALLBACK_USED = "false"
# Prefix of the generated hyperlink; "<source_id>.html" is appended to it.
CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/"

# A line starting with exactly one "#" followed by whitespace; captures the
# rest of the line as "title".
_TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE)
# A line starting with exactly two "#" followed by whitespace; captures the
# rest of the line as "heading".
_HEADING_RE = re.compile(r"^##\s+(?P<heading>.+)$", re.MULTILINE)
# A line starting with one to six "#" followed by whitespace and some text.
_ANY_HEADING_RE = re.compile(r"^#{1,6}\s+.+$", re.MULTILINE)


def _derive_source_id(source_path: str) -> str:
    """Return the file name of *source_path* with its final extension removed."""
    file_name = os.path.basename(source_path)
    return os.path.splitext(file_name)[0]


def _derive_hyperlink(source_path: str) -> str:
    """Return the URL built from CANONICAL_BASE_URL, the source id and ".html"."""
    page_name = _derive_source_id(source_path) + ".html"
    return CANONICAL_BASE_URL + page_name


def _extract_body_after_heading(markdown: str, heading_match: re.Match) -> str:
    """Return the stripped text that follows a heading.

    The text runs from the end of *heading_match* up to the start of the next
    heading of any level, or to the end of *markdown* when no heading follows.
    """
    body_start = heading_match.end()
    following = _ANY_HEADING_RE.search(markdown, body_start)

    if following is None:
        return markdown[body_start:].strip()
    return markdown[body_start:following.start()].strip()


def _extract_summary(markdown: str) -> str:
    """Return the text to use as the summary for *markdown*.

    The body under the first heading whose text is "introduction"
    (case-insensitive) is preferred. If there is no such heading, or its body
    is empty, the first non-empty body under any heading is returned instead.

    Raises:
        ValueError: If no heading in *markdown* has a non-empty body.
    """
    heading_matches = list(_ANY_HEADING_RE.finditer(markdown))

    # Only the first "introduction" heading is ever considered.
    intro_match = next(
        (
            m
            for m in heading_matches
            if m.group().lstrip("#").strip().lower() == "introduction"
        ),
        None,
    )
    if intro_match is not None:
        intro_body = _extract_body_after_heading(markdown, intro_match)
        if intro_body:
            return intro_body

    # Otherwise take the first heading, in document order, that has a body.
    candidate_bodies = (
        _extract_body_after_heading(markdown, m) for m in heading_matches
    )
    first_body = next((text for text in candidate_bodies if text), None)
    if first_body is not None:
        return first_body

    raise ValueError("_extract_summary: no summary could be extracted from markdown.")


def extract_cheatsheet_record(
    markdown: str,
    source_path: str,
) -> CheatsheetRecord:
    """Build a CheatsheetRecord from a cheatsheet's markdown text.

    Args:
        markdown: Full markdown content of the cheatsheet.
        source_path: Path of the markdown file. It is stored as
            ``raw_markdown_path`` and its base name (without extension) is
            used for ``source_id`` and the hyperlink.

    Returns:
        A record whose title comes from the first ``# `` line, whose headings
        are the texts of all ``## `` lines, and whose metadata holds
        ``parser_version`` and ``fallback_used``.

    Raises:
        ValueError: If *markdown* has no ``# `` title line, if no summary can
            be extracted, or if the resulting record fails validation.
    """
    title_match = _TITLE_RE.search(markdown)
    # search() returns None when no title line exists; fail with a clear
    # ValueError instead of an AttributeError on ``None.group``.
    if title_match is None:
        raise ValueError(
            "extract_cheatsheet_record: no top-level '# ' title found "
            f"in {source_path!r}."
        )
    title = title_match.group("title").strip()

    headings = [m.group("heading").strip() for m in _HEADING_RE.finditer(markdown)]

    summary = _extract_summary(markdown)

    source_id = _derive_source_id(source_path)
    hyperlink = _derive_hyperlink(source_path)

    return CheatsheetRecord(
        source_id=source_id,
        title=title,
        hyperlink=hyperlink,
        summary=summary,
        headings=headings,
        raw_markdown_path=source_path,
        category_hints=[],
        metadata={
            "parser_version": PARSER_VERSION,
            "fallback_used": FALLBACK_USED,
        },
    )