Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions application/tests/harvester_test/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
tests for Module A configuration layer (empty for now)
"""
17 changes: 17 additions & 0 deletions application/tests/harvester_test/fixtures/invalid_chunk_size.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fixture: every field is valid EXCEPT chunking.max_tokens, which violates the
# schema's "greater than zero" constraint. The id is present on purpose so the
# file is rejected because of the chunk size and not because of a missing id.
repositories:
  - id: owasp-asvs
    type: github

    owner: OWASP
    repo: ASVS

    paths:
      include:
        - "4.0/en/**/*.md"

    chunking:
      strategy: markdown
      max_tokens: 0

    polling:
      mode: incremental
      interval_minutes: 60
17 changes: 17 additions & 0 deletions application/tests/harvester_test/fixtures/invalid_missing_id.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
repositories:
- type: github

owner: OWASP
repo: ASVS

paths:
include:
- "4.0/en/**/*.md"

chunking:
strategy: markdown
max_tokens: 1200

polling:
mode: incremental
interval_minutes: 60
4 changes: 4 additions & 0 deletions application/tests/harvester_test/fixtures/invalid_yaml.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
repositories:
- repository_id: owasp-top10
source:
type github
16 changes: 16 additions & 0 deletions application/tests/harvester_test/fixtures/valid_repos.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
repositories:
- id: owasp-asvs
type: github
owner: OWASP
repo: ASVS
paths:
include:
- "4.0/en/**/*.md"

chunking:
strategy: markdown
max_tokens: 1200

polling:
mode: incremental
interval_minutes: 60
45 changes: 45 additions & 0 deletions application/tests/harvester_test/test_config_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from pathlib import Path
import pytest
from application.utils.harvester.config_loader import (
ConfigLoaderError,
load_repo_config,
)

FIXTURES_DIR = Path(__file__).parent / "fixtures"


def test_load_valid_config():
    """The valid fixture loads into exactly one repository with the expected identity."""
    loaded = load_repo_config(FIXTURES_DIR / "valid_repos.yaml")

    assert len(loaded.repositories) == 1

    first = loaded.repositories[0]

    assert first.id == "owasp-asvs"
    assert first.owner == "OWASP"
    assert first.repo == "ASVS"


def test_missing_repository_id():
    """Loading the invalid_missing_id fixture raises ConfigLoaderError."""
    fixture = FIXTURES_DIR / "invalid_missing_id.yaml"
    with pytest.raises(ConfigLoaderError):
        load_repo_config(fixture)


def test_invalid_chunk_size():
    """Loading the invalid_chunk_size fixture raises ConfigLoaderError."""
    fixture = FIXTURES_DIR / "invalid_chunk_size.yaml"
    with pytest.raises(ConfigLoaderError):
        load_repo_config(fixture)


def test_invalid_yaml_syntax():
    """Loading the invalid_yaml fixture raises ConfigLoaderError."""
    fixture = FIXTURES_DIR / "invalid_yaml.yaml"
    with pytest.raises(ConfigLoaderError):
        load_repo_config(fixture)


def test_missing_config_file():
    """A path that does not exist surfaces as FileNotFoundError."""
    absent = "does_not_exist.yaml"
    with pytest.raises(FileNotFoundError):
        load_repo_config(absent)
22 changes: 22 additions & 0 deletions application/utils/harvester/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from .config_loader import (
ConfigLoaderError,
load_repo_config,
)

from .schemas import (
ChunkingConfig,
PathRules,
PollingConfig,
RepositoryConfig,
ReposFile,
)

__all__ = [
"ChunkingConfig",
"ConfigLoaderError",
"PathRules",
"PollingConfig",
"RepositoryConfig",
"ReposFile",
"load_repo_config",
]
26 changes: 26 additions & 0 deletions application/utils/harvester/config_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from pathlib import Path
import yaml
from pydantic import ValidationError
from .schemas import ReposFile


class ConfigLoaderError(Exception):
    """Raised when loading the repository configuration fails."""


def load_repo_config(path: str | Path) -> ReposFile:
    """Load a repository configuration file and validate it as a ``ReposFile``.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        The validated ``ReposFile`` model.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
        ConfigLoaderError: If the path exists but cannot be read as UTF-8 text
            (for example it is a directory), the content is not valid YAML,
            or the parsed data does not satisfy the schema.
    """
    config_path = Path(path)

    # Open directly instead of checking exists() first: the pre-check was
    # race-prone and let directories / unreadable files escape as raw OSError
    # subclasses that callers of this module do not expect.
    try:
        with config_path.open("r", encoding="utf-8") as file:
            raw_config = yaml.safe_load(file)
    except (FileNotFoundError, NotADirectoryError) as exc:
        # Same exception type and message as before for a missing file.
        raise FileNotFoundError(f"Configuration file not found: {config_path}") from exc
    except (OSError, UnicodeDecodeError) as exc:
        raise ConfigLoaderError(f"Unable to read {config_path}") from exc
    except yaml.YAMLError as exc:
        raise ConfigLoaderError(f"Invalid YAML syntax in {config_path}") from exc

    # An empty file parses to None, which fails validation and is therefore
    # reported as a ConfigLoaderError as well.
    try:
        return ReposFile.model_validate(raw_config)
    except ValidationError as exc:
        raise ConfigLoaderError(f"schema validation failed for {config_path}") from exc
43 changes: 43 additions & 0 deletions application/utils/harvester/repos.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
repositories:
- id: owasp-asvs
type: github
enabled: true
owner: OWASP
repo: ASVS
branch: master
paths:
include:
- "4.0/en/**/*.md"

exclude:
- "**/archive/**"

chunking:
strategy: markdown
max_tokens: 1200
overlap_tokens: 100

polling:
mode: incremental
interval_minutes: 60

- id: owasp-cheatsheets
type: github
enabled: true

owner: OWASP
repo: CheatSheetSeries
branch: master

paths:
include:
- "cheatsheets/**/*.md"

chunking:
strategy: markdown
max_tokens: 1000
overlap_tokens: 100

polling:
mode: incremental
interval_minutes: 120
102 changes: 102 additions & 0 deletions application/utils/harvester/schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# core schema models:
# PathRules
# ChunkingConfig
# PollingConfig
# RepositoryConfig
# ReposFile

from typing import Literal

from pydantic import BaseModel, ConfigDict, Field, model_validator


# Controls which repository paths are included in, and excluded from, ingestion.
class PathRules(BaseModel):
    model_config = ConfigDict(extra="forbid")

    # Required; at least one include pattern must be given.
    include: list[str] = Field(
        description="Glob patterns to include during ingestions",
        min_length=1,
    )
    # Optional; defaults to an empty list.
    exclude: list[str] = Field(
        description="Glob patterns to exclude during ingestions",
        default_factory=list,
    )


# Defines how harvested data is split into chunks before it is handed to
# downstream processing.
class ChunkingConfig(BaseModel):
    model_config = ConfigDict(extra="forbid")

    strategy: Literal["markdown", "plaintext"] = Field(
        ...,
        description="Chunking strategy used for text segmentation",
    )

    max_tokens: int = Field(..., gt=0, description="max token size per chunk")

    # Zero is allowed (no overlap). The upper bound depends on max_tokens, so
    # it is enforced by the model-level validator below.
    overlap_tokens: int = Field(
        default=100,
        ge=0,
        description="token overlap between adjacent chunks",
    )

    @model_validator(mode="after")
    def _overlap_smaller_than_chunk(self) -> "ChunkingConfig":
        # An overlap as large as the chunk itself would leave a chunker with a
        # window that never advances, so reject it at configuration time.
        if self.overlap_tokens >= self.max_tokens:
            raise ValueError("overlap_tokens must be smaller than max_tokens")
        return self


# Defines how a repository is synchronized: the sync mode and how often to poll.
class PollingConfig(BaseModel):
    model_config = ConfigDict(extra="forbid")

    # Required; one of the two supported sync modes.
    mode: Literal["full", "incremental"] = Field(
        description="repository sync mode",
    )

    # Required; must be strictly positive.
    interval_minutes: int = Field(
        description="polling interval in minutes",
        gt=0,
    )


# Ingestion configuration for a single repository entry.
class RepositoryConfig(BaseModel):
    model_config = ConfigDict(extra="forbid")

    # --- identity -----------------------------------------------------------
    id: str = Field(
        description="unique repository identifier.",
        min_length=1,
    )
    type: Literal["github"] = Field(
        description="repository source type.",
    )
    enabled: bool = Field(
        description="whether ingestion is enabled for this repository.",
        default=True,
    )

    # --- location -----------------------------------------------------------
    owner: str = Field(
        description="repository organization.",
        min_length=1,
    )
    repo: str = Field(
        description="repository name.",
        min_length=1,
    )
    branch: str = Field(
        description="Repository branch to ingest.",
        default="main",
        min_length=1,
    )

    # --- nested sections (all required) -------------------------------------
    paths: PathRules
    chunking: ChunkingConfig
    polling: PollingConfig


# Root object of the configuration loaded from repos.yaml.
class ReposFile(BaseModel):
    model_config = ConfigDict(extra="forbid")

    # Required; the list must contain at least one repository entry.
    repositories: list[RepositoryConfig] = Field(
        description="List of repositories configured for ingestion.",
        min_length=1,
    )
Loading