Skip to content

Commit 77c5afe

Browse files
authored
Merge pull request #37 from OPPIDA/refactor/typing
2 parents e6a0455 + 87f2768 commit 77c5afe

20 files changed

Lines changed: 473 additions & 453 deletions

File tree

codesectools/datasets/BenchmarkJava/dataset.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,15 @@ class TestCode(File):
2525
def __init__(
2626
self,
2727
filepath: Path,
28-
content: str | bytes,
28+
content: bytes,
2929
cwes: list[CWE],
3030
has_vuln: bool,
3131
) -> None:
3232
"""Initialize a TestCode instance.
3333
3434
Args:
3535
filepath: The path to the file.
36-
content: The content of the file, as a string or bytes.
36+
content: The content of the file, as bytes.
3737
cwes: A list of CWEs associated with the file.
3838
has_vuln: A boolean indicating if the vulnerability is real or a false positive test case.
3939
@@ -69,7 +69,7 @@ class BenchmarkJava(PrebuiltFileDataset):
6969
prebuilt_expected = (Path("target/classes/org/owasp/benchmark/testcode"), "*.class")
7070
artifacts_arg = "."
7171

72-
def __init__(self, lang: None | str = None) -> None:
72+
def __init__(self, lang: str = "") -> None:
7373
"""Initialize the BenchmarkJava dataset.
7474
7575
Args:
@@ -79,7 +79,7 @@ def __init__(self, lang: None | str = None) -> None:
7979
"""
8080
super().__init__(lang)
8181

82-
def __eq__(self, other: str | Self) -> bool:
82+
def __eq__(self, other: object) -> bool:
8383
"""Compare this dataset with another object for equality.
8484
8585
Args:
@@ -121,7 +121,7 @@ def download_files(self: Self, test: bool = False) -> None:
121121
for to_delete_testcode in random.sample(testcodes, k=len(testcodes) - 50):
122122
to_delete_testcode.unlink()
123123

124-
def load_dataset(self) -> list[TestCode]:
124+
def load_dataset(self) -> list[File]:
125125
"""Load the BenchmarkJava dataset from its source files.
126126
127127
Reads a CSV file for vulnerability metadata and the corresponding Java
@@ -149,7 +149,7 @@ def load_dataset(self) -> list[TestCode]:
149149
filename = f"{row[0]}.java"
150150
filepath = testcode_dir / filename
151151
if filepath.is_file():
152-
content = filepath.read_text()
152+
content = filepath.read_bytes()
153153
cwes = [CWEs.from_id(int(row[3]))]
154154
has_vuln = True if row[2] == "true" else False
155155
files.append(

codesectools/datasets/CVEfixes/dataset.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import csv
99
from typing import Self
1010

11-
from codesectools.datasets.core.dataset import GitRepo, GitRepoDataset
11+
from codesectools.datasets.core.dataset import File, GitRepo, GitRepoDataset
1212
from codesectools.shared.cwe import CWEs
1313
from codesectools.utils import DATA_DIR
1414

@@ -32,14 +32,14 @@ class CVEfixes(GitRepoDataset):
3232
license = "CC BY 4.0"
3333
license_url = "https://creativecommons.org/licenses/by/4.0/"
3434

35-
def __init__(self, lang: str | None = None) -> None:
35+
def __init__(self, lang: str = "") -> None:
3636
"""Initialize the CVEfixes dataset.
3737
3838
Args:
3939
lang: The programming language of the dataset to load.
4040
4141
"""
42-
self.max_repo_size = 100e6
42+
self.max_repo_size = 100 * 10**6
4343
super().__init__(lang)
4444

4545
def download_files(self: Self, test: bool = False) -> None:
@@ -53,7 +53,7 @@ def download_files(self: Self, test: bool = False) -> None:
5353

5454
def load_dataset(
5555
self,
56-
) -> list[GitRepo]:
56+
) -> list[File]:
5757
"""Load the CVEfixes dataset from its source CSV file.
5858
5959
Parses a CSV file containing information about CVEs, repositories,

codesectools/datasets/JulietTestSuiteC/dataset.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,15 @@ class TestCode(File):
2424
def __init__(
2525
self,
2626
filepath: Path,
27-
content: str | bytes,
27+
content: bytes,
2828
cwes: list[CWE],
2929
has_vuln: bool,
3030
) -> None:
3131
"""Initialize a TestCode instance.
3232
3333
Args:
3434
filepath: The path to the file.
35-
content: The content of the file, as a string or bytes.
35+
content: The content of the file, as bytes.
3636
cwes: A list of CWEs associated with the file.
3737
has_vuln: A boolean indicating if the vulnerability is real or a false positive test case.
3838
@@ -58,7 +58,7 @@ class JulietTestSuiteC(PrebuiltFileDataset):
5858
prebuilt_expected = (Path("."), "compile_commands.json")
5959
artifacts_arg = "compile_commands.json"
6060

61-
def __init__(self, lang: None | str = None) -> None:
61+
def __init__(self, lang: str = "") -> None:
6262
"""Initialize the JulietTestSuiteC dataset.
6363
6464
Args:
@@ -68,7 +68,7 @@ def __init__(self, lang: None | str = None) -> None:
6868
"""
6969
super().__init__(lang)
7070

71-
def __eq__(self, other: str | Self) -> bool:
71+
def __eq__(self, other: object) -> bool:
7272
"""Compare this dataset with another object for equality.
7373
7474
Args:
@@ -118,7 +118,7 @@ def download_files(self: Self, test: bool = False) -> None:
118118
if not cwe_dir.name.startswith("CWE835"):
119119
shutil.rmtree(cwe_dir)
120120

121-
def load_dataset(self) -> list[TestCode]:
121+
def load_dataset(self) -> list[File]:
122122
"""Load the JulietTestSuiteC dataset from the source files.
123123
124124
Parses the `manifest.xml` file to identify vulnerabilities in the C/C++

codesectools/datasets/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"""
1313

1414
import importlib
15-
from typing import Any
15+
from typing import Any, Type
1616

1717
from codesectools.datasets.core.dataset import Dataset
1818
from codesectools.utils import DATASETS_DIR
@@ -37,7 +37,7 @@ def _load(self) -> None:
3737
self.dataset_module = importlib.import_module(
3838
f"codesectools.datasets.{self.name}.dataset"
3939
)
40-
self.dataset: Dataset = getattr(self.dataset_module, self.name)
40+
self.dataset: Type[Dataset] = getattr(self.dataset_module, self.name)
4141

4242
self.loaded = True
4343

codesectools/datasets/core/dataset.py

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@
1919
from codesectools.utils import USER_CACHE_DIR
2020

2121
if TYPE_CHECKING:
22-
from typing import Self
23-
2422
from codesectools.sasts.core.parser import AnalysisResult, Defect
2523
from codesectools.shared.cwe import CWE
2624

@@ -44,7 +42,7 @@ class Dataset(ABC):
4442
license: str
4543
license_url: str
4644

47-
def __init__(self, lang: str | None = None) -> None:
45+
def __init__(self, lang: str = "") -> None:
4846
"""Initialize the Dataset instance.
4947
5048
Set up paths and load the dataset if a language is specified.
@@ -168,6 +166,7 @@ class PrebuiltDatasetMixin:
168166
169167
"""
170168

169+
name: str
171170
build_command: str
172171
prebuilt_expected: tuple[Path, str]
173172
artifacts_arg: str
@@ -223,14 +222,13 @@ class File(DatasetUnit):
223222
"""
224223

225224
def __init__(
226-
self, filepath: Path, content: str | bytes, cwes: list[CWE], has_vuln: bool
225+
self, filepath: Path, content: bytes, cwes: list[CWE], has_vuln: bool
227226
) -> None:
228227
"""Initialize a File instance.
229228
230229
Args:
231230
filepath: The relative path of the file.
232-
content: The content of the file, as a string or bytes. It will be
233-
converted to bytes if provided as a string.
231+
content: The content of the file, as bytes.
234232
cwes: A list of CWEs associated with the file.
235233
has_vuln: True if the vulnerability is real, False if it's
236234
intended to be a false positive test case.
@@ -242,9 +240,6 @@ def __init__(
242240
self.cwes = cwes
243241
self.has_vuln = has_vuln
244242

245-
if isinstance(content, str):
246-
self.content = content.encode()
247-
248243
def __repr__(self) -> str:
249244
"""Return a developer-friendly string representation of the File.
250245
@@ -257,7 +252,7 @@ def __repr__(self) -> str:
257252
cwes: \t{self.cwes}
258253
)"""
259254

260-
def __eq__(self, other: str | Path | Self) -> bool:
255+
def __eq__(self, other: object) -> bool:
261256
"""Compare this File with another object for equality based on filepath.
262257
263258
Args:
@@ -515,7 +510,7 @@ def __repr__(self) -> str:
515510
files: \t{self.files}
516511
)"""
517512

518-
def __eq__(self, other: str | Self) -> bool:
513+
def __eq__(self, other: object) -> bool:
519514
"""Compare this GitRepo with another object for equality based on name.
520515
521516
Args:
@@ -589,7 +584,7 @@ def validate(self, analysis_results: list[AnalysisResult]) -> GitRepoDatasetData
589584
validated_repos = []
590585

591586
for analysis_result in analysis_results:
592-
repo = self.repos[self.repos.index(analysis_result.name)]
587+
repo = self.repos[self.repos.index(analysis_result.name)] # ty:ignore[invalid-argument-type]
593588

594589
# 1. Process reported defects to get unique (file, cwe) pairs
595590
# and keep one original Defect object for each to retain metadata.
@@ -663,8 +658,7 @@ class GitRepoDatasetData(BenchmarkData):
663658
664659
Attributes:
665660
dataset (GitRepoDataset): The dataset used for the benchmark.
666-
validated_repos (list[dict]): A list of dictionaries, each containing
667-
the validation results for a single repository.
661+
validated_repos (list[dict]): A list of validation results per repository.
668662
total_repo_number (int): The total number of repositories in the dataset.
669663
defect_numbers (int): The total number of defects found across all repos.
670664

codesectools/sasts/core/parser.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from abc import ABC, abstractmethod
1010
from pathlib import Path
11-
from typing import Any, Self
11+
from typing import Self
1212

1313
from codesectools.shared.cwe import CWE
1414

@@ -26,7 +26,7 @@ class Defect:
2626
cwe (CWE): The CWE associated with the defect.
2727
message (str): The description of the defect.
2828
location (tuple[int, int] | None): A tuple with the start and end line numbers of the defect.
29-
data (tuple[Any]): Raw data from the SAST tool for this defect.
29+
data (dict): Raw data from the SAST tool for this defect.
3030
3131
"""
3232

@@ -40,7 +40,7 @@ def __init__(
4040
cwe: CWE,
4141
message: str,
4242
lines: list[int] | None,
43-
data: tuple[Any],
43+
data: dict,
4444
) -> None:
4545
"""Initialize a Defect instance.
4646
@@ -90,7 +90,7 @@ class AnalysisResult(ABC):
9090
defects (list[Defect]): A list of `Defect` objects found.
9191
time (float): The duration of the analysis in seconds.
9292
loc (int): The number of lines of code analyzed.
93-
data (tuple[Any]): Raw data from the SAST tool's output.
93+
data (tuple): Raw data from the SAST tool's output.
9494
9595
"""
9696

@@ -103,7 +103,7 @@ def __init__(
103103
defects: list[Defect],
104104
time: float,
105105
loc: int,
106-
data: tuple[Any],
106+
data: tuple,
107107
) -> None:
108108
"""Initialize an AnalysisResult instance.
109109
@@ -161,7 +161,7 @@ def load_from_output_dir(cls, output_dir: Path) -> Self:
161161
pass
162162

163163
@classmethod
164-
def load_from_output_dirs(cls, output_dirs: list[str]) -> list[Self]:
164+
def load_from_output_dirs(cls, output_dirs: list[Path]) -> list[Self]:
165165
"""Load and parse analysis results from multiple directories.
166166
167167
Args:

codesectools/sasts/core/sast/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ class PrebuiltSAST(SAST):
342342

343343
def analyze_files(
344344
self,
345-
dataset: PrebuiltFileDataset,
345+
dataset: FileDataset,
346346
overwrite: bool = False,
347347
testing: bool = False,
348348
) -> None:
@@ -359,6 +359,8 @@ def analyze_files(
359359
"""
360360
from rich.panel import Panel
361361

362+
assert isinstance(dataset, PrebuiltFileDataset)
363+
362364
if not dataset.is_built():
363365
prebuilt_dir, prebuilt_glob = dataset.prebuilt_expected
364366
panel = Panel(

codesectools/sasts/core/sast/requirements.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,27 @@
11
"""Define requirements for SAST tools and their fulfillment status."""
22

3+
from __future__ import annotations
4+
35
import shutil
46
from abc import ABC, abstractmethod
5-
from pathlib import Path
6-
from typing import Any, Literal, Self
7+
from typing import TYPE_CHECKING, Any, Literal
78

89
import typer
910
from rich import print
1011

1112
from codesectools.utils import USER_CACHE_DIR, USER_CONFIG_DIR
1213

14+
if TYPE_CHECKING:
15+
from pathlib import Path
16+
1317

1418
class SASTRequirement(ABC):
1519
"""Represent a single requirement for a SAST tool to be functional."""
1620

1721
def __init__(
1822
self,
1923
name: str,
20-
depends_on: list[Self] | None = None,
24+
depends_on: list[SASTRequirement] | None = None,
2125
instruction: str | None = None,
2226
url: str | None = None,
2327
doc: bool = False,
@@ -94,6 +98,7 @@ class Config(SASTRequirement):
9498
def __init__(
9599
self,
96100
name: str,
101+
sast_name: str,
97102
depends_on: list[SASTRequirement] | None = None,
98103
instruction: str | None = None,
99104
url: str | None = None,
@@ -103,19 +108,21 @@ def __init__(
103108
104109
Args:
105110
name: The name of the requirement.
111+
sast_name: The name of the SAST tool this config belongs to.
106112
depends_on: A list of other requirements that must be fulfilled first.
107113
instruction: A short instruction on how to download the requirement.
108114
url: A URL for more detailed instructions.
109115
doc: A flag indicating if the instruction is available in the documentation.
110116
111117
"""
118+
self.sast_name = sast_name
112119
super().__init__(
113120
name=name, depends_on=depends_on, instruction=instruction, url=url, doc=doc
114121
)
115122

116-
def is_fulfilled(self, sast_name: str, **kwargs: Any) -> bool:
123+
def is_fulfilled(self, **kwargs: Any) -> bool:
117124
"""Check if the configuration file exists for the SAST tool this config belongs to."""
118-
return (USER_CONFIG_DIR / sast_name / self.name).is_file()
125+
return (USER_CONFIG_DIR / self.sast_name / self.name).is_file()
119126

120127

121128
class Binary(SASTRequirement):

0 commit comments

Comments
 (0)