|
| 1 | +################################################################################ |
| 2 | +# Copyright IBM Corporation 2024 |
| 3 | +# |
| 4 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +# you may not use this file except in compliance with the License. |
| 6 | +# You may obtain a copy of the License at |
| 7 | +# |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +# |
| 10 | +# Unless required by applicable law or agreed to in writing, software |
| 11 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +# See the License for the specific language governing permissions and |
| 14 | +# limitations under the License. |
| 15 | +################################################################################ |
| 16 | + |
| 17 | +"""Content-addressed cache locations for the Python analysis backend. |
| 18 | +
|
| 19 | +Two cache tiers, keyed so the *expensive* artifact (the virtualenv) |
| 20 | +survives source edits: |
| 21 | +
|
| 22 | +- **backend tier** — ``<root>/venvs/<dep_hash>/``, keyed by the |
| 23 | + *dependency manifest* hash. ``codeanalyzer-python`` rebuilds the |
| 24 | + virtualenv only when dependencies change, so this directory makes the |
| 25 | + ~30s ``pip install`` reusable across every source edit. |
| 26 | +
|
| 27 | + Caveat: this directory ALSO holds the CodeQL database |
| 28 | + (``<dir>/.codeanalyzer/.../codeql/<name>-db``). The CodeQL DB is a |
| 29 | + function of the *source*, not the dependencies — ``codeanalyzer-python`` |
| 30 | + manages its own ``.checksum``-based invalidation and rebuilds the DB |
| 31 | + in place on any ``*.py`` change. The dep-hash key does NOT (and |
| 32 | + cannot) make the CodeQL DB survive source edits; it only keeps the |
| 33 | + venv stable. Splitting the DB onto a source-keyed root requires an |
| 34 | + upstream change, since ``codeanalyzer-python`` couples venv and DB |
| 35 | + under one ``cache_dir``. |
| 36 | +
|
| 37 | +- **analysis tier** — ``<root>/cache/<key[:2]>/<key>/analysis.json``, |
| 38 | + keyed by the full composite (backend version, analysis level, CodeQL |
| 39 | + flag, target files, full source-tree hash). This is what makes |
| 40 | + revisiting an exact prior source state cheap — the final |
| 41 | + ``analysis.json`` is returned without rebuilding the DB at all. |
| 42 | +
|
| 43 | +``<root>`` is ``$CLDK_CACHE_DIR`` when set, else ``~/.cldk``. |
| 44 | +""" |
| 45 | + |
| 46 | +from __future__ import annotations |
| 47 | + |
| 48 | +import hashlib |
| 49 | +import os |
| 50 | +from importlib.metadata import PackageNotFoundError, version |
| 51 | +from pathlib import Path |
| 52 | +from typing import Iterable, List |
| 53 | + |
# Directory names that disqualify a ``*.py`` file from the analysis input.
# Consulted by ``_iter_source_files`` when walking the project tree.
_EXCLUDED_DIR_PARTS = {
    # Version-control metadata.
    ".git",
    # Virtual environments and installed third-party packages.
    ".venv",
    "venv",
    "site-packages",
    # Analyzer / cache output directories (presumably written by the
    # backend and by CLDK itself — confirm against the callers).
    ".codeanalyzer",
    ".cldk-cache",
    # Interpreter and tool caches.
    "__pycache__",
    ".mypy_cache",
    ".pytest_cache",
}
| 66 | + |
# Manifest file names, looked up directly under the project root, whose
# bytes feed ``dependency_hash`` (the virtualenv / backend tier key).
# NOTE: the order is part of the hash input — do not reorder entries.
_DEPENDENCY_MANIFESTS = (
    # pip requirements files.
    "requirements.txt",
    "requirements-dev.txt",
    "dev-requirements.txt",
    "test-requirements.txt",
    # Packaging metadata.
    "pyproject.toml",
    "setup.py",
    "setup.cfg",
    # Pipenv.
    "Pipfile",
    "Pipfile.lock",
    # Other lock files.
    "poetry.lock",
    "uv.lock",
)
| 81 | + |
| 82 | + |
def backend_version() -> str:
    """Return the installed ``codeanalyzer-python`` version string.

    Falls back to the literal ``"unknown"`` when the distribution is not
    installed. The value is one component of the analysis cache key, so
    artifacts produced by one backend version are not reused by another.
    """
    try:
        installed = version("codeanalyzer-python")
    except PackageNotFoundError:
        # Distribution metadata is absent; use a fixed placeholder.
        installed = "unknown"
    return installed
| 93 | + |
| 94 | + |
def _iter_source_files(project_dir: Path) -> Iterable[Path]:
    """Yield analysis-relevant ``*.py`` files in deterministic order.

    A file is skipped when any directory *between* ``project_dir`` and the
    file is listed in ``_EXCLUDED_DIR_PARTS``. Components of
    ``project_dir`` itself are deliberately ignored: a project that merely
    lives under a directory called ``venv`` or ``site-packages`` must not
    have its whole tree excluded.

    Args:
        project_dir: Root of the project tree to walk.

    Yields:
        Paths (as produced by ``project_dir.rglob``) of regular ``*.py``
        files, in sorted order.
    """
    for path in sorted(project_dir.rglob("*.py")):
        # Only the directories below the project root decide exclusion;
        # the trailing file name is dropped from the comparison.
        parent_parts = path.relative_to(project_dir).parts[:-1]
        if _EXCLUDED_DIR_PARTS.intersection(parent_parts):
            continue
        # ``rglob`` also matches directories or dangling symlinks named
        # ``*.py``; those have no bytes to hash, so skip them.
        if not path.is_file():
            continue
        yield path
| 101 | + |
| 102 | + |
def tree_hash(project_dir: Path) -> str:
    """SHA256 over every analysis-relevant ``*.py`` file's name and bytes.

    Intended to answer the same "did the source change?" question as
    ``codeanalyzer-python``'s own checksum. Project-relative names are
    folded in so a rename alone busts the cache.

    The digest input is framed unambiguously: each relative name is
    NUL-terminated and each file's content is preceded by its length, so
    bytes cannot migrate between a file's content and the next file's
    name without changing the hash.

    Args:
        project_dir: Root of the project tree to hash.

    Returns:
        Hex-encoded SHA256 digest. A tree with no eligible files yields
        the digest of empty input.
    """
    digest = hashlib.sha256()
    for path in _iter_source_files(project_dir):
        # POSIX separators keep the key identical across platforms;
        # ``surrogateescape`` lets undecodable (non-UTF-8) file names be
        # hashed instead of raising UnicodeEncodeError.
        rel_name = path.relative_to(project_dir).as_posix()
        content = path.read_bytes()
        digest.update(rel_name.encode("utf-8", "surrogateescape"))
        digest.update(b"\0")
        # Length prefix marks where this file's content ends.
        digest.update(len(content).to_bytes(8, "big"))
        digest.update(content)
    return digest.hexdigest()
| 116 | + |
| 117 | + |
def dependency_hash(project_dir: Path) -> str:
    """Hash the dependency manifests found directly under ``project_dir``.

    This is the key for the virtualenv (backend) tier. Only the files
    named in ``_DEPENDENCY_MANIFESTS`` contribute, so editing source code
    leaves the value untouched and the installed environment can be
    reused until a manifest changes.

    Returns:
        Hex-encoded SHA256 digest. When no manifest exists, this is the
        digest of empty input, which is still a stable value.
    """
    hasher = hashlib.sha256()
    for manifest_name in _DEPENDENCY_MANIFESTS:
        candidate = project_dir / manifest_name
        if not candidate.is_file():
            continue
        # Name, NUL separator, then the raw manifest bytes.
        hasher.update(manifest_name.encode() + b"\0" + candidate.read_bytes())
    return hasher.hexdigest()
| 133 | + |
| 134 | + |
def compute_cache_key(
    project_dir: Path,
    analysis_level: str,
    use_codeql: bool,
    target_files: List[str] | None,
) -> str:
    """Build the composite key for the analysis tier.

    The key is the SHA256 of five fields joined by the ASCII unit
    separator (``\\x1f``): backend version, analysis level, CodeQL flag,
    the stripped and sorted target-file list (comma-joined; empty when no
    targets are given), and the source-tree hash.

    Returns:
        Hex-encoded SHA256 digest of the joined fields.
    """
    version_field = backend_version()
    level_field = str(analysis_level)

    if use_codeql:
        codeql_field = "codeql=1"
    else:
        codeql_field = "codeql=0"

    if target_files:
        # Sorting makes the key independent of the caller's ordering.
        cleaned_targets = [entry.strip() for entry in target_files]
        cleaned_targets.sort()
        targets_field = ",".join(cleaned_targets)
    else:
        # ``None`` and an empty list both mean "no explicit targets".
        targets_field = ""

    source_field = tree_hash(project_dir)

    fields = (version_field, level_field, codeql_field, targets_field, source_field)
    payload = "\x1f".join(fields).encode()
    return hashlib.sha256(payload).hexdigest()
| 155 | + |
| 156 | + |
def cache_root() -> Path:
    """Return the cache root directory.

    Uses ``$CLDK_CACHE_DIR`` (with ``~`` expanded) when the variable is
    set to a non-empty value; otherwise ``~/.cldk``.
    """
    configured = os.environ.get("CLDK_CACHE_DIR")
    if not configured:
        # Unset or empty: fall back to the per-user default.
        return Path.home() / ".cldk"
    return Path(configured).expanduser()
| 161 | + |
| 162 | + |
def default_backend_cache_dir(project_dir: Path) -> Path:
    """Return the backend-tier directory for ``project_dir``.

    The location is ``<cache root>/venvs/<dependency hash>``. Because the
    key derives only from the dependency manifests, the directory (and
    the virtualenv kept in it) stays the same across source edits. Per
    the module docstring, the CodeQL DB shares this directory but is
    invalidated by ``codeanalyzer-python`` itself on source changes; the
    dependency-hash key does not prolong the DB's lifetime.
    """
    return cache_root().joinpath("venvs", dependency_hash(project_dir))
| 172 | + |
| 173 | + |
def default_analysis_dir(
    project_dir: Path,
    analysis_level: str,
    use_codeql: bool,
    target_files: List[str] | None,
) -> Path:
    """Return the analysis-tier directory for the given inputs.

    The location is ``<cache root>/cache/<key[:2]>/<key>``, where ``key``
    is the composite cache key; the two-character prefix shards entries
    across subdirectories.
    """
    digest = compute_cache_key(project_dir, analysis_level, use_codeql, target_files)
    shard = digest[:2]
    return cache_root().joinpath("cache", shard, digest)
0 commit comments