Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions FindOpenStudioSDK.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ set(OPENSTUDIO_VERSION_PRERELEASE "")
# Enter SHA, always, eg "+79857912c4"
set(OPENSTUDIO_VERSION_SHA "+241b8abb4d")

# EnergyPlus release whose Input/Output Reference pages are linked. The two
# numbers form the "<major>-<minor>" path segment of the base URL below.
# NOTE(review): scripts/check_doc_urls.py builds the same base URL on its own;
# keep both in sync when bumping the version.
set(ENERGYPLUS_VERSION_MAJOR 25)
set(ENERGYPLUS_VERSION_MINOR 1)
set(BIGLADDERSOFTWARE_DOC_BASE_URL "https://bigladdersoftware.com/epx/docs/${ENERGYPLUS_VERSION_MAJOR}-${ENERGYPLUS_VERSION_MINOR}/input-output-reference/")

# Paths where the cmake-downloaded archives will be put
set(OPENSTUDIO_ARCHIVE_DIR "${PROJECT_BINARY_DIR}/OpenStudio-${OPENSTUDIO_VERSION}")

Expand Down
213 changes: 213 additions & 0 deletions scripts/check_doc_urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""
check_doc_urls.py - Verify BigLadder EnergyPlus I/O Reference URLs in OpenStudioApp source.

Scans source files for doc URL strings, fetches each unique page once, checks that
every anchor referenced actually exists in the page HTML, and reports failures.

Usage:
python scripts/check_doc_urls.py [--repo-root PATH] [--delay SEC]

Exit codes:
0 All URLs valid
1 One or more broken/missing anchors found
2 Usage / dependency error

Why a Python script rather than a GTest network test
-----------------------------------------------------
BigLadder returns HTTP 200 for *any* URL on an existing page, regardless of whether
the anchor exists. A plain HTTP HEAD or GET check would silently pass even when an
anchor has been renamed or removed. Verifying anchor IDs requires fetching the full
page HTML and scanning for id="..." attributes — straightforward in Python with
html.parser, but awkward in C++/Qt without a full HTML parser dependency.

GTest network tests were also considered but ruled out because:
- They are slow and flaky in CI (network dependency).
- QNetworkAccessManager requires a running event loop and async handling.
- GTest provides no natural mechanism to fetch-and-parse HTML for anchor checks.

This script runs standalone (no build step), can be invoked as a pre-commit hook or
CI job, and completes in roughly one second per unique page fetched.
"""

import argparse
import re
import sys
import time
from collections import defaultdict
from html.parser import HTMLParser
from pathlib import Path
from urllib.request import urlopen
from urllib.error import URLError

# ---------------------------------------------------------------------------
# Files to scan and the regex patterns that extract URL fragments from them
# ---------------------------------------------------------------------------

# Matches values in the IddObjectDocUrl.hpp urlMap and groupMap:
#   {"OS:Something", "group-foo.html#anchor"},
#   {"OpenStudio Group Name", "group-foo.html"},
# The quoted key must start with one of the listed prefixes (note that
# "Airflow" is the only prefix without a trailing space or colon). The single
# capture group is the quoted value: a "*.html" page name with an optional
# "#anchor" suffix.
IDDOBJECTDOCURL_PATTERN = re.compile(
    r'"(?:OS:|OpenStudio |Solar |Electric |Energy |User |Python |Airflow)[^"]*"\s*,\s*"([^"]+\.html(?:#[^"]*)?)"'
)

# Matches QString URL constructions in .cpp files:
#   QString::fromStdString(openstudio::bigladdersoftwareDocBaseUrl()) + "page.html#anchor"
#   or the old QStringLiteral(BASE "page.html#anchor") form
# The last two alternatives catch any `+ "group-....html"` or
# `+ "lifecyclecost-....html"` concatenation regardless of what precedes it.
# Each alternative has its own capture group, so for any match exactly one of
# the four groups is set; callers must pick the non-empty one.
CPP_URL_FRAGMENT_PATTERN = re.compile(
    r'(?:bigladdersoftwareDocBaseUrl\(\)\s*\+\s*"([^"]+\.html(?:#[^"]*)?)"'
    r'|QStringLiteral\(BIGLADDERSOFTWARE_DOC_BASE_URL\s*"([^"]+\.html(?:#[^"]*)?)"'
    r'|\+\s*"(group-[^"]+\.html(?:#[^"]*)?)"'
    r'|\+\s*"(lifecyclecost-[^"]+\.html(?:#[^"]*)?)")'
)

# Source files, relative to the repository root, that are scanned for
# documentation URL fragments.
SOURCE_FILES = [
    "src/model_editor/IddObjectDocUrl.hpp",
    "src/openstudio_lib/SimSettingsView.cpp",
    "src/openstudio_lib/LoadsView.cpp",
    "src/openstudio_lib/ConstructionsView.cpp",
    "src/openstudio_lib/MaterialsView.cpp",
    "src/openstudio_lib/ScheduleOthersView.cpp",
    "src/openstudio_lib/LocationTabView.cpp",
    "src/openstudio_lib/YearSettingsWidget.cpp",
    "src/openstudio_lib/GroundTemperatureView.cpp",
    "src/openstudio_lib/LifeCycleCostsTabView.cpp",
    "src/openstudio_lib/BuildingInspectorView.cpp",
]

# EnergyPlus release whose I/O reference is checked. These mirror the
# ENERGYPLUS_VERSION_MAJOR / ENERGYPLUS_VERSION_MINOR variables set in
# FindOpenStudioSDK.cmake; bump both places together.
ENERGYPLUS_VERSION_MAJOR = 25
ENERGYPLUS_VERSION_MINOR = 1

# Base URL every extracted "page.html[#anchor]" fragment is appended to.
BIGLADDERSOFTWARE_BASE = (
    "https://bigladdersoftware.com/epx/docs/"
    f"{ENERGYPLUS_VERSION_MAJOR}-{ENERGYPLUS_VERSION_MINOR}"
    "/input-output-reference/"
)


# ---------------------------------------------------------------------------
# HTML parser that collects all id= attributes
# ---------------------------------------------------------------------------

class AnchorCollector(HTMLParser):
    """Collect every fragment target defined in an HTML document.

    A URL fragment ``#foo`` resolves to the element whose ``id`` is ``foo`` or,
    failing that, to a legacy ``<a name="foo">`` anchor. Both kinds of target
    are gathered into ``self.ids`` so that pages using either form are checked
    correctly.

    Attributes:
        ids: set of non-empty ``id`` values (any tag) and ``name`` values
            (``<a>`` tags only) seen so far.
    """

    def __init__(self):
        super().__init__()
        self.ids = set()

    def handle_starttag(self, tag, attrs):
        """Record the fragment targets declared on one start tag.

        HTMLParser lower-cases ``tag`` and attribute names before calling
        this, and also routes self-closing tags here by default.
        """
        for name, value in attrs:
            if not value:
                # Valueless (`<p id>`) or empty attributes cannot be targets.
                continue
            # `name` is only a fragment target on <a>; on form controls and
            # other elements it has an unrelated meaning.
            if name == "id" or (name == "name" and tag == "a"):
                self.ids.add(value)


# ---------------------------------------------------------------------------
# URL extraction
# ---------------------------------------------------------------------------

def extract_fragments(repo_root: Path):
    """Scan the configured source files for documentation URL fragments.

    Every file listed in SOURCE_FILES is read line by line and matched against
    the pattern appropriate for it (IddObjectDocUrl.hpp has its own pattern;
    every other file uses the C++ concatenation pattern). Files that do not
    exist are reported on stderr and skipped.

    Args:
        repo_root: directory that the SOURCE_FILES entries are relative to.

    Returns:
        A defaultdict mapping each full page URL to a list of
        ``(anchor_or_None, source_file, line_no)`` tuples, one per reference.
    """
    refs_by_page = defaultdict(list)

    for rel_path in SOURCE_FILES:
        source_path = repo_root / rel_path
        if not source_path.exists():
            print(f" WARNING: {rel_path} not found, skipping", file=sys.stderr)
            continue

        regex = (
            IDDOBJECTDOCURL_PATTERN
            if rel_path.endswith("IddObjectDocUrl.hpp")
            else CPP_URL_FRAGMENT_PATTERN
        )

        contents = source_path.read_text(encoding="utf-8")
        for line_no, line in enumerate(contents.splitlines(), start=1):
            for match in regex.finditer(line):
                # Only one alternative of the pattern captures per match.
                captured = [group for group in match.groups() if group]
                if not captured:
                    continue

                # Split "page.html#anchor" at the first '#'; no '#' means the
                # reference points at the page as a whole.
                page_part, hash_sign, tail = captured[0].partition("#")
                anchor = tail if hash_sign else None

                refs_by_page[BIGLADDERSOFTWARE_BASE + page_part].append(
                    (anchor, rel_path, line_no)
                )

    return refs_by_page


# ---------------------------------------------------------------------------
# Page fetching with simple cache
# ---------------------------------------------------------------------------

def fetch_anchors(url: str, delay: float = 0.5) -> set | None:
"""Fetch a page and return the set of id= values, or None on error."""
try:
time.sleep(delay)
with urlopen(url, timeout=15) as resp:
html = resp.read().decode("utf-8", errors="replace")
parser = AnchorCollector()
parser.feed(html)
return parser.ids
except URLError as e:
print(f" ERROR fetching {url}: {e}", file=sys.stderr)
return None


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Command-line entry point: scan, fetch, verify, and report.

    Exits with status 0 when every reference resolves, 1 when at least one
    page or anchor is broken, and 2 on usage errors or when no URLs are found.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's layout (usage / exit-code tables)
        # instead of letting argparse reflow it into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--repo-root",
        default=".",
        help="Path to the OpenStudioApplication repo root (default: current directory)",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=0.5,
        help="Seconds to wait between page fetches (default: 0.5)",
    )
    args = parser.parse_args()
    if args.delay < 0:
        # parser.error() prints the usage line and exits with status 2.
        parser.error("--delay must be non-negative")

    repo_root = Path(args.repo_root).resolve()
    print(f"Scanning repo: {repo_root}")

    fragments = extract_fragments(repo_root)
    if not fragments:
        print("No URLs found — check SOURCE_FILES list.", file=sys.stderr)
        sys.exit(2)

    total_refs = sum(len(refs) for refs in fragments.values())
    print(f"\nFound {total_refs} URL references across {len(fragments)} unique pages.\n")

    failures = []

    # The keys of `fragments` are already unique page URLs, so each page is
    # fetched exactly once without any additional caching.
    for page_url in sorted(fragments):
        print(f"Checking: {page_url}")
        page_ids = fetch_anchors(page_url, delay=args.delay)

        for anchor, src_file, lineno in fragments[page_url]:
            if page_ids is None:
                failures.append((src_file, lineno, page_url, anchor, "page fetch failed"))
            elif anchor and anchor not in page_ids:
                failures.append((src_file, lineno, page_url, anchor, "anchor not found in page"))
            else:
                status = "OK" if anchor else "OK (no anchor)"
                print(f" {status}: #{anchor or ''}")

    print()
    if failures:
        print(f"FAILURES ({len(failures)}):")
        for src_file, lineno, page_url, anchor, reason in failures:
            # `anchor` is None for page-only references; avoid printing "#None".
            print(f" {src_file}:{lineno} #{anchor or ''} -> {reason}")
            print(f" {page_url}#{anchor or ''}")
        sys.exit(1)
    else:
        print("All URLs OK.")
        sys.exit(0)


if __name__ == "__main__":
main()
Loading
Loading