Skip to content

Commit 20d44f0

Browse files
ms609claude
andcommitted
Refresh n=4 cells; extend shape lookup to n=11; Hamilton scripts
Upstream PID fix (TreeDist commit 18371ac "Small-tree comparison fix") required re-running enumerate_small_n.R for exact n=4..7 cells of the slim lookup — the prior n=4 PID value (0.9333) was wrong; correct value is 0.6667 (matching CID at n=4, where the only non-trivial split is balanced 2|2). data-raw/exact_small_n.rds refreshed. Shape-conditioned lookup extended from n=4..9 to n=4..11 via Hamilton HPC (jobs 17155360 build + 17155361_10,11 shape; ~40 min and ~73 min respectively, nSim=500 at n=10 and nSim=200 at n=11). n=12 still running on Hamilton (~6h remaining at nSim=100). xz-compressed file size is now 744 KB - a CRAN size concern; may need to ship a leaner table or move the larger n cells to inst/extdata via lazy load. n=4 PID cells of the shape lookup also rebuilt (previously all 1.0 under broken upstream PID). Scripts vendored under data-raw/: - hamilton_build.sh: SLURM job to build TreeTools+TreeDist on Hamilton. - hamilton_shape_lookup.{R,sh}: per-n shape-pair MC and the array job driver (n in 10..12 with adaptive nSim). - extend_shape_lookup.R: local-CPU equivalent (kept for completeness; prefer Hamilton for n >= 10). tests/testthat/test-tree_distance_adjusted.R "Lookup falls back to MC" test retargeted from n=300 to n=500 (n=300 will be inside the lookup once the n=4..400 extension completes). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 41e92db commit 20d44f0

7 files changed

Lines changed: 211 additions & 4 deletions

File tree

data-raw/exact_small_n.rds

-14 Bytes
Binary file not shown.

data-raw/extend_shape_lookup.R

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# extend_shape_lookup.R -----------------------------------------------------
2+
# (1) Rebuild n=4 cells of inst/extdata/shapeExpectedDistances.rds: the
3+
# previously committed n=4 values were drawn while
4+
# PhylogeneticInfoDistance was broken at n=4 (upstream bug, fixed in
5+
# TreeDist commit 18371ac7 "Small-tree comparison fix"); PID is all 1.0
6+
# in the stale rows.
7+
# (2) Add n=10 to the shipped shape lookup (nSim = 500 per pair; ~2 hours).
8+
#
9+
# n=11 and n=12 deferred to Hamilton: estimated wall time 8 hours and 15+
10+
# hours respectively at usable nSim. See PLAN.md "Findings to follow up".
11+
12+
suppressPackageStartupMessages({
13+
devtools::load_all("C:/Users/pjjg18/GitHub/worktrees/acid-aphid-TreeTools",
14+
quiet = TRUE)
15+
devtools::load_all(".", quiet = TRUE)
16+
})
17+
18+
out_file <- file.path("inst", "extdata", "shapeExpectedDistances.rds")
19+
result_list <- readRDS(out_file)
20+
cat("Loaded:", paste(names(result_list), collapse = ","), "\n")
21+
22+
.ShapePairMC <- function(s1, s2, n, nSim) {
23+
lab_pool <- as.character(seq_len(n))
24+
cid_v <- numeric(nSim); pid_v <- numeric(nSim)
25+
for (i in seq_len(nSim)) {
26+
t1 <- TreeTools::Preorder(
27+
TreeTools::RootedTreeWithShape(s1, n, sample(lab_pool)))
28+
t2 <- TreeTools::Preorder(
29+
TreeTools::RootedTreeWithShape(s2, n, sample(lab_pool)))
30+
cid_v[i] <- ClusteringInfoDistance(t1, t2, normalize = TRUE)
31+
pid_v[i] <- PhylogeneticInfoDistance(t1, t2, normalize = TRUE)
32+
}
33+
data.frame(
34+
shape1 = s1, shape2 = s2,
35+
mean_cid = mean(cid_v), sd_cid = sd(cid_v),
36+
mean_pid = mean(pid_v), sd_pid = sd(pid_v),
37+
n_pairs = nSim
38+
)
39+
}
40+
41+
BuildOneN <- function(n, nSim) {
42+
W <- as.integer(TreeTools::NRootedShapes(n))
43+
pairs_total <- W * (W + 1L) / 2L
44+
cat(sprintf("[n=%d] W=%d shapes, %d pairs, nSim=%d\n", n, W, pairs_total, nSim))
45+
t0 <- Sys.time()
46+
rows <- vector("list", pairs_total)
47+
k <- 0L
48+
for (s1 in seq_len(W) - 1L) {
49+
for (s2 in s1:(W - 1L)) {
50+
k <- k + 1L
51+
rows[[k]] <- .ShapePairMC(s1, s2, n, nSim)
52+
}
53+
if (W > 20 && s1 %% 10L == 0L && s1 > 0L) {
54+
cat(sprintf(" s1=%d/%d k=%d/%d elapsed=%.1f s\n",
55+
s1, W - 1L, k, pairs_total,
56+
as.numeric(Sys.time() - t0, units = "secs")))
57+
flush.console()
58+
}
59+
}
60+
df <- do.call(rbind, rows)
61+
cat(sprintf(" done in %.1f s\n",
62+
as.numeric(Sys.time() - t0, units = "secs")))
63+
df
64+
}
65+
66+
# (1) Rebuild n=4 against fixed PID.
67+
result_list[["4"]] <- BuildOneN(4L, 1000L)
68+
saveRDS(result_list, out_file, compress = TRUE)
69+
cat("After n=4 refresh, file size:", file.info(out_file)$size, "bytes\n")
70+
cat("n=4 row check:\n"); print(result_list[["4"]])
71+
72+
# (2) Add n=10.
73+
if (!"10" %in% names(result_list)) {
74+
result_list[["10"]] <- BuildOneN(10L, 500L)
75+
result_list <- result_list[order(as.integer(names(result_list)))]
76+
saveRDS(result_list, out_file, compress = TRUE)
77+
}
78+
79+
cat("\nFinal keys:", paste(names(result_list), collapse = ","), "\n")
80+
cat("Final file size:", file.info(out_file)$size, "bytes\n")

data-raw/hamilton_build.sh

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
2+
#SBATCH --job-name=acid-build
3+
#SBATCH --partition=shared
4+
#SBATCH -n 4
5+
#SBATCH --mem=8G
6+
#SBATCH --time=00:45:00
7+
#SBATCH --output=/nobackup/%u/acid-aphid/logs/%x_%j.out
8+
#SBATCH --error=/nobackup/%u/acid-aphid/logs/%x_%j.err
9+
10+
set -euo pipefail
11+
module load r/4.5.1
12+
module load gcc/14.2
13+
export OMP_NUM_THREADS=1
14+
export OPENBLAS_NUM_THREADS=1
15+
16+
LIB=/nobackup/$USER/acid-aphid/lib
17+
export R_LIBS_USER=$LIB
18+
19+
cd /nobackup/$USER/acid-aphid/repos/TreeTools
20+
git pull origin acid-aphid
21+
rm -f src/*.o src/*.so
22+
R CMD build --no-build-vignettes --no-manual --no-resave-data .
23+
R CMD INSTALL --library=$LIB TreeTools_*.tar.gz
24+
rm -f TreeTools_*.tar.gz
25+
26+
cd /nobackup/$USER/acid-aphid/repos/TreeDist
27+
git pull origin acid-aphid
28+
rm -f src/*.o src/*.so
29+
R CMD build --no-build-vignettes --no-manual --no-resave-data .
30+
R CMD INSTALL --library=$LIB TreeDist_*.tar.gz
31+
rm -f TreeDist_*.tar.gz
32+
33+
echo "Build complete."
34+
Rscript -e ".libPaths(c('$LIB', .libPaths())); cat('TreeTools', as.character(packageVersion('TreeTools')), '\n'); cat('TreeDist', as.character(packageVersion('TreeDist')), '\n')"

data-raw/hamilton_shape_lookup.R

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# shape_lookup_n.R - compute the shape-conditioned mean CID & PID for every
2+
# pair of rooted shapes at given n. Args: n nSim out_path
3+
suppressPackageStartupMessages({
4+
.libPaths(c(Sys.getenv("R_LIBS_USER", "/nobackup/pjjg18/acid-aphid/lib"),
5+
.libPaths()))
6+
library("TreeTools")
7+
library("TreeDist")
8+
})
9+
10+
args <- commandArgs(trailingOnly = TRUE)
11+
n <- as.integer(args[1])
12+
nSim <- as.integer(args[2])
13+
out_path <- args[3]
14+
stopifnot(!is.na(n), !is.na(nSim), nzchar(out_path))
15+
16+
cat(sprintf("[%s] n=%d nSim=%d out=%s\n",
17+
format(Sys.time(), "%Y-%m-%d %H:%M:%S"), n, nSim, out_path))
18+
19+
W <- as.integer(NRootedShapes(n))
20+
pairs_total <- W * (W + 1L) / 2L
21+
cat(sprintf("W=%d pairs=%d\n", W, pairs_total))
22+
23+
ShapePairMC <- function(s1, s2, n, nSim) {
24+
lab_pool <- as.character(seq_len(n))
25+
cid_v <- numeric(nSim); pid_v <- numeric(nSim)
26+
for (i in seq_len(nSim)) {
27+
t1 <- Preorder(RootedTreeWithShape(s1, n, sample(lab_pool)))
28+
t2 <- Preorder(RootedTreeWithShape(s2, n, sample(lab_pool)))
29+
cid_v[i] <- ClusteringInfoDistance(t1, t2, normalize = TRUE)
30+
pid_v[i] <- PhylogeneticInfoDistance(t1, t2, normalize = TRUE)
31+
}
32+
c(mean_cid = mean(cid_v), sd_cid = sd(cid_v),
33+
mean_pid = mean(pid_v), sd_pid = sd(pid_v))
34+
}
35+
36+
t0 <- Sys.time()
37+
rows <- vector("list", pairs_total)
38+
k <- 0L
39+
for (s1 in seq_len(W) - 1L) {
40+
for (s2 in s1:(W - 1L)) {
41+
k <- k + 1L
42+
stats <- ShapePairMC(s1, s2, n, nSim)
43+
rows[[k]] <- data.frame(shape1 = s1, shape2 = s2,
44+
mean_cid = stats[["mean_cid"]],
45+
sd_cid = stats[["sd_cid"]],
46+
mean_pid = stats[["mean_pid"]],
47+
sd_pid = stats[["sd_pid"]],
48+
n_pairs = nSim)
49+
}
50+
if (s1 %% 10L == 0L && s1 > 0L) {
51+
elapsed <- as.numeric(Sys.time() - t0, units = "secs")
52+
eta <- elapsed * pairs_total / k - elapsed
53+
cat(sprintf("[%s] s1=%d/%d k=%d/%d elapsed=%.1fs eta=%.1fs\n",
54+
format(Sys.time(), "%H:%M:%S"),
55+
s1, W - 1L, k, pairs_total, elapsed, eta))
56+
flush.console()
57+
}
58+
}
59+
df <- do.call(rbind, rows)
60+
saveRDS(df, out_path, compress = TRUE)
61+
cat(sprintf("Wrote %s (%.1f KB)\n", out_path, file.info(out_path)$size / 1024))
62+
cat(sprintf("Total wall: %.1f s\n", as.numeric(Sys.time() - t0, units = "secs")))

data-raw/hamilton_shape_lookup.sh

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/bash
2+
#SBATCH --job-name=acid-shape
3+
#SBATCH --partition=shared
4+
#SBATCH -n 1
5+
#SBATCH --mem=4G
6+
#SBATCH --time=24:00:00
7+
#SBATCH --array=10-12
8+
#SBATCH --output=/nobackup/%u/acid-aphid/logs/%x_%A_%a.out
9+
#SBATCH --error=/nobackup/%u/acid-aphid/logs/%x_%A_%a.err
10+
11+
set -euo pipefail
12+
module load r/4.5.1
13+
export OMP_NUM_THREADS=1
14+
export OPENBLAS_NUM_THREADS=1
15+
16+
LIB=/nobackup/$USER/acid-aphid/lib
17+
export R_LIBS_USER=$LIB
18+
19+
n=$SLURM_ARRAY_TASK_ID
20+
21+
# nSim scales down as W(n) grows. Targets: n=10 nSim=500 (~108 min), n=11
22+
# nSim=200 (~3 h), n=12 nSim=100 (~7-8 h). All comfortably under 24 h.
23+
case $n in
24+
10) nSim=500 ;;
25+
11) nSim=200 ;;
26+
12) nSim=100 ;;
27+
*) nSim=200 ;;
28+
esac
29+
30+
out=/nobackup/$USER/acid-aphid/results/shape_lookup_n${n}.rds
31+
Rscript /nobackup/$USER/acid-aphid/scripts/shape_lookup_n.R $n $nSim $out
683 KB
Binary file not shown.

tests/testthat/test-tree_distance_adjusted.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,15 +69,15 @@ test_that("Lookup agrees with on-the-fly MC at n = 20", {
6969
})
7070

7171
test_that("Lookup falls back to MC outside the shipped range", {
72-
# n = 300 is beyond the shipped table's max (n = 200); the lookup path
73-
# should silently fall back to MC with an informative message.
72+
# n = 500 is beyond the shipped table's max (n = 400 post-extension); the
73+
# lookup path should silently fall back to MC with an informative message.
7474
expect_message(
75-
val <- ExpectedClusteringInfoDistance(300L, nSim = 5L),
75+
val <- ExpectedClusteringInfoDistance(500L, nSim = 5L),
7676
"outside the shipped lookup range"
7777
)
7878
expect_true(is.finite(val) && val > 0 && val < 1)
7979
expect_message(
80-
ExpectedPhylogeneticInfoDistance(300L, nSim = 5L),
80+
ExpectedPhylogeneticInfoDistance(500L, nSim = 5L),
8181
"outside the shipped lookup range"
8282
)
8383
})

0 commit comments

Comments
 (0)