pypose · zitongzhan · Dec 13, 2025 · Dec 23, 2025 · Dec 28, 2025 · Dec 28, 2025
diff --git a/.gitignore b/.gitignore
@@ -11,6 +11,7 @@ tmp_debug/*
 task.md
 *.pickle
 *.png
+.warp_cache/*
 .DS_Store
 tmp/*
 examples/module/pgo/data/*

diff --git a/ba_example.py b/ba_example.py
@@ -1,25 +1,46 @@
 from time import perf_counter
+from datetime import datetime
+from pathlib import Path
 
 import pypose as pp
 import torch
 import torch.nn as nn
+import warp as wp
 from pypose.autograd.function import psjac
 
 from datapipes.bal_loader import get_problem
-from bae.optim import LM
+from bae.optim.optimizer import Schur
+from bae.optim.triton_kernel import sparse_bsr_mv
 from bae.utils.pysolvers import PCG
 
 TARGET_DATASET = "trafalgar"
 TARGET_PROBLEM = "problem-257-65132-pre"
-# other options:
 # TARGET_DATASET = "ladybug"
 # TARGET_PROBLEM = "problem-1723-156502-pre"
 # TARGET_DATASET = "dubrovnik"
 # TARGET_PROBLEM = "problem-356-226730-pre"
+# TARGET_DATASET = "final"
+# TARGET_PROBLEM = "problem-13682-4456117-pre"
+# TARGET_DATASET = "venice"
+# TARGET_PROBLEM = "problem-1778-993923-pre"
 
 DEVICE = "cuda"
 OPTIMIZE_INTRINSICS = True
 NUM_CAMERA_PARAMS = 10 if OPTIMIZE_INTRINSICS else 7
+REPORT_WARP_MEMPOOL = True  # gates the Warp mempool stats that main() queries and prints
+
+
+def _format_bytes(num_bytes: int) -> str:
+    """Format a signed byte count: whole bytes, else two decimals in KiB..TiB."""
+    prefix = "-" if num_bytes < 0 else ""
+    magnitude = float(abs(num_bytes))
+    labels = ("B", "KiB", "MiB", "GiB", "TiB")
+    rank = 0
+    while rank < len(labels) - 1 and not magnitude < 1024.0:  # TiB is the cap
+        magnitude /= 1024.0
+        rank += 1
+    amount = str(int(magnitude)) if rank == 0 else f"{magnitude:.2f}"
+    return f"{prefix}{amount} {labels[rank]}"
 
 
 @psjac
@@ -54,7 +75,51 @@ def least_square_error(camera_params, points, cidx, pidx, observes):
     return torch.sum(loss**2, dim=-1).mean()
 
 
+class TrustRegion(pp.optim.strategy.TrustRegion):
+    """Trust-region strategy that takes an optional ``Jwp`` keyword in ``update``."""
+
+    def update(self, pg, last, loss, J, D, R, *args, **kwargs):
+        """Score the last step, then rescale ``pg['radius']``/``pg['down']``/``pg['damping']``."""
+        Jwp = kwargs.get("Jwp")
+        use_bsr = Jwp is not None
+        blocks = Jwp if use_bsr else J
+        # Sum over i of blocks[i] applied to the flattened D[i].
+        JD = None
+        for i in range(len(D)):
+            step = D[i].flatten()
+            if use_bsr:
+                term = sparse_bsr_mv(blocks[i], step.contiguous()).flatten()
+            else:
+                term = blocks[i] @ step
+            JD = term if JD is None else JD + term
+
+        JD = JD[..., None]
+        denom = -(JD.mT @ (2 * R.view_as(JD) + JD)).squeeze()
+        # A non-decreasing loss or a non-positive denominator marks the step as failed.
+        quality = -1.0 if (loss >= last or denom <= 0) else (last - loss) / denom
+
+        pg['radius'] = 1.0 / pg['damping']
+        if quality > pg['high']:
+            pg['radius'] = pg['up'] * pg['radius']
+            pg['down'] = self.down
+        elif quality > pg['low']:
+            pg['down'] = self.down
+        else:
+            pg['radius'] = pg['radius'] * pg['down']
+            pg['down'] = pg['down'] * pg['factor']
+        pg['down'] = max(self.min, min(pg['down'], self.max))
+        pg['radius'] = max(self.min, min(pg['radius'], self.max))
+        pg['damping'] = 1.0 / pg['radius']
+
+
 def main():
+    file_name = f"{TARGET_DATASET}.{TARGET_PROBLEM}"
+    cuda_device = torch.device(DEVICE) if DEVICE.startswith("cuda") else None
+    memory_snapshot_path = None
+    warp_device = None
+    warp_mempool_start_current = None
+    warp_mempool_start_high = None
+
     dataset = get_problem(TARGET_PROBLEM, TARGET_DATASET)
     print(f"Fetched {TARGET_PROBLEM} from {TARGET_DATASET}")
 
@@ -69,13 +134,37 @@ def main():
         "pidx": dataset["point_index_of_observations"],
     }
 
+    if cuda_device is not None and torch.cuda.is_available():
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        snapshot_dir = Path("memory_traces")
+        snapshot_dir.mkdir(exist_ok=True)
+        memory_snapshot_path = snapshot_dir / f"{file_name}_cuda_memory_{timestamp}.pickle"
+        torch.cuda.memory._record_memory_history(
+            enabled="all",
+            context="all",
+            stacks="python",
+            device=cuda_device,
+            clear_history=True,
+        )
+
+    if REPORT_WARP_MEMPOOL and DEVICE.startswith("cuda"):
+        try:
+            if wp.is_cuda_available():
+                warp_device = wp.get_device("cuda:0" if DEVICE == "cuda" else DEVICE)
+                if not wp.is_mempool_enabled(warp_device):
+                    wp.set_mempool_enabled(warp_device, True)
+                warp_mempool_start_current = wp.get_mempool_used_mem_current(warp_device)
+                warp_mempool_start_high = wp.get_mempool_used_mem_high(warp_device)
+        except Exception as e:
+            print(f"Warning: failed to query Warp mempool stats: {e}")
+
     model = Residual(
         dataset["camera_params"][:, :NUM_CAMERA_PARAMS].clone(),
         dataset["points_3d"].clone(),
     ).to(DEVICE)
-    strategy = pp.optim.strategy.TrustRegion(up=2.0, down=0.5**4)
+    strategy = TrustRegion(up=2.0, down=0.5**4)
     solver = PCG(tol=1e-4, maxiter=250)
-    optimizer = LM(model, strategy=strategy, solver=solver, reject=30)
+    optimizer = Schur(model, strategy=strategy, solver=solver, reject=30, matrix_free_normal=True)
 
     print('Loss:', least_square_error(
         model.pose,
@@ -87,15 +176,48 @@ def main():
 
     print("Initial loss", optimizer.model.loss(input, None).item())
 
+    if cuda_device is not None and torch.cuda.is_available():
+        torch.cuda.synchronize(cuda_device)
+        torch.cuda.reset_peak_memory_stats(cuda_device)
+
     start = perf_counter()
     for idx in range(20):
         loss = optimizer.step(input)
         print("Iteration", idx, "loss", loss.item(), "time", perf_counter() - start)
 
-    torch.cuda.synchronize()
+    if cuda_device is not None and torch.cuda.is_available():
+        torch.cuda.synchronize(cuda_device)
     end = perf_counter()
     print("Time", end - start)
 
+    if memory_snapshot_path:
+        torch.cuda.synchronize(cuda_device)
+        torch.cuda.memory._dump_snapshot(str(memory_snapshot_path))
+        print(f"CUDA memory snapshot saved to {memory_snapshot_path}")
+
+    if cuda_device is not None and torch.cuda.is_available():
+        peak_allocated = torch.cuda.max_memory_allocated(cuda_device)
+        try:
+            peak_reserved = torch.cuda.max_memory_reserved(cuda_device)
+        except AttributeError:
+            peak_reserved = torch.cuda.max_memory_cached(cuda_device)
+        print(f"Peak CUDA memory allocated: {_format_bytes(peak_allocated)}")
+        print(f"Peak CUDA memory reserved: {_format_bytes(peak_reserved)}")
+
+    if warp_device is not None and warp_mempool_start_current is not None:
+        try:
+            warp_current = wp.get_mempool_used_mem_current(warp_device)
+            warp_high = wp.get_mempool_used_mem_high(warp_device)
+            print(f"Warp CUDA mempool current: {_format_bytes(warp_current)} "
+                f"(Δ {_format_bytes(warp_current - warp_mempool_start_current)})"
+            )
+            print(
+                f"Warp CUDA mempool high-water: {_format_bytes(warp_high)} "
+                f"(Δ {_format_bytes(warp_high - warp_mempool_start_high)})"
+            )
+        except Exception as e:
+            print(f"Warning: failed to query Warp mempool stats: {e}")
+
     print('Ending loss:', least_square_error(
         model.pose,
         model.points,