57 commits
a417266  Trains and Evals (Kinvert, Jan 13, 2026)
49af2d4  Reward Changes (Kinvert, Jan 13, 2026)
daaf902  Rendered with spheres or something (Kinvert, Jan 13, 2026)
332a9ae  Good Claude - Wireframe Planes (Kinvert, Jan 13, 2026)
0116b97  Physics model: incidence, comments, test suite (Kinvert, Jan 13, 2026)
b29bf5a  Renamed md Files (Kinvert, Jan 13, 2026)
95eb2ef  Moved Physics to File (Kinvert, Jan 13, 2026)
3582d2d  Physics in Own File - Test Flights (Kinvert, Jan 14, 2026)
1c30c54  Coordinated Turn Tests (Kinvert, Jan 14, 2026)
1131e83  Simple Optimizations (Kinvert, Jan 14, 2026)
374871d  Small Perf - Move cosf Out of Loop (Kinvert, Jan 14, 2026)
8598067  Autopilot Seperate File (Kinvert, Jan 14, 2026)
80bcf31  Vectorized Autopilot (Kinvert, Jan 14, 2026)
0a1c2e6  Weighted Random Actions (Kinvert, Jan 15, 2026)
63a7aae  Observation Schemas Swept (Kinvert, Jan 15, 2026)
04dd016  Rewards Fixed - Sweepable (Kinvert, Jan 15, 2026)
26709b9  Preparing for Sweeps (Kinvert, Jan 15, 2026)
a31d1dc  Fix Terminals and Loggin (Kinvert, Jan 15, 2026)
3cc5b58  More Sweep Prep (Kinvert, Jan 15, 2026)
17f18c1  Fix Reward and Score (Kinvert, Jan 15, 2026)
d639ee3  Temp Undo Later - Clamp logstd (Kinvert, Jan 15, 2026)
2606e20  Apply Sweep df1 84 u5i33hej (Kinvert, Jan 16, 2026)
bc72836  New Obs Schemas - New Sweep Prep (Kinvert, Jan 16, 2026)
fe7e26a  Roll Penalty - Elevator Might Be Inversed (Kinvert, Jan 16, 2026)
652ab7a  Fix Elevator Problems (Kinvert, Jan 17, 2026)
30fa9fe  Fix Obs 5 Schema and Adjust Penalties (Kinvert, Jan 17, 2026)
ab222bf  Increase Batch Size for Speed (Kinvert, Jan 17, 2026)
7fd88f1  Next Sweep Improvements - Likes to Aileron Roll too Much (Kinvert, Jan 17, 2026)
9dca5c6  Reduce Prints (Kinvert, Jan 17, 2026)
b68d1b2  Simplify Penalties and Rewards (Kinvert, Jan 18, 2026)
03d1ebc  Try to Avoid NAN (Kinvert, Jan 18, 2026)
7a15539  Trying to Stop NANs (Kinvert, Jan 18, 2026)
2c3073f  Debug Prints (Kinvert, Jan 18, 2026)
be1e31c  Fix Mean Outside Bounds (Kinvert, Jan 18, 2026)
f6c821d  Still Trying to Fix Blowups (Kinvert, Jan 18, 2026)
3f0f8b4  Revert Some Ini Values (Kinvert, Jan 18, 2026)
6c61df6  Restore Much of Ini to 9dca5c6 (Kinvert, Jan 18, 2026)
faf6eb6  Reduce Learning Rate Again (Kinvert, Jan 18, 2026)
4e640ee  Trying to Fix Curriculum - Agent Trains Poorly (Kinvert, Jan 18, 2026)
f302224  Aim Annealing - Removed Some Penalties (Kinvert, Jan 19, 2026)
f000fb8  Added More Debugging (Kinvert, Jan 19, 2026)
7a75d2b  Some Fixes - SPS Gains - New Sweep Soon (Kinvert, Jan 19, 2026)
92aa6c5  Fixed Rewards That Turn Negative (Kinvert, Jan 19, 2026)
fd1941f  Reduce Negative G Penalties (Kinvert, Jan 19, 2026)
d8a8475  Revert to df5 (f3022) + SPS gains, Ready for df7 (Kinvert, Jan 19, 2026)
4c3ebd3  Clamp for nans - df7 2.0 (Kinvert, Jan 19, 2026)
bfa061f  This Potentially Helps with Curriculum (Kinvert, Jan 20, 2026)
214338e  3M SPS Prep for df8 Sweep (Kinvert, Jan 20, 2026)
f2af35e  df9 Sweep Prep - Sweeping Stages (Kinvert, Jan 20, 2026)
060bbfb  Safer Sweeps - Obs Clamps - Coeff Ranges (Kinvert, Jan 20, 2026)
153bd08  Add sweep persistence and override injection for Protein (Kinvert, Jan 21, 2026)
8c7260b  df10 Sweep Prep - Simplified Rewards, New Obs Scheme (Kinvert, Jan 21, 2026)
4b72007  Observation Scheme Tests (Kinvert, Jan 22, 2026)
784856b  Rudder Damping - Obs HUD - Test Updates (Kinvert, Jan 22, 2026)
b0f22a3  Code Cleanup (Kinvert, Jan 22, 2026)
6859683  Reduce Sweep Params - Rudder Drag - Restructure and Add Tests (Kinvert, Jan 22, 2026)
84d8241  Logs Update (Kinvert, Jan 22, 2026)
1 change: 1 addition & 0 deletions .gitignore
@@ -162,3 +162,4 @@ pufferlib/ocean/impulse_wars/*-release/
 pufferlib/ocean/impulse_wars/debug-*/
 pufferlib/ocean/impulse_wars/release-*/
 pufferlib/ocean/impulse_wars/benchmark/
+pufferlib/ocean/dogfight/dogfight_test
407 changes: 407 additions & 0 deletions pufferlib/SWEEP_PERSISTENCE.md

Large diffs are not rendered by default.

27 changes: 6 additions & 21 deletions pufferlib/config/default.ini
@@ -28,24 +28,24 @@ device = cuda
 optimizer = muon
 anneal_lr = True
 precision = float32
-total_timesteps = 10_000_000
+total_timesteps = 100_000_000
 learning_rate = 0.015
 gamma = 0.995
-gae_lambda = 0.90
+gae_lambda = 0.95
 update_epochs = 1
 clip_coef = 0.2
 vf_coef = 2.0
 vf_clip_coef = 0.2
 max_grad_norm = 1.5
-ent_coef = 0.001
+ent_coef = 0.01
 adam_beta1 = 0.95
 adam_beta2 = 0.999
 adam_eps = 1e-12

 data_dir = experiments
 checkpoint_interval = 200
 batch_size = auto
-minibatch_size = 8192
+minibatch_size = 16384

 # Accumulate gradients above this size
 max_minibatch_size = 32768
@@ -58,7 +58,7 @@ vtrace_rho_clip = 1.0
 vtrace_c_clip = 1.0

 prio_alpha = 0.8
-prio_beta0 = 0.2
+prio_beta0 = 0.5

 [sweep]
 method = Protein
@@ -75,24 +75,9 @@ prune_pareto = True
 #mean = 8
 #scale = auto

-# TODO: Elim from base
-[sweep.train.total_timesteps]
-distribution = log_normal
-min = 3e7
-max = 1e10
-mean = 2e8
-scale = time
-
-[sweep.train.bptt_horizon]
-distribution = uniform_pow2
-min = 16
-max = 64
-mean = 64
-scale = auto
-
 [sweep.train.minibatch_size]
 distribution = uniform_pow2
-min = 8192
+min = 16384
 max = 65536
 mean = 32768
 scale = auto
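For context on the "# Accumulate gradients above this size" comment in the default.ini hunk above: with minibatch_size = 16384 and max_minibatch_size = 32768, a trainer handed a minibatch larger than the cap would typically split it into chunks and accumulate gradients before taking a single optimizer step. A generic sketch of that pattern in plain PyTorch (illustration only, not pufferlib's actual training loop; model, optimizer, loss_fn, and batch are placeholders):

import torch

def accumulated_step(model, optimizer, loss_fn, batch, max_minibatch_size=32768):
    # Split an oversized minibatch into chunks no larger than max_minibatch_size
    # and accumulate gradients across them before one optimizer step.
    # Generic illustration of the default.ini comment above, not pufferlib code.
    optimizer.zero_grad()
    chunks = batch.split(max_minibatch_size)
    for chunk in chunks:
        loss = loss_fn(model(chunk)) / len(chunks)  # average so the step matches one big batch
        loss.backward()                             # gradients accumulate in .grad
    optimizer.step()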
146 changes: 146 additions & 0 deletions pufferlib/config/ocean/dogfight.ini
@@ -0,0 +1,146 @@
[base]
env_name = puffer_dogfight
package = ocean
policy_name = Policy
rnn_name = Recurrent

[vec]
num_envs = 8

[env]
reward_aim_scale = 0.05
reward_closing_scale = 0.003
penalty_neg_g = 0.02
speed_min = 50.0

max_steps = 3000
num_envs = 1024
obs_scheme = 1

curriculum_enabled = 1
curriculum_randomize = 0
advance_threshold = 0.7

[train]
adam_beta1 = 0.9768629406862324
adam_beta2 = 0.999302214750495
adam_eps = 6.906760212075045e-12
batch_size = auto
bptt_horizon = 64
checkpoint_interval = 200
clip_coef = 0.4928184678032994
ent_coef = 0.008
gae_lambda = 0.8325103714810463
gamma = 0.8767105842751813
learning_rate = 0.00024
max_grad_norm = 0.831714766100049
max_minibatch_size = 65536
minibatch_size = 65536
prio_alpha = 0.8195880336315146
prio_beta0 = 0.9429570720846501
seed = 42
total_timesteps = 400_000_000
update_epochs = 4
vf_clip_coef = 3.2638480501249436
vf_coef = 4.293249868787825
vtrace_c_clip = 1.911078435368836
vtrace_rho_clip = 3.797866655513644

[sweep]
downsample = 1
goal = maximize
method = Protein
metric = ultimate
prune_pareto = True
use_gpu = True

[sweep.env.reward_aim_scale]
distribution = uniform
min = 0.02
max = 0.1
mean = 0.05
scale = auto

[sweep.env.reward_closing_scale]
distribution = uniform
min = 0.001
max = 0.01
mean = 0.003
scale = auto

[sweep.env.penalty_neg_g]
distribution = uniform
min = 0.01
max = 0.05
mean = 0.02
scale = auto

[sweep.env.obs_scheme]
distribution = int_uniform
max = 5
mean = 0
min = 0
scale = 1.0

[sweep.env.advance_threshold]
distribution = uniform
min = 0.5
max = 0.85
mean = 0.7
scale = auto

[sweep.env.max_steps]
distribution = int_uniform
min = 300
max = 1500
mean = 900
scale = 1.0

[sweep.train.learning_rate]
distribution = log_normal
max = 0.0005
mean = 0.00025
min = 0.0001
scale = 0.5

[sweep.train.vf_coef]
distribution = uniform
min = 1.0
max = 5.0
mean = 3.0
scale = auto

[sweep.train.clip_coef]
distribution = uniform
min = 0.3
max = 1.0
mean = 0.5
scale = auto

[sweep.train.ent_coef]
distribution = log_normal
min = 0.002
max = 0.02
mean = 0.008
scale = 0.5

[sweep.train.max_grad_norm]
distribution = uniform
min = 0.5
max = 2.0
mean = 1.0
scale = auto

[sweep.train.gae_lambda]
distribution = logit_normal
min = 0.9
max = 0.999
mean = 0.95
scale = auto

[sweep.train.gamma]
distribution = logit_normal
min = 0.95
max = 0.9999
mean = 0.99
scale = auto
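For reference on the sweep entries above: a log_normal parameter such as [sweep.train.learning_rate] is sampled so that its logarithm is roughly normal around log(mean) with spread scale, clipped to [min, max]. A minimal sketch of that sampling rule (an illustration of the distribution only, not Protein's actual sampler; the function name is hypothetical):

import numpy as np

def sample_log_normal(mean, scale, low, high, rng=None):
    # Draw a value whose log is normal around log(mean), then clip to [low, high].
    # Mirrors the [sweep.train.learning_rate] entry above; not the Protein implementation.
    rng = rng or np.random.default_rng()
    value = float(np.exp(rng.normal(np.log(mean), scale)))
    return min(max(value, low), high)

# One candidate learning rate for a sweep trial:
lr = sample_log_normal(mean=0.00025, scale=0.5, low=0.0001, high=0.0005)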
2 changes: 1 addition & 1 deletion pufferlib/environments/mani_skill/torch.py
@@ -64,7 +64,7 @@ def decode_actions(self, hidden):
         '''Decodes a batch of hidden states into (multi)discrete actions.
         Assumes no time dimension (handled by LSTM wrappers).'''
         mean = self.decoder_mean(hidden)
-        logstd = self.decoder_logstd.expand_as(mean)
+        logstd = self.decoder_logstd.expand_as(mean).clamp(min=-20, max=2)
         std = torch.exp(logstd)
         logits = torch.distributions.Normal(mean, std)
         values = self.value(hidden)
2 changes: 1 addition & 1 deletion pufferlib/models.py
@@ -88,7 +88,7 @@ def decode_actions(self, hidden):
             logits = self.decoder(hidden).split(self.action_nvec, dim=1)
         elif self.is_continuous:
             mean = self.decoder_mean(hidden)
-            logstd = self.decoder_logstd.expand_as(mean)
+            logstd = self.decoder_logstd.expand_as(mean).clamp(min=-20, max=2)
             std = torch.exp(logstd)
             logits = torch.distributions.Normal(mean, std)
         else:
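The .clamp(min=-20, max=2) added in the two diffs above bounds std = exp(logstd) to roughly [2e-9, 7.4], so the Normal distribution's sample and log_prob stay finite even if the learned logstd parameter drifts, which is one of the NaN sources the commit history above ("Try to Avoid NAN", "Clamp for nans") is fighting. A standalone sketch of the effect with a toy continuous action head (illustrative module, not the pufferlib Policy class):

import torch

hidden_dim, action_dim = 64, 4
decoder_mean = torch.nn.Linear(hidden_dim, action_dim)
decoder_logstd = torch.nn.Parameter(torch.zeros(1, action_dim))  # learned per-action log std

hidden = torch.randn(8, hidden_dim)
mean = decoder_mean(hidden)

# Without the clamp, a drifting logstd (say +50) overflows exp() and log_prob
# returns inf/nan; clamping keeps std within about [2e-9, 7.4].
logstd = decoder_logstd.expand_as(mean).clamp(min=-20, max=2)
std = torch.exp(logstd)
dist = torch.distributions.Normal(mean, std)

action = dist.sample()
logprob = dist.log_prob(action).sum(dim=-1)  # finite by construction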