Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions configs/prune_llm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,19 @@ Configurations for generating results in the SCAR LLM pruning paper.

## Quick Start

Run all experiments:
Run single model:
```bash
bash slurm_jobs/prune_llm/run_all_paper.sh
python scripts/run_experiment.py --config configs/prune_llm/llama3_8b_unified.yaml
```

Run single model:
Paper batch launchers now live under:
```bash
python scripts/run_experiment.py --config configs/prune_llm/llama3_8b_unified.yaml
drafts/LLM_prune/paper/slurm_jobs/
```

See:
```bash
drafts/LLM_prune/paper/slurm_jobs/README.md
```

Override base output directory:
Expand Down
146 changes: 146 additions & 0 deletions configs/prune_llm/llama3_70b_scale_benchmarks_50_papersafe.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
---
# ============================================================================
# LLAMA-3.1-70B PAPER-SAFE BENCHMARK RERUN: 50% FFN PRUNING + 8B-MATCHED TASKS
# ============================================================================
#
# Purpose:
#   - Upgrade the preliminary 70B benchmark table from 100 samples/task to a
#     more defensible evaluation budget for main-paper use.
#   - Match the 8B benchmark suite more closely by including ARC-E and OBQA.
#   - Keep the method set focused on the main comparisons that matter in the
#     paper.
#
# Recommended use:
#   - Promote 70B task results into the main paper only after this rerun
#     finishes.
#
# NOTE(review): this file was recovered from a rendered diff with indentation
# stripped; the 2-space nesting below is reconstructed from key semantics and
# from the sibling config llama3_70b_scale_mechanism.yaml — confirm against
# the original file before relying on it.
# ============================================================================

experiment:
  name: "llama3_70b_scale_benchmarks_50_papersafe"
  type: "llm_alignment"
  output_dir: "./results/paper/llama3_70b_scale_benchmarks_50_papersafe"
  seed: 42
  device: "cuda"
  save_activations: false
  num_networks: 1

model:
  name: "hf_causal_lm"
  model_id: "meta-llama/Llama-3.1-70B"
  dtype: "bfloat16"
  device_map: "auto"
  trust_remote_code: true
  # FFN projection modules whose activations are tracked for scoring.
  tracked_layers:
    - "model.model.layers.*.mlp.up_proj"
    - "model.model.layers.*.mlp.gate_proj"
    - "model.model.layers.*.mlp.down_proj"

dataset:
  name: "wikitext"
  batch_size: 1
  num_workers: 0

calibration:
  dataset: "wikitext"
  subset: "wikitext-2-raw-v1"
  split: "train"
  num_samples: 64
  max_length: 512
  batch_size: 1

metrics:
  enabled:
    - "activation_l2_norm"
  num_samples: 64

# NOTE(review): top-level copies of the SCAR settings; the same values appear
# under `llm:` below — confirm which location the experiment runner reads.
do_scar_metrics: true
scar_num_samples: 64
scar_max_length: 512

llm:
  scar_metrics: true
  scar_num_samples: 64
  scar_max_length: 512
  evaluate_perplexity: true
  evaluation_num_samples: 500
  use_nvidia_fewshot: true
  perplexity_protocol: "oats"
  wikitext_subset: "wikitext-2-raw-v1"
  perplexity_seq_len: 2048

  # 8B-matched task suite (adds ARC-E and OBQA vs. the preliminary run).
  # NOTE(review): assumed to nest under `llm:` as in the mechanism config.
  evaluation_metrics:
    - "perplexity"
    - "accuracy_mmlu"
    - "accuracy_hellaswag"
    - "accuracy_piqa"
    - "accuracy_boolq"
    - "accuracy_winogrande"
    - "accuracy_arc_easy"
    - "accuracy_arc_challenge"
    - "accuracy_openbookqa"

analysis:
  generate_plots: false
  save_scores: true

# NOTE(review): top-level duplicates of the `analysis:` flags — presumably for
# a second consumer; verify they are both read before removing either copy.
generate_plots: false
save_scores: true

do_connectivity_pruning: true
do_directed_redundancy: false
do_halo_analysis: false
do_generalized_importance: false

supernode:
  enabled: true
  score_metric: "scar_loss_proxy"
  core_fraction: 0.01
  follower_fraction: 0.10
  halo_fraction: 0.10
  connectivity_topk: 256
  connectivity_rank_normalize: false
  connectivity_power: 1.0
  protect_core: true
  protect_core_metrics:
    - "scar_loss_proxy"
    - "supernode_protection_score"
    - "supernode_connectivity_score"
  cross_layer_analysis: false
  compare_by_connection: true
  compute_metrics:
    - "activation"

supernode_robustness:
  enabled: false

supernode_summary:
  enabled: false
  outlier_analysis: false

halo_analysis:
  enabled: false

cross_layer:
  enabled: false

generalized_importance:
  enabled: false

pruning:
  enabled: true
  target: "ffn"
  structured: true
  dependency_aware: true
  distribution: "uniform"
  min_per_layer: 0.0
  max_per_layer: 0.95

  sparsity_levels: [0.5]
  selection_modes: ["low"]
  algorithms:
    - "scar_loss_proxy"
    - "supernode_protection_score"
    - "supernode_connectivity_score"
    - "activation_l2_norm"
    - "weight_magnitude"
    - "wanda"

evaluation:
  enabled: true
126 changes: 126 additions & 0 deletions configs/prune_llm/llama3_70b_scale_mechanism.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
---
# ============================================================================
# LLAMA-3.1-70B SCALE CHECK: MECHANISM / CONCENTRATION ONLY
# ============================================================================
#
# Purpose:
#   - Test whether the main supernode concentration phenomenon persists at 70B.
#   - Keep the run limited to the reviewer-relevant scale question:
#       - LP concentration / supernode identification
#       - optional outlier summary
#   - Skip pruning sweeps, downstream tasks, halo controls, and true ablation
#     probes.
#
# This is designed as a rebuttal-focused replication, not a full paper rerun.
#
# NOTE(review): this file was recovered from a rendered diff with indentation
# stripped; the 2-space nesting below is reconstructed from key semantics —
# confirm against the original file before relying on it.
# ============================================================================

experiment:
  name: "llama3_70b_scale_mechanism"
  type: "llm_alignment"
  output_dir: "./results/paper/llama3_70b_scale_mechanism"
  seed: 42
  device: "cuda"
  save_activations: false
  num_networks: 1

model:
  name: "hf_causal_lm"
  model_id: "meta-llama/Llama-3.1-70B"
  dtype: "bfloat16"
  device_map: "auto"
  trust_remote_code: true
  # FFN projection modules whose activations are tracked for scoring.
  tracked_layers:
    - "model.model.layers.*.mlp.up_proj"
    - "model.model.layers.*.mlp.gate_proj"
    - "model.model.layers.*.mlp.down_proj"

dataset:
  name: "wikitext"
  batch_size: 1
  num_workers: 0

calibration:
  dataset: "wikitext"
  subset: "wikitext-2-raw-v1"
  split: "train"
  num_samples: 64
  max_length: 512
  batch_size: 1

metrics:
  enabled:
    - "rayleigh_quotient"
  num_samples: 64
  rayleigh_quotient:
    relative: true
    regularization: 1.0e-6

# NOTE(review): top-level copies of the SCAR settings; the same values appear
# under `llm:` below — confirm which location the experiment runner reads.
do_scar_metrics: true
scar_num_samples: 64
scar_max_length: 512

llm:
  scar_metrics: true
  scar_num_samples: 64
  scar_max_length: 512
  evaluate_perplexity: false
  evaluation_metrics: []
  wikitext_subset: "wikitext-2-raw-v1"

analysis:
  generate_plots: false
  save_scores: true

# NOTE(review): top-level duplicates of the `analysis:` flags — presumably for
# a second consumer; verify they are both read before removing either copy.
generate_plots: false
save_scores: true

do_connectivity_pruning: false
do_directed_redundancy: false
do_halo_analysis: false
do_generalized_importance: false

supernode:
  enabled: true
  score_metric: "scar_loss_proxy"
  core_fraction: 0.01
  follower_fraction: 0.10
  halo_fraction: 0.10
  connectivity_topk: 256
  connectivity_rank_normalize: false
  connectivity_power: 1.0
  protect_core: true
  protect_core_metrics:
    - "scar_loss_proxy"
  cross_layer_analysis: false
  compare_by_connection: false
  compute_metrics:
    - "activation"

read_halo_analysis:
  enabled: false

conditional_halo_ablation:
  enabled: false

lp_ablation_validation:
  enabled: false

supernode_robustness:
  enabled: false

supernode_summary:
  enabled: true
  outlier_analysis: true

halo_analysis:
  enabled: false

cross_layer:
  enabled: false

generalized_importance:
  enabled: false

pruning:
  enabled: false

evaluation:
  enabled: false
Loading
Loading