MMLU gemma
nonselect
- global 0.2017
- layer 24: 17.58% (61/347)
- layer 24 with bias: 11.88% (69/581)
- validation filtered: 0.1717 (132/769)
select
higher since we don't have
- global 0.3843 (269/700)
- layer 0.5294 (99/187)
- layer decode 0.4633 (101/218)
BBQ gemma
- ambig global 0.0755
- ambig layer 0
- ambig foreach 0 (0/1532)
- disambig global 0.2573
- disambig layer 0.1667
- disambig foreach 0.1530
GSM
- 84/186 = 0.4516 이건 CoT + steer 효과로 과소평가됨
- cot 분해해서 cot 의 SER 재보자
- real global - SER=0.7386 ( Pos=86, Neg=243)
Harmbench
- 0.0429 global
- 0.2500 layer
- 0.1429 foreach
python train.py train --layer=global --example --task=bbq --filter_value=ambig --layer=foreach && python train.py train --layer=global --example --task=bbq --filter_value=disambig --layer=foreach
python train.py train --layer=foreach --example --validate
# BBQ
# ambig
python train.py train --model=gemma2b --task=bbq --layer=foreach --example --filter_value=ambig
python train.py train --model=gemma2b --task=bbq --layer=global --example --filter_value=ambig
python train.py train --model=gemma2b --task=bbq --layer=foreach --example --filter_value=ambig --few=1
python train.py train --model=gemma2b --task=bbq --layer=global --example --filter_value=ambig --few=1
# disambig
python train.py train --model=gemma2b --task=bbq --layer=foreach --example --filter_value=disambig
python train.py train --model=gemma2b --task=bbq --layer=global --example --filter_value=disambig
python train.py train --model=gemma2b --task=bbq --layer=foreach --example --filter_value=disambig --few=1
python train.py train --model=gemma2b --task=bbq --layer=global --example --filter_value=disambig --few=1
# MMLU
python train.py train --model=gemma2b --task=mmlu --layer=foreach --example
python train.py train --model=gemma2b --task=mmlu --layer=global --example
# SimpleQA
python train.py train --model=gemma2b --task=simpleqa --layer=foreach --example
python train.py train --model=gemma2b --task=simpleqa --layer=global --example
# GSM8K
python train.py train --model=gemma2b --task=gsm8k --layer=foreach --example --steer_pool=mean
python train.py train --model=gemma2b --task=gsm8k --layer=global --example
# HarmBench
python train.py train --model=gemma2b --task=harmbench --layer=foreach --example
python train.py train --model=gemma2b --task=harmbench --layer=global --example
# XSTest
python train.py train --model=gemma2b --task=xstest --layer=foreach --example
python train.py train --model=gemma2b --task=xstest --layer=global --example
python train.py train --model=gemma2b --task=xstest --layer=foreach --example --few=1
python train.py train --model=gemma2b --task=xstest --layer=global --example --few=1
# WMDP
python train.py train --model=gemma2b --task=wmdp --layer=foreach --example
python train.py train --model=gemma2b --task=wmdp --layer=global --example
# BBQ
# ambig
python train.py train --model=llama8 --task=bbq --layer=foreach --example --filter_value=ambig
python train.py train --model=llama8 --task=bbq --layer=global --example --filter_value=ambig
python train.py train --model=llama8 --task=bbq --layer=foreach --example --filter_value=ambig --few=1
python train.py train --model=llama8 --task=bbq --layer=global --example --filter_value=ambig --few=1
# disambig
python train.py train --model=llama8 --task=bbq --layer=foreach --example --filter_value=disambig
python train.py train --model=llama8 --task=bbq --layer=global --example --filter_value=disambig
python train.py train --model=llama8 --task=bbq --layer=foreach --example --filter_value=disambig --few=1
python train.py train --model=llama8 --task=bbq --layer=global --example --filter_value=disambig --few=1
# MMLU
python train.py train --model=llama8 --task=mmlu --layer=foreach --example
python train.py train --model=llama8 --task=mmlu --layer=global --example
# SimpleQA
python train.py train --model=llama8 --task=simpleqa --layer=foreach --example
python train.py train --model=llama8 --task=simpleqa --layer=global --example
# GSM8K
python train.py train --model=llama8 --task=gsm8k --layer=foreach --example --steer_pool=mean
python train.py train --model=llama8 --task=gsm8k --layer=global --example
# HarmBench
python train.py train --model=llama8 --task=harmbench --layer=foreach --example
python train.py train --model=llama8 --task=harmbench --layer=global --example
# XSTest
python train.py train --model=llama8 --task=xstest --layer=foreach --example
python train.py train --model=llama8 --task=xstest --layer=global --example
# WMDP
python train.py train --model=llama8 --task=wmdp --layer=foreach --example
python train.py train --model=llama8 --task=wmdp --layer=global --example
# GSM8K
python train.py train --task=gsm8k --layer=foreach --example --steer_pool=mean
# XSTest
python train.py train --model=gemma2b --task=xstest --layer=global --example
python train.py train --model=gemma2b --task=xstest --layer=foreach --example
# WMDP
python train.py train --model=gemma2b --task=wmdp --layer=foreach --example
python train.py train --model=gemma2b --task=wmdp --layer=global --example
name pos neg total SER pos% neg%
gemma2b_bbq_ambig_17_11021_7 658 0 658 0.0000 1.0000 0.0000
gemma2b_bbq_ambig_multi_15 1532 0 1532 0.0000 1.0000 0.0000
gemma2b_bbq_ambig_multi_25 649 53 702 0.0755 0.9245 0.0755
gemma2b_bbq_disambig_17_5137_8 80 16 96 0.1667 0.8333 0.1667
gemma2b_bbq_disambig_multi_25 355 123 478 0.2573 0.7427 0.2573
gemma2b_gsm8k_multi_25 102 84 186 0.4516 0.5484 0.4516
gemma2b_harmbench_15_1570_24 24 4 28 0.1429 0.8571 0.1429
gemma2b_harmbench_7_11722_5 6 2 8 0.2500 0.7500 0.2500
gemma2b_harmbench_multi_25 67 3 70 0.0429 0.9571 0.0429
gemma2b_mmlu_24_16355_37 286 61 347 0.1758 0.8242 0.1758
gemma2b_mmlu_24_16355_37_decode 512 69 581 0.1188 0.8812 0.1188
gemma2b_mmlu_24_16355_37_select 88 99 187 0.5294 0.4706 0.5294
gemma2b_mmlu_24_16355_37_select_decode 117 101 218 0.4633 0.5367 0.4633
gemma2b_mmlu_multi_17 637 132 769 0.1717 0.8283 0.1717
gemma2b_mmlu_multi_25 768 194 962 0.2017 0.7983 0.2017
gemma2b_mmlu_multi_25_select 431 269 700 0.3843 0.6157 0.3843
# Summarize per-run positive/negative example counts found in an output
# directory and print them as a table.
#
# For every run name the script looks for three files in the directory:
#   <name>_metrics.json   - a JSON object; the keys read are "pos_count",
#                           "neg_count", "ser", "pos_ratio" and "neg_ratio"
#   <name>_positive.json  - JSON whose top-level length is used as the pos count
#   <name>_negative.json  - JSON whose top-level length is used as the neg count
# A readable metrics file wins; otherwise the counts are derived from the
# positive/negative files.
#
# Usage: python <this script> [out_dir]      (out_dir defaults to "output")
import glob
import json
import os
import sys

_POS_SUFFIX = "_positive.json"
_NEG_SUFFIX = "_negative.json"
_METRICS_SUFFIX = "_metrics.json"

# Column widths mirror the per-row format string used in _format_rows.
_HEADER = (
    f"{'name':60} {'pos':>4} {'neg':>4} {'total':>6} "
    f"{'SER':6} {'pos%':6} {'neg%':6}"
)


def load_len(path):
    """Return the top-level length of the JSON document stored at *path*.

    Returns 0 when *path* is None, the file cannot be read, the content is
    not valid JSON, or the decoded value has no length.
    """
    try:
        with open(path, encoding="utf-8") as f:
            return len(json.load(f))
    except (OSError, ValueError, TypeError):
        return 0


def _index_by_suffix(out_dir, suffix):
    """Map run name -> path for each ``*<suffix>`` file directly in *out_dir*."""
    # Escape the directory so characters such as "[" in its name are matched
    # literally instead of being interpreted as glob syntax.
    pattern = os.path.join(glob.escape(out_dir), "*" + suffix)
    # Slice off only the trailing suffix; str.replace would also remove an
    # occurrence of the suffix in the middle of a file name.
    return {
        os.path.basename(path)[: -len(suffix)]: path
        for path in glob.glob(pattern)
    }


def _ratios(pos, neg):
    """Return ``(total, pos / total, neg / total)``; ratios are None if total is 0."""
    total = pos + neg
    if not total:
        return total, None, None
    return total, pos / total, neg / total


def _collect_rows(out_dir):
    """Build one ``(name, pos, neg, total, ser, pos_ratio, neg_ratio)`` row per run.

    Rows are returned sorted by run name.
    """
    pos_files = _index_by_suffix(out_dir, _POS_SUFFIX)
    neg_files = _index_by_suffix(out_dir, _NEG_SUFFIX)
    metric_files = _index_by_suffix(out_dir, _METRICS_SUFFIX)

    rows = {}

    # Prefer metrics files when they can be read.
    for name, mpath in metric_files.items():
        try:
            with open(mpath, encoding="utf-8") as f:
                metrics = json.load(f)
            pos = int(metrics.get("pos_count", 0))
            neg = int(metrics.get("neg_count", 0))
        except (OSError, ValueError, TypeError,
                AttributeError, OverflowError) as err:
            # Best effort: report the problem and let the run fall back to
            # the positive/negative files below instead of aborting.
            print(f"warning: ignoring metrics file {mpath}: {err}",
                  file=sys.stderr)
            continue
        total, pos_ratio, neg_ratio = _ratios(pos, neg)
        rows[name] = (
            name,
            pos,
            neg,
            total,
            # Values stored in the file take precedence over computed ones;
            # the fallback for "ser" is neg / total.
            metrics.get("ser", neg_ratio),
            metrics.get("pos_ratio", pos_ratio),
            metrics.get("neg_ratio", neg_ratio),
        )

    # Derive counts for runs that have no usable metrics file.
    for name in set(pos_files) | set(neg_files):
        if name in rows:
            continue
        pos = load_len(pos_files[name]) if name in pos_files else 0
        neg = load_len(neg_files[name]) if name in neg_files else 0
        total, pos_ratio, neg_ratio = _ratios(pos, neg)
        rows[name] = (name, pos, neg, total, neg_ratio, pos_ratio, neg_ratio)

    return [rows[name] for name in sorted(rows)]


def _format_ratio(value):
    """Format a ratio with four decimals, or "N/A" when it is None."""
    return f"{value:.4f}" if value is not None else "N/A"


def _format_rows(rows):
    """Return the table as a list of text lines, header first."""
    # The header uses a fixed "name" label. It must not depend on a loop
    # variable: that printed the last run name as the column title and
    # raised NameError when no files were found.
    lines = [_HEADER]
    for name, pos, neg, total, ser, pr, nr in rows:
        ser_s = _format_ratio(ser)
        pr_s = _format_ratio(pr)
        nr_s = _format_ratio(nr)
        lines.append(
            f"{name:60} {pos:4d} {neg:4d} {total:6d} {ser_s:6} {pr_s:6} {nr_s:6}"
        )
    return lines


def main(argv=None):
    """Print the summary table for the directory named in ``argv[1]``.

    *argv* defaults to ``sys.argv``; when no directory argument is given,
    "output" is used.
    """
    args = sys.argv if argv is None else argv
    out_dir = args[1] if len(args) > 1 else "output"
    for line in _format_rows(_collect_rows(out_dir)):
        print(line)


if __name__ == "__main__":
    main()
Seonglae Cho