diff --git a/.gitignore b/.gitignore index dbd6338..83ed498 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,11 @@ *.pyc +*.pt build/ dist/ .idea *.egg-info/ *.safetensors -outputs/ \ No newline at end of file +outputs/ +.cache/ +data/ +results/ \ No newline at end of file diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000..acf2ea0 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,2 @@ +wbits = [2, 3, 4, 8] +sparsity = [0.0, 0.5, 0.9] \ No newline at end of file diff --git a/datautils.py b/datautils.py index 045121a..a269a22 100644 --- a/datautils.py +++ b/datautils.py @@ -1,10 +1,12 @@ import numpy as np import torch + def set_seed(seed): np.random.seed(seed) torch.random.manual_seed(seed) + def get_wikitext2(nsamples, seed, seqlen, model): from datasets import load_dataset traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train') @@ -97,66 +99,6 @@ def __init__(self, input_ids): return trainloader, valenc -def get_ptb_new(nsamples, seed, seqlen, model): - from datasets import load_dataset - traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') - testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt') - testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt') - - import random - random.seed(seed) - trainloader = [] - for _ in range(nsamples): - i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = trainenc.input_ids[:, i:j] - tar = inp.clone() - tar[:, :-1] = -100 - trainloader.append((inp, tar)) - return trainloader, testenc - -def get_c4_new(nsamples, seed, seqlen, model): - from datasets import load_dataset - traindata = load_dataset( - 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train' - ) - valdata = 
load_dataset( - 'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation' - ) - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - - import random - random.seed(seed) - trainloader = [] - for _ in range(nsamples): - while True: - i = random.randint(0, len(traindata) - 1) - trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') - if trainenc.input_ids.shape[1] >= seqlen: - break - i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = trainenc.input_ids[:, i:j] - tar = inp.clone() - tar[:, :-1] = -100 - trainloader.append((inp, tar)) - - valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt') - valenc = valenc.input_ids[:, :(256 * seqlen)] - - class TokenizerWrapper: - def __init__(self, input_ids): - self.input_ids = input_ids - valenc = TokenizerWrapper(valenc) - - return trainloader, valenc - def get_loaders( name, nsamples=128, seed=0, seqlen=2048, model='' @@ -164,10 +106,6 @@ def get_loaders( if 'wikitext2' in name: return get_wikitext2(nsamples, seed, seqlen, model) if 'ptb' in name: - if 'new' in name: - return get_ptb_new(nsamples, seed, seqlen, model) return get_ptb(nsamples, seed, seqlen, model) if 'c4' in name: - if 'new' in name: - return get_c4_new(nsamples, seed, seqlen, model) - return get_c4(nsamples, seed, seqlen, model) + return get_c4(nsamples, seed, seqlen, model) \ No newline at end of file diff --git a/delta.txt b/delta.txt new file mode 100644 index 0000000..a1c817b --- /dev/null +++ b/delta.txt @@ -0,0 +1,5 @@ ++------+-----------+------+ +| Bits | wikitext2 | ptb | ++------+-----------+------+ +| 4 | None | None | ++------+-----------+------+ \ No newline at end of file diff --git a/delta_2_bits.txt b/delta_2_bits.txt new file mode 100644 index 0000000..f7647d2 --- /dev/null +++ b/delta_2_bits.txt @@ -0,0 +1,697 @@ +Starting ... +Ready. 
+0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... 
+8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... 
+16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... 
+24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... 
+32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +wikitext2 +Evaluating ... 
+0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.183259963989258 +Downloading and preparing dataset ptb_text_only/penn_treebank (download: 5.68 MiB, generated: 5.72 MiB, post-processed: Unknown size, total: 11.40 MiB) to /root/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f... +Dataset ptb_text_only downloaded and prepared to /root/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f. Subsequent calls will reuse this data. +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.479209899902344 +Downloading and preparing dataset json/allenai--c4 to /root/.cache/huggingface/datasets/json/allenai--c4-6fbe877195f42de5/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde... +Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/allenai--c4-6fbe877195f42de5/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data. +Downloading and preparing dataset json/allenai--c4 to /root/.cache/huggingface/datasets/json/allenai--c4-efc3d4f4606f44bd/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde... +Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/allenai--c4-efc3d4f4606f44bd/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data. +c4-new +Evaluating ... 
+0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.7892680168151855 diff --git a/delta_2bits_sparse_09.txt b/delta_2bits_sparse_09.txt new file mode 100644 index 0000000..8089e8e --- /dev/null +++ b/delta_2bits_sparse_09.txt @@ -0,0 +1,1054 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... 
+6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... 
+14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... 
+22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... 
+30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... 
+38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.098198890686035 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.15268325805664 +c4-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.6517863273620605 diff --git a/delta_2bits_sparse_099.txt b/delta_2bits_sparse_099.txt new file mode 100644 index 0000000..a7ba5b7 --- /dev/null +++ b/delta_2bits_sparse_099.txt @@ -0,0 +1,1054 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... 
+3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... 
+11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... 
+19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... 
+27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... 
+35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.087564945220947 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.305665969848633 +c4-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.614907741546631 diff --git a/delta_4bits.txt b/delta_4bits.txt new file mode 100644 index 0000000..4cecc92 --- /dev/null +++ b/delta_4bits.txt @@ -0,0 +1 @@ +Starting ... 
diff --git a/delta_4bits_sparse_09.txt b/delta_4bits_sparse_09.txt new file mode 100644 index 0000000..8089e8e --- /dev/null +++ b/delta_4bits_sparse_09.txt @@ -0,0 +1,1054 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... 
+7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... 
+15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... 
+23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... 
+31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... 
+39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.098198890686035 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.15268325805664 +c4-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.6517863273620605 diff --git a/delta_4bits_sparse_099.txt b/delta_4bits_sparse_099.txt new file mode 100644 index 0000000..a7ba5b7 --- /dev/null +++ b/delta_4bits_sparse_099.txt @@ -0,0 +1,1054 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... 
+4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... 
+12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... 
+20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... 
+28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... 
+36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.087564945220947 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.305665969848633 +c4-new +Evaluating ... 
+0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.614907741546631 diff --git a/delta_sparse_09.txt b/delta_sparse_09.txt new file mode 100644 index 0000000..a1c817b --- /dev/null +++ b/delta_sparse_09.txt @@ -0,0 +1,5 @@ ++------+-----------+------+ +| Bits | wikitext2 | ptb | ++------+-----------+------+ +| 4 | None | None | ++------+-----------+------+ \ No newline at end of file diff --git a/delta_sparse_099.txt b/delta_sparse_099.txt new file mode 100644 index 0000000..a1c817b --- /dev/null +++ b/delta_sparse_099.txt @@ -0,0 +1,5 @@ ++------+-----------+------+ +| Bits | wikitext2 | ptb | ++------+-----------+------+ +| 4 | None | None | ++------+-----------+------+ \ No newline at end of file diff --git a/evaluation.sh b/evaluation.sh new file mode 100755 index 0000000..d995499 --- /dev/null +++ b/evaluation.sh @@ -0,0 +1,66 @@ +CUDA_VISIBLE_DEVICES=7 python3 -u llama_delta.py \ + --wbits 4 \ + --true-sequential --act-order --new-eval\ + --groupsize 1024 > delta_4bits.txt & + +CUDA_VISIBLE_DEVICES=2 python3 -u llama_delta.py \ + --wbits 2 \ + --true-sequential --act-order --new-eval\ + --groupsize 1024 > delta_2_bits.txt & + +CUDA_VISIBLE_DEVICES=6 python3 -u llama_delta.py \ + --groupsize 1024 \ + --wbits 4 \ + --true-sequential --act-order --new-eval\ + --sparsify_hard_threshold \ + --fraction_of_zero 0.9 > delta_4bits_sparse_09.txt & + +CUDA_VISIBLE_DEVICES=5 python3 -u llama_delta.py \ + --groupsize 1024 \ + --wbits 2 \ + --true-sequential --act-order --new-eval\ + --sparsify_hard_threshold \ + --fraction_of_zero 0.9 > delta_2bits_sparse_09.txt & + +CUDA_VISIBLE_DEVICES=4 python3 -u llama_delta.py \ + --groupsize 1024 \ + --wbits 4 \ + --true-sequential --act-order --new-eval\ + --sparsify_hard_threshold \ + --fraction_of_zero 0.99 > delta_4bits_sparse_099.txt & + +CUDA_VISIBLE_DEVICES=3 python3 -u llama_delta.py \ + --groupsize 1024 \ + --wbits 2 \ + 
--true-sequential --act-order --new-eval\ + --sparsify_hard_threshold \ + --fraction_of_zero 0.99 > delta_2bits_sparse_099.txt & + +# & +# CUDA_VISIBLE_DEVICES=4 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 16 \ +# --benchmark_results "file_4.txt" \ +#& +# CUDA_VISIBLE_DEVICES=5 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 64 \ +# --benchmark_results "file_5.txt" \ +# & +# CUDA_VISIBLE_DEVICES=6 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 32 \ +# --sparsify_hard_threshold \ +# --fraction_of_zero 0.9 \ +# --benchmark_results "file_6.txt" \ +# & +# CUDA_VISIBLE_DEVICES=7 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 32 \ +# --sparsify_hard_threshold \ +# --fraction_of_zero 0.99 \ +# --benchmark_results "file_7.txt" \ No newline at end of file diff --git a/file_0.txt b/file_0.txt new file mode 100644 index 0000000..7fd8537 --- /dev/null +++ b/file_0.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+-------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+-------------------+--------------------+ +| 2 | 107356160 | 315.56584095954895 | 12.70229721069336 | 18.99186134338379 | 16.049821853637695 | +| 3 | 107356160 | 254.49543404579163 | 12.98267936706543 | 19.62110710144043 | 16.652606964111328 | +| 4 | 107356160 | 285.25878047943115 | 12.996271133422852 | 19.65008544921875 | 16.664426803588867 | ++------+-----------+--------------------+--------------------+-------------------+--------------------+ \ No newline at end of file diff --git a/file_1.txt b/file_1.txt new file mode 100644 index 0000000..1af604f --- /dev/null +++ b/file_1.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | 
++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 288.47421526908875 | 12.124371528625488 | 17.396339416503906 | 15.110072135925293 | +| 3 | 107356160 | 310.34640645980835 | 12.246709823608398 | 17.316566467285156 | 14.97178840637207 | +| 4 | 107356160 | 262.9206793308258 | 12.252873420715332 | 17.329992294311523 | 14.979094505310059 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_2.txt b/file_2.txt new file mode 100644 index 0000000..4ca10d2 --- /dev/null +++ b/file_2.txt @@ -0,0 +1,7 @@ ++------+-----------+-------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+-------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 289.8132817745209 | 13.843452453613281 | 16.968669891357422 | 14.779077529907227 | +| 3 | 107356160 | 307.7978012561798 | 13.91087532043457 | 16.95600700378418 | 14.742414474487305 | +| 4 | 107356160 | 262.0493402481079 | 13.913723945617676 | 16.955684661865234 | 14.743617057800293 | ++------+-----------+-------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_3.txt b/file_3.txt new file mode 100644 index 0000000..672862a --- /dev/null +++ b/file_3.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 283.91542887687683 | 12.507635116577148 | 18.553525924682617 | 15.613986015319824 | +| 3 | 107356160 | 287.85402369499207 | 12.571398735046387 | 18.915355682373047 | 15.952068328857422 | +| 4 | 107356160 | 279.67540669441223 | 
12.590620040893555 | 18.968795776367188 | 15.981791496276855 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_4.txt b/file_4.txt new file mode 100644 index 0000000..e2c3608 --- /dev/null +++ b/file_4.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 274.58040595054626 | 12.96647834777832 | 18.44032859802246 | 15.488606452941895 | +| 3 | 107356160 | 277.05651092529297 | 12.934049606323242 | 18.722591400146484 | 15.750381469726562 | +| 4 | 107356160 | 282.69956731796265 | 12.932695388793945 | 18.789344787597656 | 15.76345443725586 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_5.txt b/file_5.txt new file mode 100644 index 0000000..5ef55d7 --- /dev/null +++ b/file_5.txt @@ -0,0 +1,10 @@ +LLAMA - Experiment results + ++------+-----------+--------------------+--------------------+--------------------+ +| Bits | n_params | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+--------------------+ +| 4 | 107356160 | 2.9947431087493896 | 1.011309266090393 | 1.0010896921157837 | ++------+-----------+--------------------+--------------------+--------------------+ +| 4 | 107356160 | 2.9947431087493896 | 1.011309266090393 | 1.0010896921157837 | ++------+-----------+--------------------+--------------------+--------------------+ + diff --git a/fmzip.py b/fmzip.py new file mode 100644 index 0000000..e69de29 diff --git a/gptj.py b/gptj.py new file mode 100644 index 0000000..0ae4900 --- /dev/null +++ b/gptj.py @@ -0,0 +1,551 @@ +import time +import math + +import torch +import torch.nn as nn +import 
transformers + +from gptq import * +from modelutils import * +from quant import * +from prettytable import PrettyTable +import os + +def get_gptj(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import GPTJForCausalLM + # print(model) + model = GPTJForCausalLM.from_pretrained(model, torch_dtype='auto') + model.seqlen = model.config.max_position_embeddings + print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad)) + return model + +@torch.no_grad() +def gptj_sequential(model, dataloader, dev, means=None, stds=None): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + #print(model.transformer.h) + layers = model.transformer.h + #print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + 
gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_sequential_delta(model, delta_model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.transformer.h + delta_layers = delta_model.transformer.h + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + torch.cuda.empty_cache() + 
+ outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(delta_layers)): + layer = delta_layers[i].to(dev) + original_layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = original_outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + # print(model.transformer.h) + layers = model.transformer.h + print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = 
layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache ['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + model.transformer.ln_f = model.transformer.ln_f.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + hidden_states = model.transformer.ln_f(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ 
+ :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + + model.config.use_cache = use_cache + +def gptj_pack(model, quantizers, wbits, groupsize): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant(model, quantizers, wbits, groupsize) + qlayers = find_layers(model, [QuantLinear]) + print('Packing ...') + for name in qlayers: + print(name) + quantizers[name],scale,zero = quantizers[name] + quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu() + qlayers[name].pack(layers[name], scale, zero) + print('Done!') + return model + +def load_quant(model, checkpoint, wbits, groupsize): + from transformers import GPTJConfig, GPTJForCausalLM + config = GPTJConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = GPTJForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits, groupsize) + + print('Loading model ...') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done!') + + return model + +def gptj_multigpu(model, gpus): + model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) + if hasattr(model.model, 'norm') and 
model.model.norm:
+        model.model.norm = model.model.norm.to(gpus[-1])
+    import copy
+    model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
+
+    cache = {'mask': None}
+
+    class MoveModule(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module  # fix: was `self_module = module`, which left self.module unset (AttributeError on next line)
+            self.dev = next(iter(self.module.parameters())).device
+        def forward(self, *inp, **kwargs):
+            inp = list(inp)
+            if inp[0].device != self.dev:
+                inp[0] = inp[0].to(self.dev)
+            if cache['mask'] is None or cache['mask'].device != self.dev:
+                cache['mask'] = kwargs['attention_mask'].to(self.dev)
+            kwargs['attention_mask'] = cache['mask']
+            tmp = self.module(*inp, **kwargs)
+            return tmp
+
+    layers = model.model.layers
+    pergpu = math.ceil(len(layers) / len(gpus))
+    for i in range(len(layers)):
+        layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
+
+    model.gpus = gpus
+
+def benchmark(model, input_ids, check=False):
+    print(model)
+    input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
+    torch.cuda.synchronize()
+
+    cache = {'past': None}
+    def clear_past(i):
+        def tmp(layer, inp, out):
+            if cache['past']:
+                cache['past'][i] = None
+        return tmp
+    for i, layer in enumerate(model.transformer.h):
+        layer.register_forward_hook(clear_past(i))
+
+    print('Benchmarking ...')
+
+    if check:
+        loss = nn.CrossEntropyLoss()
+        tot = 0.
+
+    def sync():
+        if hasattr(model, 'gpus'):
+            for gpu in model.gpus:
+                torch.cuda.synchronize(gpu)
+        else:
+            torch.cuda.synchronize()
+    max_memory = 0
+    with torch.no_grad():
+        attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
+        times = []
+        for i in range(input_ids.numel()):
+            tick = time.time()
+
+            out = model(
+                input_ids[:, i:i+1],
+                past_key_values=cache['past'],
+                attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1))
+            )
+            sync()
+            times.append(time.time() - tick)
+            if check and i != input_ids.numel() - 1:  # fix: guard on `check` — `tot`/`loss` only exist when check=True
+                tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
+            cache['past'] = list(out.past_key_values)
+            del out
+        sync()
+        import numpy as np
+        print('Median:', np.median(times))
+        if check: print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())  # fix: `tot` is undefined unless check=True
+        print('max memory(MiB):',max_memory)
+
+def main(args):
+    print(args)
+    if args.load:
+        model = load_quant(args.model, args.load, args.wbits, args.groupsize)  # fix: `load_quant3` is undefined; use load_quant's real signature
+    else:
+        model = get_gptj(args.model)
+        model.eval()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    if args.wbits < 16 and not args.nearest:
+        print("Quantizing ...")
+        tick = time.time(); quantizers = gptj_sequential(model, dataloader, DEV)  # fix: start the timer — `tick` was used below without ever being set
+        print(time.time() - tick)
+
+    if args.benchmark:
+        model = model.to(DEV)
+    if args.benchmark:
+        input_ids = next(iter(dataloader))[0][:, :args.benchmark]
+        benchmark(model, input_ids, check=args.check)
+    if args.load:
+        exit()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+    print(args.dataset)  # fix: bare `dataset` was an undefined name in this scope
+    ppl = gptj_eval(model, testloader, DEV)
+
+    if args.save:
+        gptj_pack(model, quantizers, args.wbits, args.groupsize)
+        torch.save(model.state_dict(), args.save)
+
+    return ppl
+
+if __name__ == '__main__':
+    import argparse
+    from datautils import *
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        '--model', type=str, default='EleutherAI/gpt-j-6b',
+        help='GPT-J model to
load; pass `EleutherAI/gpt-j-6b`.' + ) + parser.add_argument( + '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--save', type=str, default='', + help='Save the quantized GPT-J model under this name.' + ) + parser.add_argument( + '--save_safetensors', type=str, default='', + help='Save the quantized GPT-J model as a `.safetensors` ckpt' + ) + parser.add_argument( + '--load', type=str, default='', + help='Load the quantized GPT-J model' + ) + parser.add_argument( + '--benchmark', type=int, default=0, + help='Number of tokens to use for benchmarking.' + ) + parser.add_argument( + '--check', action='store_true', + help='Whether to compute perpexity during benchmarking for verification.' 
+    )
+    parser.add_argument(
+        '--benchmark_results', type=str, default='',
+        help='store benchmark results'
+    )
+
+    args = parser.parse_args()
+    results = PrettyTable()
+    results.field_names = ['Bits', 'n_params', 'Time', 'wiki']
+    for n_bits in [16]:
+        ppls = []
+        for dataset in ['wikitext2']:
+            args.dataset = dataset
+            args.wbits = n_bits
+            args.save = 'gptj-%s-wbits%d.pt' % (dataset, n_bits)
+            ppl = main(args)
+            ppls.append(ppl)
+        results.add_row([n_bits, None, None, ppls[0]])  # fix: row must match the 4 field_names; n_params/comp_time/ppls[1:] were undefined here
+    print(results)
+    with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f:
+        f.write(str(results))
+    print('finished.')
\ No newline at end of file
diff --git a/gptj_delta.py b/gptj_delta.py
new file mode 100644
index 0000000..490e3e5
--- /dev/null
+++ b/gptj_delta.py
@@ -0,0 +1,589 @@
+import time
+import math
+
+import torch
+import torch.nn as nn
+import transformers
+
+from gptq import *
+from modelutils import *
+from quant import *
+from prettytable import PrettyTable
+import os
+import copy
+
+def get_gptj(model):
+    import torch
+    def skip(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    from transformers import GPTJForCausalLM
+    model = GPTJForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
+    model.seqlen = model.config.max_position_embeddings
+    print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad))
+    return model
+
+@torch.no_grad()
+def gptj_sequential(model, dataloader, dev, means=None, stds=None):
+    print('Starting ...')
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    #print(model.transformer.h)
+    layers = model.transformer.h
+    print(layers)
+
+    model.transformer.wte = model.transformer.wte.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+ cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_sequential_delta(model, delta_model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = 
False + layers = model.transformer.h + delta_layers = delta_model.transformer.h + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(delta_layers)): + layer = delta_layers[i].to(dev) + original_layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, 
actorder=args.act_order) + quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = original_outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + # print(model.transformer.h) + layers = model.transformer.h + print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache ['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev) + try: + print(batch.shape) + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name 
in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + model.transformer.ln_f = model.transformer.ln_f.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + hidden_states = model.transformer.ln_f(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + + model.config.use_cache = use_cache + +def gptj_pack(model, quantizers, wbits, groupsize): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant(model, quantizers, wbits, groupsize) + qlayers = find_layers(model, [QuantLinear]) + print('Packing ...') + for name in qlayers: + print(name) + quantizers[name],scale,zero = quantizers[name] + quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu() + qlayers[name].pack(layers[name], scale, zero) + print('Done!') + return model + +def load_quant(model, checkpoint, wbits, groupsize): + from transformers import GPTJConfig, GPTJForCausalLM + config = GPTJConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + 
torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = GPTJForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits, groupsize) + + print('Loading model ...') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done!') + + return model + +def benchmark(model, input_ids, check=False): + input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) + torch.cuda.synchronize() + + cache = {'past': None} + def clear_past(i): + def tmp(layer, inp, out): + if cache['past']: + cache['past'][i] = None + return tmp + for i, layer in enumerate(model.transformer.h): + layer.register_forward_hook(clear_past(i)) + + print('Benchmarking ...') + + if check: + loss = nn.CrossEntropyLoss() + tot = 0. 
+
+    def sync():
+        if hasattr(model, 'gpus'):
+            for gpu in model.gpus:
+                torch.cuda.synchronize(gpu)
+        else:
+            torch.cuda.synchronize()
+    max_memory = 0
+    with torch.no_grad():
+        attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
+        times = []
+        for i in range(input_ids.numel()):
+            tick = time.time()
+
+            out = model(
+                input_ids[:, i:i+1],
+                past_key_values=cache['past'],
+                attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1))
+            )
+            sync()
+            times.append(time.time() - tick)
+            print(i, times[-1])
+            max_memory = max(max_memory, torch.cuda.memory_allocated() / 1024 / 1024)  # fix: stray `torch,` arg made max() compare a module (TypeError)
+            if check and i != input_ids.numel() - 1:
+                tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
+            cache['past'] = list(out.past_key_values)  # fix: attribute is `past_key_values`, not `past_keys_values`
+            del out
+        sync()
+        import numpy as np
+        print('Median:', np.median(times))
+        if check:
+            print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
+        print('max memory(MiB):',max_memory)
+
+
+
+def main(args):
+    print(args)
+    num_params_saved_lr = 0
+    num_params = 0
+    if args.load:
+        model = load_quant(args.model, args.load, args.wbits, args.groupsize)  # fix: `load_quant3` is undefined; use load_quant's real signature
+    else:
+        if args.delta and args.wbits<16:
+            model = get_gptj(args.model)
+            model.eval()
+            base_model = get_gptj(args.base_model)
+            base_model.eval()
+            dataloader, testloader = get_loaders(
+                args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+            )
+            original_finetuned_model = copy.deepcopy(model)
+            for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+                finetuned_p.data = (finetuned_p.data-base_p.data).clone()
+        else:
+            model = get_gptj(args.model)
+            model.eval()
+
+            dataloader, testloader = get_loaders(
+                args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+            )
+
+    if args.wbits < 16 and not args.nearest:
+        if args.delta:
+            tick = time.time()
+            quantizers = gptj_sequential_delta(original_finetuned_model, model, dataloader, DEV)
+
+            comp_time = time.time()-tick
+        else:
+            quantizers =
gptj_sequential(model, dataloader, DEV) + + if args.delta and args.wbits<16: + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + if args.sparsify_hard_threshold: + print('Hard Thresholding...') + W = finetuned_p.data + finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) + if args.rank>0 and len(finetuned_p.shape) == 2: + print('Finding Low Rank Approximation...') + A = finetuned_p.data.float() + U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5) + A = U @ torch.diag_embed(S) @ Vh.T + finetuned_p.data = A.half() + num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) + num_params += torch.numel(finetuned_p.data) + finetuned_p.data = (base_p.data + finetuned_p.data).clone() + + if args.benchmark: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, :args.benchmark] + benchmark(model, input_ids, check=args.check) + if args.load: + exit() + + dataset = args.dataset + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + ppl = gptj_eval(model, testloader, DEV) + print(ppl) + + if args.rank > 0: + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print("Number of params without low rank ", n_params) + print("Number of params with low rank", n_params - num_params_saved_lr) + if args.save: + gptj_pack(model, quantizers, args.wbits, args.groupsize) + torch.save(model.state_dict(), args.save) + return ppl + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + '--model', type=str, default='togethercomputer/GPT-JT-6B-v1', + help='GPT-J finetuned model to load; pass `togethercomputer/GPT-JT-6B-v1`.' + ) + parser.add_argument( + '--base_model', type=str, default='EleutherAI/gpt-j-6b', + help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.' 
+ ) + parser.add_argument( + '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--save', type=str, default='', + help='Save the quantized GPT-J model under this name.' + ) + parser.add_argument( + '--save_safetensors', type=str, default='', + help='Save the quantized GPT-J model as a `.safetensors` ckpt' + ) + parser.add_argument( + '--load', type=str, default='', + help='Load the quantized GPT-J model' + ) + parser.add_argument( + '--benchmark', type=int, default=0, + help='Number of tokens to use for benchmarking.' + ) + parser.add_argument( + '--check', action='store_true', + help='Whether to compute perpexity during benchmarking for verification.' 
+ ) + parser.add_argument( + '--delta', action='store_true', + help='Whether to use delta compression' + ) + parser.add_argument( + '--sparsify_hard_threshold', action='store_true', + help='Whether to add sparsity' + ) + parser.add_argument( + '--fraction_of_zero', type=float, default=0.99, + help='Sparsity ratio' + ) + parser.add_argument( + '--benchmark_results', type=str, default='', + help='store benchmark results' + ) + parser.add_argument( + '--sym', action='store_true', default=True, + help='Whether to use symmetric quantization' + ) + parser.add_argument( + '--trits', action='store_true', default=False, + help='Whether to use trits' + ) + parser.add_argument('--act_order', type=str, default=False) + + args = parser.parse_args() + + results = PrettyTable() + results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] + for n_bits in [4, 3, 2]: + ppls = [] + for dataset in ['wikitext2', 'ptb', 'c4']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits) + ppl = main(args) + ppls.append(ppl) + results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) + print(results) + with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: + f.write(str(results)) + print('finished.') \ No newline at end of file diff --git a/gptq.py b/gptq.py index 2477cac..e60f1fc 100644 --- a/gptq.py +++ b/gptq.py @@ -126,14 +126,14 @@ def fasterquant( if DEBUG: self.layer.weight.data[:, :i2] = Q[:, :i2] self.layer.weight.data[:, i2:] = W[:, i2:] - print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - print(torch.sum(Losses)) + #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + #print(torch.sum(Losses)) torch.cuda.synchronize() total_time = time.time() - tick - # print('time %.2f' % total_time) + # #print('time %.2f' % total_time) error = torch.sum(Losses).item() - # print('error', error) + # #print('error', error) if actorder: invperm = torch.argsort(perm) @@ -143,8 +143,8 @@ def 
fasterquant( Q = Q.t() self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) if DEBUG: - print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - + #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + pass def free(self): if DEBUG: self.inp1 = None @@ -152,4 +152,4 @@ def free(self): self.H = None self.Losses = None self.Trace = None - torch.cuda.empty_cache() + torch.cuda.empty_cache() \ No newline at end of file diff --git a/jt_datautils/cot.py b/jt_datautils/cot.py new file mode 100644 index 0000000..15796e9 --- /dev/null +++ b/jt_datautils/cot.py @@ -0,0 +1,105 @@ +import os +import re +import torch +import json +from torch.utils.data import IterableDataset, DataLoader +from itertools import cycle, islice +import random +from datasets import Dataset +from datasets import load_dataset, load_from_disk +#from comm.comm_utils import * + + + +class StreamDataset(IterableDataset): + def __init__(self, cot_data_path, tokenizer, seq_length=1024): + + self.cot_data_path = cot_data_path + + with open(cot_data_path) as f: + self.cot_data = json.load(f) + + self.buffer_tokens = [] + + self.tokenizer = tokenizer + self.seq_length = seq_length + + self.it = None + + def state_dict(self): + return {} + + def load_state_dict(self, state_dict): + pass + + def get_sequence_from_cot(self): + + while True: + + keys = list(self.cot_data.keys()) + random.shuffle(keys) + + input_ids = [] + + for k in keys: + + v = self.cot_data[k] + + input_ids += self.tokenizer(v + '\n\n')['input_ids'] + if len(input_ids) < self.seq_length: + continue + # input_ids += [self.tokenizer.eos_token_id]*(self.seq_length - len(input_ids)) + + input_ids = input_ids[:self.seq_length] + input_ids = torch.tensor(input_ids).long() + + yield input_ids + + input_ids = [] + + def get_sequence(self): + + it_cot = cycle(self.get_sequence_from_cot()) + + while True: + + input_ids = next(it_cot) + + + yield { + 'input_ids': input_ids, + } + + + def get_stream(self): 
+ return cycle(self.get_sequence()) + + def __iter__(self): + if self.it is None: + self.it = self.get_stream() + return self.it + + + +def get_cot_train_data_loader(args, tokenizer, num_workers=0, state_dict=None): + + stream_dataset = StreamDataset( + './data/mmlu-cot.json', + tokenizer=tokenizer, seq_length=args.seq_length + ) + + if state_dict is not None: + stream_dataset.load_state_dict(state_dict) + + train_data_loader = torch.utils.data.DataLoader(stream_dataset, + batch_size=args.batch_size * args.data_group_size, + shuffle=False, + num_workers=num_workers, + pin_memory=True, + collate_fn=None) + return train_data_loader + +def get_cot_ds(data_path, tokenizer, seq_length): + return StreamDataset(os.path.join(data_path,'mmlu-cot.json'), + tokenizer=tokenizer, seq_length=seq_length + ) \ No newline at end of file diff --git a/jt_datautils/pile.py b/jt_datautils/pile.py new file mode 100644 index 0000000..fbddca5 --- /dev/null +++ b/jt_datautils/pile.py @@ -0,0 +1,77 @@ + +import os +import re +import torch +from torch.utils.data import IterableDataset, DataLoader +from itertools import cycle, islice +import random +from datasets import Dataset +from datasets import load_dataset, load_from_disk +# from comm.comm_utils import * + + +class StreamDataset(IterableDataset): + default_doc_separator = '' + def __init__(self, data, tokenizer, seq_length=1024, doc_separator=None): + self.data = data + self.tokenizer = tokenizer + self.seq_length = seq_length + self.doc_separator = doc_separator or StreamDataset.default_doc_separator + self.it = None + self.iter_count = 0 + self.buffer_tokens = [] + + def state_dict(self): + return { + 'iter_count': self.iter_count, + 'buffer_tokens': self.buffer_tokens, + } + + def load_state_dict(self, state_dict): + self.iter_count = state_dict['iter_count'] + self.buffer_tokens = state_dict['buffer_tokens'] + self.data = self.data.skip(self.iter_count) + + def get_sequence(self): + buffer_tokens = self.buffer_tokens + for x in 
self.data: + self.iter_count += 1 + curr_tokens = self.tokenizer(self.doc_separator + x['text'])['input_ids'] + buffer_tokens += curr_tokens + while len(buffer_tokens) >= self.seq_length: + tokens = buffer_tokens[:self.seq_length] + buffer_tokens = buffer_tokens[self.seq_length:] + input_ids = torch.tensor(tokens) + self.buffer_tokens = buffer_tokens # update for restore + yield { + 'input_ids': input_ids, + } + + def get_stream(self): + return cycle(self.get_sequence()) + + def __iter__(self): + if self.it is None: + self.it = self.get_stream() + return self.it + + +def get_pile_train_data_loader(args, tokenizer, num_workers=0, state_dict=None): + + data = load_dataset('the_pile', split="train", streaming=True).shuffle(buffer_size=10_000, seed=args.seed) + stream_dataset = StreamDataset(data, tokenizer, args.seq_length) + + if state_dict is not None: + stream_dataset.load_state_dict(state_dict) + + train_data_loader = torch.utils.data.DataLoader(stream_dataset, + batch_size=args.batch_size * args.data_group_size, + shuffle=False, + num_workers=num_workers, + pin_memory=True, + collate_fn=None) + return train_data_loader + +def get_pile_ds(tokenizer, seq_length): + data = load_dataset("the_pile", split="train", streaming=True) + return StreamDataset(data, tokenizer, seq_length) \ No newline at end of file diff --git a/llama.py b/llama.py new file mode 100644 index 0000000..f1591d9 --- /dev/null +++ b/llama.py @@ -0,0 +1,302 @@ +import time + +import torch +import torch.nn as nn + +from gptq import * +from modelutils import * +from quant import * + + +def get_llama(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto') + model.seqlen = 2048 + return model + +@torch.no_grad() +def llama_sequential(model, dataloader, dev): + print('Starting ...') + + 
use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + model.model.norm = model.model.norm.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + model.model.norm = model.model.norm.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + full = find_layers(layer) + + if args.true_sequential: + sequential = [ + ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], + ['self_attn.o_proj'], + ['mlp.up_proj', 'mlp.gate_proj'], + ['mlp.down_proj'] + ] + else: + sequential = [list(full.keys())] + + for names in sequential: + subset = {n: full[n] for n in names} + + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + 
handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def llama_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + torch.cuda.empty_cache() + + outs = 
torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.norm is not None: + model.model.norm = model.model.norm.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.norm is not None: + hidden_states = model.model.norm(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + model.config.use_cache = use_cache + + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + 'model', type=str, + help='LlaMa model to load; pass location of hugginface converted checkpoint.' + ) + parser.add_argument( + 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' 
+ ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--sym', action='store_true', + help='Whether to perform symmetric quantization.' + ) + parser.add_argument( + '--new-eval', action='store_true', + help='Whether to use the new PTB and C4 eval.' + ) + parser.add_argument( + '--act-order', action='store_true', + help='Whether to apply the activation order GPTQ heuristic' + ) + parser.add_argument( + '--true-sequential', action='store_true', + help='Whether to run in true sequential model.' 
+ ) + + args = parser.parse_args() + + model = get_llama(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if args.wbits < 16 and not args.nearest: + tick = time.time() + quantizers = llama_sequential(model, dataloader, DEV) + print(time.time() - tick) + + datasets = ['wikitext2', 'ptb', 'c4'] + if args.new_eval: + datasets = ['wikitext2', 'ptb-new', 'c4-new'] + for dataset in datasets: + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + llama_eval(model, testloader, DEV) \ No newline at end of file diff --git a/llama_delta.py b/llama_delta.py new file mode 100644 index 0000000..d8fa99a --- /dev/null +++ b/llama_delta.py @@ -0,0 +1,441 @@ +import time + +import torch +import torch.nn as nn + +from gptq import * +from modelutils import * +from quant import * +import copy +import os + +def hard_threshold(x, fraction_of_zero=0.1): + y, _ = torch.sort(x.view(-1).abs().clone()) + num_params = torch.numel(x) + thresh_index = int(num_params * fraction_of_zero) + threshold = y[thresh_index] + mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) + return mask * x + +def get_llama(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto') + model.seqlen = 2048 + return model + +@torch.no_grad() +def llama_sequential_delta(model, delta_model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + + layers = model.model.layers + delta_layers = delta_model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + model.model.norm = model.model.norm.to(dev) + layers[0] = 
layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + model.model.norm = model.model.norm.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + print('Ready.') + + quantizers = {} + for i in range(len(delta_layers)): + layer = layers[i].to(dev) + full = find_layers(layer) + + if args.true_sequential: + sequential = [ + ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], + ['self_attn.o_proj'], + ['mlp.up_proj', 'mlp.gate_proj'], + ['mlp.down_proj'] + ] + else: + sequential = [list(full.keys())] + + for names in sequential: + subset = {n: full[n] for n in names} + + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), 
attention_mask=attention_mask, position_ids=position_ids)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def llama_sequential(model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + model.model.norm = model.model.norm.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + model.model.norm = model.model.norm.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + 
layer = layers[i].to(dev) + full = find_layers(layer) + + if args.true_sequential: + sequential = [ + ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], + ['self_attn.o_proj'], + ['mlp.up_proj', 'mlp.gate_proj'], + ['mlp.down_proj'] + ] + else: + sequential = [list(full.keys())] + + for names in sequential: + subset = {n: full[n] for n in names} + + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def llama_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, 
device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.norm is not None: + model.model.norm = model.model.norm.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.norm is not None: + hidden_states = model.model.norm(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + 
loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + model.config.use_cache = use_cache + + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + '--model', type=str, default='ausboss/llama-13b-supercot', + help='LlaMa model to load; pass location of hugginface converted checkpoint.' + ) + parser.add_argument( + '--base-model', type=str, default='yahma/llama-13b-hf', + help='base LLAMA model to load' + ) + parser.add_argument( + '--dataset', type=str, default='wikitext2', choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--sym', action='store_true', + help='Whether to perform symmetric quantization.' + ) + parser.add_argument( + '--new-eval', action='store_true', + help='Whether to use the new PTB and C4 eval.' 
+ ) + parser.add_argument( + '--act-order', action='store_true', + help='Whether to apply the activation order GPTQ heuristic' + ) + parser.add_argument( + '--true-sequential', action='store_true', + help='Whether to run in true sequential model.' + ) + parser.add_argument( + '--sparsify_hard_threshold', action='store_true', + help='Whether to add sparsity' + ) + parser.add_argument( + '--fraction_of_zero', type=float, default=0.99, + help='Sparsity ratio' + ) + args = parser.parse_args() + + base_model = get_llama(args.base_model) + model = get_llama(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + original_finetuned_model = copy.deepcopy(model) + _ = llama_sequential_delta(original_finetuned_model, model, dataloader, DEV) + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + finetuned_p.data = (finetuned_p.data-base_p.data).clone() + + + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + if args.sparsify_hard_threshold: + print('Hard Thresholding...') + W = finetuned_p.data + finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) + finetuned_p.data = (base_p.data + finetuned_p.data).clone() + + datasets = ['wikitext2', 'ptb', 'c4'] + if args.new_eval: + datasets = ['wikitext2', 'ptb-new', 'c4-new'] + for dataset in datasets: + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + llama_eval(model, testloader, DEV) \ No newline at end of file diff --git a/modelutils.py b/modelutils.py index c93410d..5b36877 100644 --- a/modelutils.py +++ b/modelutils.py @@ -1,8 +1,10 @@ import torch import torch.nn as nn -from transformers import OPTForCausalLM + + DEV = torch.device('cuda:0') + def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): if type(module) in layers: return {name: module} @@ 
-11,16 +13,4 @@ def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): res.update(find_layers( child, layers=layers, name=name + '.' + name1 if name != '' else name1 )) - return res - -def get_opt(model): - def skip(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - - # model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto') - model = OPTForCausalLM.from_pretrained(model, torch_dtype=torch.float16) - model.seqlen = model.config.max_position_embeddings - return model \ No newline at end of file + return res \ No newline at end of file diff --git a/opt.py b/opt.py index edf40bc..77f0f9b 100644 --- a/opt.py +++ b/opt.py @@ -6,7 +6,7 @@ from gptq import * from modelutils import * from quant import quantize, Quantizer, Quant3Linear, make_quant3 - +from prettytable import PrettyTable def get_opt(model): import torch def skip(*args, **kwargs): @@ -223,6 +223,7 @@ def forward(self, inp, **kwargs): print(ppl.item()) model.config.use_cache = use_cache + return ppl.item() # TODO: perform packing on GPU def opt_pack3(model, quantizers): @@ -351,6 +352,48 @@ def sync(): print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) +def main(args): + if args.load: + model = load_quant3(args.model, args.load) + else: + model = get_opt(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if args.wbits < 16 and not args.nearest: + tick = time.time() + quantizers = opt_sequential(model, dataloader, DEV) + print(time.time() - tick) + + if args.benchmark: + gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] + if len(gpus) > 1: + opt_multigpu(model, gpus) + else: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, :args.benchmark] + benchmark(model, input_ids, check=args.check) + if args.load: + exit() + + + 
dataloader, testloader = get_loaders( + args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + ppl = opt_eval(model, testloader, DEV) + + if args.save: + opt_pack3(model, quantizers) + torch.save(model.state_dict(), args.save) + + return ppl + + if __name__ == '__main__': import argparse from datautils import * @@ -358,11 +401,11 @@ def sync(): parser = argparse.ArgumentParser() parser.add_argument( - 'model', type=str, + '--model', type=str, default='lnair/opt-1.3b-wikitext2', help='OPT model to load; pass `facebook/opt-X`.' ) parser.add_argument( - 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], + '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], help='Where to extract calibration data from.' ) parser.add_argument( @@ -427,44 +470,17 @@ def sync(): ) args = parser.parse_args() - - if args.load: - model = load_quant3(args.model, args.load) - else: - model = get_opt(args.model) - model.eval() - - dataloader, testloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - - if args.wbits < 16 and not args.nearest: - tick = time.time() - quantizers = opt_sequential(model, dataloader, DEV) - print(time.time() - tick) - - if args.benchmark: - gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] - if len(gpus) > 1: - opt_multigpu(model, gpus) - else: - model = model.to(DEV) - if args.benchmark: - input_ids = next(iter(dataloader))[0][:, :args.benchmark] - benchmark(model, input_ids, check=args.check) - if args.load: - exit() - - datasets = ['wikitext2', 'ptb', 'c4'] - if args.new_eval: - datasets = ['wikitext2', 'ptb-new', 'c4-new'] - for dataset in datasets: - dataloader, testloader = get_loaders( - dataset, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - print(dataset) - opt_eval(model, testloader, DEV) - - if args.save: - opt_pack3(model, quantizers) - torch.save(model.state_dict(), args.save) + + results = 
PrettyTable() + results.field_names = ['Bits', 'wiki', 'ptb', 'c4'] + for n_bits in [4, 3, 2]: + ppls = [] + for dataset in ['wikitext2', 'ptb', 'c4']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'opt-no-delta-1.3b-%s-wbits%d.pt' % (dataset, n_bits) + ppl = main(args) + ppls.append(ppl) + results.add_row([n_bits, ppls[0], ppls[1], ppls[2]]) + print(results) + print('finished.') diff --git a/opt_delta.py b/opt_delta.py index f9f6bbf..481cc9a 100644 --- a/opt_delta.py +++ b/opt_delta.py @@ -7,8 +7,9 @@ from gptq import * from modelutils import * from quant import * - +from prettytable import PrettyTable import copy +import os #from prettytable import PrettyTable def get_opt(model): @@ -539,16 +540,17 @@ def main(args): dataset, seed=args.seed, model=args.model, seqlen=model.seqlen ) - ppl = opt_eval(model, testloader, DEV) - print(ppl) + # ppl = opt_eval(model, testloader, DEV) + # print(ppl) if args.rank > 0: - print("Number of params without low rank ", num_params) - print("Number of params with low rank", num_params - num_params_saved_lr) + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print("Number of params without low rank ", n_params) + print("Number of params with low rank", n_params - num_params_saved_lr) if args.save: opt_pack3(model, quantizers) torch.save(model.state_dict(), args.save) - + return ppl, n_params, comp_time if __name__ == '__main__': import argparse @@ -601,13 +603,17 @@ def main(args): help='Whether to perform symmetric quantization.' ) parser.add_argument( - '--save', type=str, default='', + '--save', type=str, default='opt-1.3b-wikitext2-wbits2.pt', help='Save quantized checkpoint under this name.' ) parser.add_argument( '--load', type=str, default='', help='Load quantized model.' ) + parser.add_argument( + '--benchmark_results', type=str, default='', + help='store benchmark results' + ) parser.add_argument( '--benchmark', type=int, default=0, help='Number of tokens to use for benchmarking.' 
@@ -647,8 +653,18 @@ def main(args): ) args = parser.parse_args() - #results = PrettyTable() - - main(args) - + results = PrettyTable() + results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] + for n_bits in [2]: + ppls = [] + for dataset in ['wikitext2']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'opt-not-delta1.3b-%s-wbits%d.pt' % (dataset, n_bits) + ppl, n_params, comp_time = main(args) + # ppls.append(ppl) + # results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) + # print(results) + # with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: + # f.write(str(results)) print('finished.') diff --git a/quant.py b/quant.py index f8cc1b7..f57d6d7 100644 --- a/quant.py +++ b/quant.py @@ -131,8 +131,8 @@ def ready(self): try: import quant_cuda except: - print('CUDA extension not installed.') - + #print('CUDA extension not installed.') + pass # Assumes layer is perfectly divisible into 1024 * 1024 blocks class Quant3Linear(nn.Module): @@ -356,4 +356,4 @@ def make_quant(module, names, bits, groupsize, name=''): module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features) ) for name1, child in module.named_children(): - make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1) + make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7417000..321525d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ -transformers -loguru -datasets \ No newline at end of file +safetensors==0.3.0 +datasets==1.17.0 +sentencepiece +transformers==4.21.2 +ninja \ No newline at end of file diff --git a/src/fmzip b/src/fmzip new file mode 160000 index 0000000..b41e785 --- /dev/null +++ b/src/fmzip @@ -0,0 +1 @@ +Subproject commit b41e7856f092c80286577b2eb5e1294a764099d6