diff --git a/.gitignore b/.gitignore index dbd6338..83ed498 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,11 @@ *.pyc +*.pt build/ dist/ .idea *.egg-info/ *.safetensors -outputs/ \ No newline at end of file +outputs/ +.cache/ +data/ +results/ \ No newline at end of file diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000..acf2ea0 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,2 @@ +wbits = [2, 3, 4, 8] +sparsity = [0.0, 0.5, 0.9] \ No newline at end of file diff --git a/datautils.py b/datautils.py index 045121a..a269a22 100644 --- a/datautils.py +++ b/datautils.py @@ -1,10 +1,12 @@ import numpy as np import torch + def set_seed(seed): np.random.seed(seed) torch.random.manual_seed(seed) + def get_wikitext2(nsamples, seed, seqlen, model): from datasets import load_dataset traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train') @@ -97,66 +99,6 @@ def __init__(self, input_ids): return trainloader, valenc -def get_ptb_new(nsamples, seed, seqlen, model): - from datasets import load_dataset - traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') - testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt') - testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt') - - import random - random.seed(seed) - trainloader = [] - for _ in range(nsamples): - i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = trainenc.input_ids[:, i:j] - tar = inp.clone() - tar[:, :-1] = -100 - trainloader.append((inp, tar)) - return trainloader, testenc - -def get_c4_new(nsamples, seed, seqlen, model): - from datasets import load_dataset - traindata = load_dataset( - 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train' - ) - valdata = 
load_dataset( - 'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation' - ) - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - - import random - random.seed(seed) - trainloader = [] - for _ in range(nsamples): - while True: - i = random.randint(0, len(traindata) - 1) - trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') - if trainenc.input_ids.shape[1] >= seqlen: - break - i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = trainenc.input_ids[:, i:j] - tar = inp.clone() - tar[:, :-1] = -100 - trainloader.append((inp, tar)) - - valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt') - valenc = valenc.input_ids[:, :(256 * seqlen)] - - class TokenizerWrapper: - def __init__(self, input_ids): - self.input_ids = input_ids - valenc = TokenizerWrapper(valenc) - - return trainloader, valenc - def get_loaders( name, nsamples=128, seed=0, seqlen=2048, model='' @@ -164,10 +106,6 @@ def get_loaders( if 'wikitext2' in name: return get_wikitext2(nsamples, seed, seqlen, model) if 'ptb' in name: - if 'new' in name: - return get_ptb_new(nsamples, seed, seqlen, model) return get_ptb(nsamples, seed, seqlen, model) if 'c4' in name: - if 'new' in name: - return get_c4_new(nsamples, seed, seqlen, model) - return get_c4(nsamples, seed, seqlen, model) + return get_c4(nsamples, seed, seqlen, model) \ No newline at end of file diff --git a/delta.txt b/delta.txt new file mode 100644 index 0000000..a1c817b --- /dev/null +++ b/delta.txt @@ -0,0 +1,5 @@ ++------+-----------+------+ +| Bits | wikitext2 | ptb | ++------+-----------+------+ +| 4 | None | None | ++------+-----------+------+ \ No newline at end of file diff --git a/delta_2_bits.txt b/delta_2_bits.txt new file mode 100644 index 0000000..f7647d2 --- /dev/null +++ b/delta_2_bits.txt @@ -0,0 +1,697 @@ +Starting ... +Ready. 
+0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... 
+8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... 
+16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... 
+24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... 
+32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +wikitext2 +Evaluating ... 
+0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.183259963989258 +Downloading and preparing dataset ptb_text_only/penn_treebank (download: 5.68 MiB, generated: 5.72 MiB, post-processed: Unknown size, total: 11.40 MiB) to /root/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f... +Dataset ptb_text_only downloaded and prepared to /root/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f. Subsequent calls will reuse this data. +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.479209899902344 +Downloading and preparing dataset json/allenai--c4 to /root/.cache/huggingface/datasets/json/allenai--c4-6fbe877195f42de5/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde... +Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/allenai--c4-6fbe877195f42de5/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data. +Downloading and preparing dataset json/allenai--c4 to /root/.cache/huggingface/datasets/json/allenai--c4-efc3d4f4606f44bd/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde... +Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/allenai--c4-efc3d4f4606f44bd/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data. +c4-new +Evaluating ... 
+0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.7892680168151855 diff --git a/delta_2bits_sparse_09.txt b/delta_2bits_sparse_09.txt new file mode 100644 index 0000000..8089e8e --- /dev/null +++ b/delta_2bits_sparse_09.txt @@ -0,0 +1,1054 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... 
+6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... 
+14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... 
+22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... 
+30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... 
+38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.098198890686035 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.15268325805664 +c4-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.6517863273620605 diff --git a/delta_2bits_sparse_099.txt b/delta_2bits_sparse_099.txt new file mode 100644 index 0000000..a7ba5b7 --- /dev/null +++ b/delta_2bits_sparse_099.txt @@ -0,0 +1,1054 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... 
+3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... 
+11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... 
+19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... 
+27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... 
+35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.087564945220947 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.305665969848633 +c4-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.614907741546631 diff --git a/delta_4bits.txt b/delta_4bits.txt new file mode 100644 index 0000000..4cecc92 --- /dev/null +++ b/delta_4bits.txt @@ -0,0 +1 @@ +Starting ... 
diff --git a/delta_4bits_sparse_09.txt b/delta_4bits_sparse_09.txt new file mode 100644 index 0000000..8089e8e --- /dev/null +++ b/delta_4bits_sparse_09.txt @@ -0,0 +1,1054 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... 
+7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... 
+15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... 
+23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... 
+31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... 
+39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.098198890686035 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.15268325805664 +c4-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.6517863273620605 diff --git a/delta_4bits_sparse_099.txt b/delta_4bits_sparse_099.txt new file mode 100644 index 0000000..a7ba5b7 --- /dev/null +++ b/delta_4bits_sparse_099.txt @@ -0,0 +1,1054 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... 
+4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... 
+12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... 
+20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... 
+28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... 
+36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... 
+Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.087564945220947 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.305665969848633 +c4-new +Evaluating ... 
+0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.614907741546631 diff --git a/delta_sparse_09.txt b/delta_sparse_09.txt new file mode 100644 index 0000000..a1c817b --- /dev/null +++ b/delta_sparse_09.txt @@ -0,0 +1,5 @@ ++------+-----------+------+ +| Bits | wikitext2 | ptb | ++------+-----------+------+ +| 4 | None | None | ++------+-----------+------+ \ No newline at end of file diff --git a/delta_sparse_099.txt b/delta_sparse_099.txt new file mode 100644 index 0000000..a1c817b --- /dev/null +++ b/delta_sparse_099.txt @@ -0,0 +1,5 @@ ++------+-----------+------+ +| Bits | wikitext2 | ptb | ++------+-----------+------+ +| 4 | None | None | ++------+-----------+------+ \ No newline at end of file diff --git a/evaluation.sh b/evaluation.sh new file mode 100755 index 0000000..d995499 --- /dev/null +++ b/evaluation.sh @@ -0,0 +1,66 @@ +CUDA_VISIBLE_DEVICES=7 python3 -u llama_delta.py \ + --wbits 4 \ + --true-sequential --act-order --new-eval\ + --groupsize 1024 > delta_4bits.txt & + +CUDA_VISIBLE_DEVICES=2 python3 -u llama_delta.py \ + --wbits 2 \ + --true-sequential --act-order --new-eval\ + --groupsize 1024 > delta_2_bits.txt & + +CUDA_VISIBLE_DEVICES=6 python3 -u llama_delta.py \ + --groupsize 1024 \ + --wbits 4 \ + --true-sequential --act-order --new-eval\ + --sparsify_hard_threshold \ + --fraction_of_zero 0.9 > delta_4bits_sparse_09.txt & + +CUDA_VISIBLE_DEVICES=5 python3 -u llama_delta.py \ + --groupsize 1024 \ + --wbits 2 \ + --true-sequential --act-order --new-eval\ + --sparsify_hard_threshold \ + --fraction_of_zero 0.9 > delta_2bits_sparse_09.txt & + +CUDA_VISIBLE_DEVICES=4 python3 -u llama_delta.py \ + --groupsize 1024 \ + --wbits 4 \ + --true-sequential --act-order --new-eval\ + --sparsify_hard_threshold \ + --fraction_of_zero 0.99 > delta_4bits_sparse_099.txt & + +CUDA_VISIBLE_DEVICES=3 python3 -u llama_delta.py \ + --groupsize 1024 \ + --wbits 2 \ + 
--true-sequential --act-order --new-eval\ + --sparsify_hard_threshold \ + --fraction_of_zero 0.99 > delta_2bits_sparse_099.txt & + +# & +# CUDA_VISIBLE_DEVICES=4 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 16 \ +# --benchmark_results "file_4.txt" \ +#& +# CUDA_VISIBLE_DEVICES=5 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 64 \ +# --benchmark_results "file_5.txt" \ +# & +# CUDA_VISIBLE_DEVICES=6 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 32 \ +# --sparsify_hard_threshold \ +# --fraction_of_zero 0.9 \ +# --benchmark_results "file_6.txt" \ +# & +# CUDA_VISIBLE_DEVICES=7 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 32 \ +# --sparsify_hard_threshold \ +# --fraction_of_zero 0.99 \ +# --benchmark_results "file_7.txt" \ No newline at end of file diff --git a/file_0.txt b/file_0.txt new file mode 100644 index 0000000..7fd8537 --- /dev/null +++ b/file_0.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+-------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+-------------------+--------------------+ +| 2 | 107356160 | 315.56584095954895 | 12.70229721069336 | 18.99186134338379 | 16.049821853637695 | +| 3 | 107356160 | 254.49543404579163 | 12.98267936706543 | 19.62110710144043 | 16.652606964111328 | +| 4 | 107356160 | 285.25878047943115 | 12.996271133422852 | 19.65008544921875 | 16.664426803588867 | ++------+-----------+--------------------+--------------------+-------------------+--------------------+ \ No newline at end of file diff --git a/file_1.txt b/file_1.txt new file mode 100644 index 0000000..1af604f --- /dev/null +++ b/file_1.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | 
++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 288.47421526908875 | 12.124371528625488 | 17.396339416503906 | 15.110072135925293 | +| 3 | 107356160 | 310.34640645980835 | 12.246709823608398 | 17.316566467285156 | 14.97178840637207 | +| 4 | 107356160 | 262.9206793308258 | 12.252873420715332 | 17.329992294311523 | 14.979094505310059 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_2.txt b/file_2.txt new file mode 100644 index 0000000..4ca10d2 --- /dev/null +++ b/file_2.txt @@ -0,0 +1,7 @@ ++------+-----------+-------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+-------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 289.8132817745209 | 13.843452453613281 | 16.968669891357422 | 14.779077529907227 | +| 3 | 107356160 | 307.7978012561798 | 13.91087532043457 | 16.95600700378418 | 14.742414474487305 | +| 4 | 107356160 | 262.0493402481079 | 13.913723945617676 | 16.955684661865234 | 14.743617057800293 | ++------+-----------+-------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_3.txt b/file_3.txt new file mode 100644 index 0000000..672862a --- /dev/null +++ b/file_3.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 283.91542887687683 | 12.507635116577148 | 18.553525924682617 | 15.613986015319824 | +| 3 | 107356160 | 287.85402369499207 | 12.571398735046387 | 18.915355682373047 | 15.952068328857422 | +| 4 | 107356160 | 279.67540669441223 | 
12.590620040893555 | 18.968795776367188 | 15.981791496276855 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_4.txt b/file_4.txt new file mode 100644 index 0000000..e2c3608 --- /dev/null +++ b/file_4.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 274.58040595054626 | 12.96647834777832 | 18.44032859802246 | 15.488606452941895 | +| 3 | 107356160 | 277.05651092529297 | 12.934049606323242 | 18.722591400146484 | 15.750381469726562 | +| 4 | 107356160 | 282.69956731796265 | 12.932695388793945 | 18.789344787597656 | 15.76345443725586 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_5.txt b/file_5.txt new file mode 100644 index 0000000..5ef55d7 --- /dev/null +++ b/file_5.txt @@ -0,0 +1,10 @@ +LLAMA - Experiment results + ++------+-----------+--------------------+--------------------+--------------------+ +| Bits | n_params | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+--------------------+ +| 4 | 107356160 | 2.9947431087493896 | 1.011309266090393 | 1.0010896921157837 | ++------+-----------+--------------------+--------------------+--------------------+ +| 4 | 107356160 | 2.9947431087493896 | 1.011309266090393 | 1.0010896921157837 | ++------+-----------+--------------------+--------------------+--------------------+ + diff --git a/fmzip.py b/fmzip.py new file mode 100644 index 0000000..e69de29 diff --git a/gptj.py b/gptj.py new file mode 100644 index 0000000..0ae4900 --- /dev/null +++ b/gptj.py @@ -0,0 +1,551 @@ +import time +import math + +import torch +import torch.nn as nn +import 
transformers + +from gptq import * +from modelutils import * +from quant import * +from prettytable import PrettyTable +import os + +def get_gptj(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import GPTJForCausalLM + # print(model) + model = GPTJForCausalLM.from_pretrained(model, torch_dtype='auto') + model.seqlen = model.config.max_position_embeddings + print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad)) + return model + +@torch.no_grad() +def gptj_sequential(model, dataloader, dev, means=None, stds=None): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + #print(model.transformer.h) + layers = model.transformer.h + #print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + 
gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_sequential_delta(model, delta_model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.transformer.h + delta_layers = delta_model.transformer.h + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + torch.cuda.empty_cache() + 
+ outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(delta_layers)): + layer = delta_layers[i].to(dev) + original_layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = original_outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + # print(model.transformer.h) + layers = model.transformer.h + print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = 
layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache ['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + model.transformer.ln_f = model.transformer.ln_f.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + hidden_states = model.transformer.ln_f(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ 
+ :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + + model.config.use_cache = use_cache + +def gptj_pack(model, quantizers, wbits, groupsize): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant(model, quantizers, wbits, groupsize) + qlayers = find_layers(model, [QuantLinear]) + print('Packing ...') + for name in qlayers: + print(name) + quantizers[name],scale,zero = quantizers[name] + quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu() + qlayers[name].pack(layers[name], scale, zero) + print('Done!') + return model + +def load_quant(model, checkpoint, wbits, groupsize): + from transformers import GPTJConfig, GPTJForCausalLM + config = GPTJConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = GPTJForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits, groupsize) + + print('Loading model ...') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done!') + + return model + +def gptj_multigpu(model, gpus): + model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) + if hasattr(model.model, 'norm') and 
model.model.norm:
+        model.model.norm = model.model.norm.to(gpus[-1])
+    import copy
+    model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
+
+    cache = {'mask': None}
+
+    class MoveModule(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module  # fix: was `self_module = module`, which left self.module unset (AttributeError on next line)
+            self.dev = next(iter(self.module.parameters())).device
+        def forward(self, *inp, **kwargs):
+            inp = list(inp)
+            if inp[0].device != self.dev:
+                inp[0] = inp[0].to(self.dev)
+            if cache['mask'] is None or cache['mask'].device != self.dev:
+                cache['mask'] = kwargs['attention_mask'].to(self.dev)
+            kwargs['attention_mask'] = cache['mask']
+            tmp = self.module(*inp, **kwargs)
+            return tmp
+
+    layers = model.model.layers
+    pergpu = math.ceil(len(layers) / len(gpus))
+    for i in range(len(layers)):
+        layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
+
+    model.gpus = gpus
+
+def benchmark(model, input_ids, check=False):
+    print(model)
+    input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
+    torch.cuda.synchronize()
+
+    cache = {'past': None}
+    def clear_past(i):
+        def tmp(layer, inp, out):
+            if cache['past']:
+                cache['past'][i] = None
+        return tmp
+    for i, layer in enumerate(model.transformer.h):
+        layer.register_forward_hook(clear_past(i))
+
+    print('Benchmarking ...')
+
+    if check:
+        loss = nn.CrossEntropyLoss()
+        tot = 0.
+
+    def sync():
+        if hasattr(model, 'gpus'):
+            for gpu in model.gpus:
+                torch.cuda.synchronize(gpu)
+        else:
+            torch.cuda.synchronize()
+    max_memory = 0
+    with torch.no_grad():
+        attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
+        times = []
+        for i in range(input_ids.numel()):
+            tick = time.time()
+
+            out = model(
+                input_ids[:, i:i+1],
+                past_key_values=cache['past'],
+                attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1))
+            )
+            sync()
+            times.append(time.time() - tick)
+            if check and i != input_ids.numel() - 1:  # fix: guard on `check` — `tot`/`loss` only exist when check=True
+                tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
+            cache['past'] = list(out.past_key_values)
+            del out
+        sync()
+        import numpy as np
+        print('Median:', np.median(times))
+        if check: print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())  # fix: `tot` is undefined unless check=True
+        print('max memory(MiB):',max_memory)
+
+def main(args):
+    print(args)
+    if args.load:
+        model = load_quant(args.model, args.load, args.wbits, args.groupsize)  # fix: `load_quant3` is undefined; use load_quant's real signature
+    else:
+        model = get_gptj(args.model)
+        model.eval()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    if args.wbits < 16 and not args.nearest:
+        print("Quantizing ...")
+        tick = time.time(); quantizers = gptj_sequential(model, dataloader, DEV)  # fix: start the timer — `tick` was used below without ever being set
+        print(time.time() - tick)
+
+    if args.benchmark:
+        model = model.to(DEV)
+    if args.benchmark:
+        input_ids = next(iter(dataloader))[0][:, :args.benchmark]
+        benchmark(model, input_ids, check=args.check)
+    if args.load:
+        exit()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+    print(args.dataset)  # fix: bare `dataset` was an undefined name in this scope
+    ppl = gptj_eval(model, testloader, DEV)
+
+    if args.save:
+        gptj_pack(model, quantizers, args.wbits, args.groupsize)
+        torch.save(model.state_dict(), args.save)
+
+    return ppl
+
+if __name__ == '__main__':
+    import argparse
+    from datautils import *
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        '--model', type=str, default='EleutherAI/gpt-j-6b',
+        help='GPT-J model to
load; pass `EleutherAI/gpt-j-6b`.' + ) + parser.add_argument( + '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--save', type=str, default='', + help='Save the quantized GPT-J model under this name.' + ) + parser.add_argument( + '--save_safetensors', type=str, default='', + help='Save the quantized GPT-J model as a `.safetensors` ckpt' + ) + parser.add_argument( + '--load', type=str, default='', + help='Load the quantized GPT-J model' + ) + parser.add_argument( + '--benchmark', type=int, default=0, + help='Number of tokens to use for benchmarking.' + ) + parser.add_argument( + '--check', action='store_true', + help='Whether to compute perpexity during benchmarking for verification.' 
+    )
+    parser.add_argument(
+        '--benchmark_results', type=str, default='',
+        help='store benchmark results'
+    )
+
+    args = parser.parse_args()
+    results = PrettyTable()
+    results.field_names = ['Bits', 'n_params', 'Time', 'wiki']
+    for n_bits in [16]:
+        ppls = []
+        for dataset in ['wikitext2']:
+            args.dataset = dataset
+            args.wbits = n_bits
+            args.save = 'gptj-%s-wbits%d.pt' % (dataset, n_bits)
+            ppl = main(args)
+            ppls.append(ppl)
+        results.add_row([n_bits, None, None, ppls[0]])  # fix: row must match the 4 field_names; n_params/comp_time/ppls[1:] were undefined here
+    print(results)
+    with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f:
+        f.write(str(results))
+    print('finished.')
\ No newline at end of file
diff --git a/gptj_delta.py b/gptj_delta.py
new file mode 100644
index 0000000..490e3e5
--- /dev/null
+++ b/gptj_delta.py
@@ -0,0 +1,589 @@
+import time
+import math
+
+import torch
+import torch.nn as nn
+import transformers
+
+from gptq import *
+from modelutils import *
+from quant import *
+from prettytable import PrettyTable
+import os
+import copy
+
+def get_gptj(model):
+    import torch
+    def skip(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    from transformers import GPTJForCausalLM
+    model = GPTJForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
+    model.seqlen = model.config.max_position_embeddings
+    print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad))
+    return model
+
+@torch.no_grad()
+def gptj_sequential(model, dataloader, dev, means=None, stds=None):
+    print('Starting ...')
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    #print(model.transformer.h)
+    layers = model.transformer.h
+    print(layers)
+
+    model.transformer.wte = model.transformer.wte.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+ cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_sequential_delta(model, delta_model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = 
False + layers = model.transformer.h + delta_layers = delta_model.transformer.h + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(delta_layers)): + layer = delta_layers[i].to(dev) + original_layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, 
actorder=args.act_order) + quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = original_outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + # print(model.transformer.h) + layers = model.transformer.h + print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache ['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev) + try: + print(batch.shape) + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name 
in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + model.transformer.ln_f = model.transformer.ln_f.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + hidden_states = model.transformer.ln_f(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + + model.config.use_cache = use_cache + +def gptj_pack(model, quantizers, wbits, groupsize): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant(model, quantizers, wbits, groupsize) + qlayers = find_layers(model, [QuantLinear]) + print('Packing ...') + for name in qlayers: + print(name) + quantizers[name],scale,zero = quantizers[name] + quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu() + qlayers[name].pack(layers[name], scale, zero) + print('Done!') + return model + +def load_quant(model, checkpoint, wbits, groupsize): + from transformers import GPTJConfig, GPTJForCausalLM + config = GPTJConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + 
torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = GPTJForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits, groupsize) + + print('Loading model ...') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done!') + + return model + +def benchmark(model, input_ids, check=False): + input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) + torch.cuda.synchronize() + + cache = {'past': None} + def clear_past(i): + def tmp(layer, inp, out): + if cache['past']: + cache['past'][i] = None + return tmp + for i, layer in enumerate(model.transformer.h): + layer.register_forward_hook(clear_past(i)) + + print('Benchmarking ...') + + if check: + loss = nn.CrossEntropyLoss() + tot = 0. 
+
+    def sync():
+        if hasattr(model, 'gpus'):
+            for gpu in model.gpus:
+                torch.cuda.synchronize(gpu)
+        else:
+            torch.cuda.synchronize()
+    max_memory = 0
+    with torch.no_grad():
+        attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
+        times = []
+        for i in range(input_ids.numel()):
+            tick = time.time()
+
+            out = model(
+                input_ids[:, i:i+1],
+                past_key_values=cache['past'],
+                attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1))
+            )
+            sync()
+            times.append(time.time() - tick)
+            print(i, times[-1])
+            max_memory = max(max_memory, torch.cuda.memory_allocated() / 1024 / 1024)  # fix: stray `torch,` arg made max() compare a module (TypeError)
+            if check and i != input_ids.numel() - 1:
+                tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
+            cache['past'] = list(out.past_key_values)  # fix: attribute is `past_key_values`, not `past_keys_values`
+            del out
+        sync()
+        import numpy as np
+        print('Median:', np.median(times))
+        if check:
+            print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
+        print('max memory(MiB):',max_memory)
+
+
+
+def main(args):
+    print(args)
+    num_params_saved_lr = 0
+    num_params = 0
+    if args.load:
+        model = load_quant(args.model, args.load, args.wbits, args.groupsize)  # fix: `load_quant3` is undefined; use load_quant's real signature
+    else:
+        if args.delta and args.wbits<16:
+            model = get_gptj(args.model)
+            model.eval()
+            base_model = get_gptj(args.base_model)
+            base_model.eval()
+            dataloader, testloader = get_loaders(
+                args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+            )
+            original_finetuned_model = copy.deepcopy(model)
+            for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+                finetuned_p.data = (finetuned_p.data-base_p.data).clone()
+        else:
+            model = get_gptj(args.model)
+            model.eval()
+
+            dataloader, testloader = get_loaders(
+                args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+            )
+
+    if args.wbits < 16 and not args.nearest:
+        if args.delta:
+            tick = time.time()
+            quantizers = gptj_sequential_delta(original_finetuned_model, model, dataloader, DEV)
+
+            comp_time = time.time()-tick
+        else:
+            quantizers =
gptj_sequential(model, dataloader, DEV) + + if args.delta and args.wbits<16: + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + if args.sparsify_hard_threshold: + print('Hard Thresholding...') + W = finetuned_p.data + finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) + if args.rank>0 and len(finetuned_p.shape) == 2: + print('Finding Low Rank Approximation...') + A = finetuned_p.data.float() + U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5) + A = U @ torch.diag_embed(S) @ Vh.T + finetuned_p.data = A.half() + num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) + num_params += torch.numel(finetuned_p.data) + finetuned_p.data = (base_p.data + finetuned_p.data).clone() + + if args.benchmark: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, :args.benchmark] + benchmark(model, input_ids, check=args.check) + if args.load: + exit() + + dataset = args.dataset + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + ppl = gptj_eval(model, testloader, DEV) + print(ppl) + + if args.rank > 0: + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print("Number of params without low rank ", n_params) + print("Number of params with low rank", n_params - num_params_saved_lr) + if args.save: + gptj_pack(model, quantizers, args.wbits, args.groupsize) + torch.save(model.state_dict(), args.save) + return ppl + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + '--model', type=str, default='togethercomputer/GPT-JT-6B-v1', + help='GPT-J finetuned model to load; pass `togethercomputer/GPT-JT-6B-v1`.' + ) + parser.add_argument( + '--base_model', type=str, default='EleutherAI/gpt-j-6b', + help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.' 
+ ) + parser.add_argument( + '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--save', type=str, default='', + help='Save the quantized GPT-J model under this name.' + ) + parser.add_argument( + '--save_safetensors', type=str, default='', + help='Save the quantized GPT-J model as a `.safetensors` ckpt' + ) + parser.add_argument( + '--load', type=str, default='', + help='Load the quantized GPT-J model' + ) + parser.add_argument( + '--benchmark', type=int, default=0, + help='Number of tokens to use for benchmarking.' + ) + parser.add_argument( + '--check', action='store_true', + help='Whether to compute perpexity during benchmarking for verification.' 
+ ) + parser.add_argument( + '--delta', action='store_true', + help='Whether to use delta compression' + ) + parser.add_argument( + '--sparsify_hard_threshold', action='store_true', + help='Whether to add sparsity' + ) + parser.add_argument( + '--fraction_of_zero', type=float, default=0.99, + help='Sparsity ratio' + ) + parser.add_argument( + '--benchmark_results', type=str, default='', + help='store benchmark results' + ) + parser.add_argument( + '--sym', action='store_true', default=True, + help='Whether to use symmetric quantization' + ) + parser.add_argument( + '--trits', action='store_true', default=False, + help='Whether to use trits' + ) + parser.add_argument('--act_order', type=str, default=False) + + args = parser.parse_args() + + results = PrettyTable() + results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] + for n_bits in [4, 3, 2]: + ppls = [] + for dataset in ['wikitext2', 'ptb', 'c4']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits) + ppl = main(args) + ppls.append(ppl) + results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) + print(results) + with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: + f.write(str(results)) + print('finished.') \ No newline at end of file diff --git a/gptq.py b/gptq.py index 2477cac..e60f1fc 100644 --- a/gptq.py +++ b/gptq.py @@ -126,14 +126,14 @@ def fasterquant( if DEBUG: self.layer.weight.data[:, :i2] = Q[:, :i2] self.layer.weight.data[:, i2:] = W[:, i2:] - print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - print(torch.sum(Losses)) + #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + #print(torch.sum(Losses)) torch.cuda.synchronize() total_time = time.time() - tick - # print('time %.2f' % total_time) + # #print('time %.2f' % total_time) error = torch.sum(Losses).item() - # print('error', error) + # #print('error', error) if actorder: invperm = torch.argsort(perm) @@ -143,8 +143,8 @@ def 
fasterquant( Q = Q.t() self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) if DEBUG: - print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - + #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + pass def free(self): if DEBUG: self.inp1 = None @@ -152,4 +152,4 @@ def free(self): self.H = None self.Losses = None self.Trace = None - torch.cuda.empty_cache() + torch.cuda.empty_cache() \ No newline at end of file diff --git a/jt_datautils/cot.py b/jt_datautils/cot.py new file mode 100644 index 0000000..15796e9 --- /dev/null +++ b/jt_datautils/cot.py @@ -0,0 +1,105 @@ +import os +import re +import torch +import json +from torch.utils.data import IterableDataset, DataLoader +from itertools import cycle, islice +import random +from datasets import Dataset +from datasets import load_dataset, load_from_disk +#from comm.comm_utils import * + + + +class StreamDataset(IterableDataset): + def __init__(self, cot_data_path, tokenizer, seq_length=1024): + + self.cot_data_path = cot_data_path + + with open(cot_data_path) as f: + self.cot_data = json.load(f) + + self.buffer_tokens = [] + + self.tokenizer = tokenizer + self.seq_length = seq_length + + self.it = None + + def state_dict(self): + return {} + + def load_state_dict(self, state_dict): + pass + + def get_sequence_from_cot(self): + + while True: + + keys = list(self.cot_data.keys()) + random.shuffle(keys) + + input_ids = [] + + for k in keys: + + v = self.cot_data[k] + + input_ids += self.tokenizer(v + '\n\n')['input_ids'] + if len(input_ids) < self.seq_length: + continue + # input_ids += [self.tokenizer.eos_token_id]*(self.seq_length - len(input_ids)) + + input_ids = input_ids[:self.seq_length] + input_ids = torch.tensor(input_ids).long() + + yield input_ids + + input_ids = [] + + def get_sequence(self): + + it_cot = cycle(self.get_sequence_from_cot()) + + while True: + + input_ids = next(it_cot) + + + yield { + 'input_ids': input_ids, + } + + + def get_stream(self): 
+ return cycle(self.get_sequence()) + + def __iter__(self): + if self.it is None: + self.it = self.get_stream() + return self.it + + + +def get_cot_train_data_loader(args, tokenizer, num_workers=0, state_dict=None): + + stream_dataset = StreamDataset( + './data/mmlu-cot.json', + tokenizer=tokenizer, seq_length=args.seq_length + ) + + if state_dict is not None: + stream_dataset.load_state_dict(state_dict) + + train_data_loader = torch.utils.data.DataLoader(stream_dataset, + batch_size=args.batch_size * args.data_group_size, + shuffle=False, + num_workers=num_workers, + pin_memory=True, + collate_fn=None) + return train_data_loader + +def get_cot_ds(data_path, tokenizer, seq_length): + return StreamDataset(os.path.join(data_path,'mmlu-cot.json'), + tokenizer=tokenizer, seq_length=seq_length + ) \ No newline at end of file diff --git a/jt_datautils/pile.py b/jt_datautils/pile.py new file mode 100644 index 0000000..fbddca5 --- /dev/null +++ b/jt_datautils/pile.py @@ -0,0 +1,77 @@ + +import os +import re +import torch +from torch.utils.data import IterableDataset, DataLoader +from itertools import cycle, islice +import random +from datasets import Dataset +from datasets import load_dataset, load_from_disk +# from comm.comm_utils import * + + +class StreamDataset(IterableDataset): + default_doc_separator = '' + def __init__(self, data, tokenizer, seq_length=1024, doc_separator=None): + self.data = data + self.tokenizer = tokenizer + self.seq_length = seq_length + self.doc_separator = doc_separator or StreamDataset.default_doc_separator + self.it = None + self.iter_count = 0 + self.buffer_tokens = [] + + def state_dict(self): + return { + 'iter_count': self.iter_count, + 'buffer_tokens': self.buffer_tokens, + } + + def load_state_dict(self, state_dict): + self.iter_count = state_dict['iter_count'] + self.buffer_tokens = state_dict['buffer_tokens'] + self.data = self.data.skip(self.iter_count) + + def get_sequence(self): + buffer_tokens = self.buffer_tokens + for x in 
self.data: + self.iter_count += 1 + curr_tokens = self.tokenizer(self.doc_separator + x['text'])['input_ids'] + buffer_tokens += curr_tokens + while len(buffer_tokens) >= self.seq_length: + tokens = buffer_tokens[:self.seq_length] + buffer_tokens = buffer_tokens[self.seq_length:] + input_ids = torch.tensor(tokens) + self.buffer_tokens = buffer_tokens # update for restore + yield { + 'input_ids': input_ids, + } + + def get_stream(self): + return cycle(self.get_sequence()) + + def __iter__(self): + if self.it is None: + self.it = self.get_stream() + return self.it + + +def get_pile_train_data_loader(args, tokenizer, num_workers=0, state_dict=None): + + data = load_dataset('the_pile', split="train", streaming=True).shuffle(buffer_size=10_000, seed=args.seed) + stream_dataset = StreamDataset(data, tokenizer, args.seq_length) + + if state_dict is not None: + stream_dataset.load_state_dict(state_dict) + + train_data_loader = torch.utils.data.DataLoader(stream_dataset, + batch_size=args.batch_size * args.data_group_size, + shuffle=False, + num_workers=num_workers, + pin_memory=True, + collate_fn=None) + return train_data_loader + +def get_pile_ds(tokenizer, seq_length): + data = load_dataset("the_pile", split="train", streaming=True) + return StreamDataset(data, tokenizer, seq_length) \ No newline at end of file diff --git a/llama.py b/llama.py new file mode 100644 index 0000000..f1591d9 --- /dev/null +++ b/llama.py @@ -0,0 +1,302 @@ +import time + +import torch +import torch.nn as nn + +from gptq import * +from modelutils import * +from quant import * + + +def get_llama(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto') + model.seqlen = 2048 + return model + +@torch.no_grad() +def llama_sequential(model, dataloader, dev): + print('Starting ...') + + 
use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + model.model.norm = model.model.norm.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + model.model.norm = model.model.norm.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + full = find_layers(layer) + + if args.true_sequential: + sequential = [ + ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], + ['self_attn.o_proj'], + ['mlp.up_proj', 'mlp.gate_proj'], + ['mlp.down_proj'] + ] + else: + sequential = [list(full.keys())] + + for names in sequential: + subset = {n: full[n] for n in names} + + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + 
handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def llama_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + torch.cuda.empty_cache() + + outs = 
torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.norm is not None: + model.model.norm = model.model.norm.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.norm is not None: + hidden_states = model.model.norm(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + model.config.use_cache = use_cache + + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + 'model', type=str, + help='LlaMa model to load; pass location of hugginface converted checkpoint.' + ) + parser.add_argument( + 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' 
+ ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--sym', action='store_true', + help='Whether to perform symmetric quantization.' + ) + parser.add_argument( + '--new-eval', action='store_true', + help='Whether to use the new PTB and C4 eval.' + ) + parser.add_argument( + '--act-order', action='store_true', + help='Whether to apply the activation order GPTQ heuristic' + ) + parser.add_argument( + '--true-sequential', action='store_true', + help='Whether to run in true sequential model.' 
+ ) + + args = parser.parse_args() + + model = get_llama(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if args.wbits < 16 and not args.nearest: + tick = time.time() + quantizers = llama_sequential(model, dataloader, DEV) + print(time.time() - tick) + + datasets = ['wikitext2', 'ptb', 'c4'] + if args.new_eval: + datasets = ['wikitext2', 'ptb-new', 'c4-new'] + for dataset in datasets: + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + llama_eval(model, testloader, DEV) \ No newline at end of file diff --git a/llama_delta.py b/llama_delta.py new file mode 100644 index 0000000..d8fa99a --- /dev/null +++ b/llama_delta.py @@ -0,0 +1,441 @@ +import time + +import torch +import torch.nn as nn + +from gptq import * +from modelutils import * +from quant import * +import copy +import os + +def hard_threshold(x, fraction_of_zero=0.1): + y, _ = torch.sort(x.view(-1).abs().clone()) + num_params = torch.numel(x) + thresh_index = int(num_params * fraction_of_zero) + threshold = y[thresh_index] + mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) + return mask * x + +def get_llama(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto') + model.seqlen = 2048 + return model + +@torch.no_grad() +def llama_sequential_delta(model, delta_model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + + layers = model.model.layers + delta_layers = delta_model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + model.model.norm = model.model.norm.to(dev) + layers[0] = 
layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + model.model.norm = model.model.norm.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + print('Ready.') + + quantizers = {} + for i in range(len(delta_layers)): + layer = layers[i].to(dev) + full = find_layers(layer) + + if args.true_sequential: + sequential = [ + ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], + ['self_attn.o_proj'], + ['mlp.up_proj', 'mlp.gate_proj'], + ['mlp.down_proj'] + ] + else: + sequential = [list(full.keys())] + + for names in sequential: + subset = {n: full[n] for n in names} + + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), 
attention_mask=attention_mask, position_ids=position_ids)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def llama_sequential(model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + model.model.norm = model.model.norm.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + model.model.norm = model.model.norm.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + 
layer = layers[i].to(dev) + full = find_layers(layer) + + if args.true_sequential: + sequential = [ + ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], + ['self_attn.o_proj'], + ['mlp.up_proj', 'mlp.gate_proj'], + ['mlp.down_proj'] + ] + else: + sequential = [list(full.keys())] + + for names in sequential: + subset = {n: full[n] for n in names} + + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def llama_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, 
device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.norm is not None: + model.model.norm = model.model.norm.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.norm is not None: + hidden_states = model.model.norm(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + 
loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + model.config.use_cache = use_cache + + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + '--model', type=str, default='ausboss/llama-13b-supercot', + help='LlaMa model to load; pass location of hugginface converted checkpoint.' + ) + parser.add_argument( + '--base-model', type=str, default='yahma/llama-13b-hf', + help='base LLAMA model to load' + ) + parser.add_argument( + '--dataset', type=str, default='wikitext2', choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--sym', action='store_true', + help='Whether to perform symmetric quantization.' + ) + parser.add_argument( + '--new-eval', action='store_true', + help='Whether to use the new PTB and C4 eval.' 
+ ) + parser.add_argument( + '--act-order', action='store_true', + help='Whether to apply the activation order GPTQ heuristic' + ) + parser.add_argument( + '--true-sequential', action='store_true', + help='Whether to run in true sequential model.' + ) + parser.add_argument( + '--sparsify_hard_threshold', action='store_true', + help='Whether to add sparsity' + ) + parser.add_argument( + '--fraction_of_zero', type=float, default=0.99, + help='Sparsity ratio' + ) + args = parser.parse_args() + + base_model = get_llama(args.base_model) + model = get_llama(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + original_finetuned_model = copy.deepcopy(model) + _ = llama_sequential_delta(original_finetuned_model, model, dataloader, DEV) + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + finetuned_p.data = (finetuned_p.data-base_p.data).clone() + + + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + if args.sparsify_hard_threshold: + print('Hard Thresholding...') + W = finetuned_p.data + finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) + finetuned_p.data = (base_p.data + finetuned_p.data).clone() + + datasets = ['wikitext2', 'ptb', 'c4'] + if args.new_eval: + datasets = ['wikitext2', 'ptb-new', 'c4-new'] + for dataset in datasets: + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + llama_eval(model, testloader, DEV) \ No newline at end of file diff --git a/modelutils.py b/modelutils.py index c93410d..5b36877 100644 --- a/modelutils.py +++ b/modelutils.py @@ -1,8 +1,10 @@ import torch import torch.nn as nn -from transformers import OPTForCausalLM + + DEV = torch.device('cuda:0') + def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): if type(module) in layers: return {name: module} @@ 
-11,16 +13,4 @@ def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): res.update(find_layers( child, layers=layers, name=name + '.' + name1 if name != '' else name1 )) - return res - -def get_opt(model): - def skip(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - - # model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto') - model = OPTForCausalLM.from_pretrained(model, torch_dtype=torch.float16) - model.seqlen = model.config.max_position_embeddings - return model \ No newline at end of file + return res \ No newline at end of file diff --git a/opt.py b/opt.py index edf40bc..77f0f9b 100644 --- a/opt.py +++ b/opt.py @@ -6,7 +6,7 @@ from gptq import * from modelutils import * from quant import quantize, Quantizer, Quant3Linear, make_quant3 - +from prettytable import PrettyTable def get_opt(model): import torch def skip(*args, **kwargs): @@ -223,6 +223,7 @@ def forward(self, inp, **kwargs): print(ppl.item()) model.config.use_cache = use_cache + return ppl.item() # TODO: perform packing on GPU def opt_pack3(model, quantizers): @@ -351,6 +352,48 @@ def sync(): print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) +def main(args): + if args.load: + model = load_quant3(args.model, args.load) + else: + model = get_opt(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if args.wbits < 16 and not args.nearest: + tick = time.time() + quantizers = opt_sequential(model, dataloader, DEV) + print(time.time() - tick) + + if args.benchmark: + gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] + if len(gpus) > 1: + opt_multigpu(model, gpus) + else: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, :args.benchmark] + benchmark(model, input_ids, check=args.check) + if args.load: + exit() + + + 
dataloader, testloader = get_loaders( + args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + ppl = opt_eval(model, testloader, DEV) + + if args.save: + opt_pack3(model, quantizers) + torch.save(model.state_dict(), args.save) + + return ppl + + if __name__ == '__main__': import argparse from datautils import * @@ -358,11 +401,11 @@ def sync(): parser = argparse.ArgumentParser() parser.add_argument( - 'model', type=str, + '--model', type=str, default='lnair/opt-1.3b-wikitext2', help='OPT model to load; pass `facebook/opt-X`.' ) parser.add_argument( - 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], + '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], help='Where to extract calibration data from.' ) parser.add_argument( @@ -427,44 +470,17 @@ def sync(): ) args = parser.parse_args() - - if args.load: - model = load_quant3(args.model, args.load) - else: - model = get_opt(args.model) - model.eval() - - dataloader, testloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - - if args.wbits < 16 and not args.nearest: - tick = time.time() - quantizers = opt_sequential(model, dataloader, DEV) - print(time.time() - tick) - - if args.benchmark: - gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] - if len(gpus) > 1: - opt_multigpu(model, gpus) - else: - model = model.to(DEV) - if args.benchmark: - input_ids = next(iter(dataloader))[0][:, :args.benchmark] - benchmark(model, input_ids, check=args.check) - if args.load: - exit() - - datasets = ['wikitext2', 'ptb', 'c4'] - if args.new_eval: - datasets = ['wikitext2', 'ptb-new', 'c4-new'] - for dataset in datasets: - dataloader, testloader = get_loaders( - dataset, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - print(dataset) - opt_eval(model, testloader, DEV) - - if args.save: - opt_pack3(model, quantizers) - torch.save(model.state_dict(), args.save) + + results = 
PrettyTable() + results.field_names = ['Bits', 'wiki', 'ptb', 'c4'] + for n_bits in [4, 3, 2]: + ppls = [] + for dataset in ['wikitext2', 'ptb', 'c4']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'opt-no-delta-1.3b-%s-wbits%d.pt' % (dataset, n_bits) + ppl = main(args) + ppls.append(ppl) + results.add_row([n_bits, ppls[0], ppls[1], ppls[2]]) + print(results) + print('finished.') diff --git a/opt_delta.py b/opt_delta.py index f9f6bbf..481cc9a 100644 --- a/opt_delta.py +++ b/opt_delta.py @@ -7,8 +7,9 @@ from gptq import * from modelutils import * from quant import * - +from prettytable import PrettyTable import copy +import os #from prettytable import PrettyTable def get_opt(model): @@ -539,16 +540,17 @@ def main(args): dataset, seed=args.seed, model=args.model, seqlen=model.seqlen ) - ppl = opt_eval(model, testloader, DEV) - print(ppl) + # ppl = opt_eval(model, testloader, DEV) + # print(ppl) if args.rank > 0: - print("Number of params without low rank ", num_params) - print("Number of params with low rank", num_params - num_params_saved_lr) + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print("Number of params without low rank ", n_params) + print("Number of params with low rank", n_params - num_params_saved_lr) if args.save: opt_pack3(model, quantizers) torch.save(model.state_dict(), args.save) - + return ppl, n_params, comp_time if __name__ == '__main__': import argparse @@ -601,13 +603,17 @@ def main(args): help='Whether to perform symmetric quantization.' ) parser.add_argument( - '--save', type=str, default='', + '--save', type=str, default='opt-1.3b-wikitext2-wbits2.pt', help='Save quantized checkpoint under this name.' ) parser.add_argument( '--load', type=str, default='', help='Load quantized model.' ) + parser.add_argument( + '--benchmark_results', type=str, default='', + help='store benchmark results' + ) parser.add_argument( '--benchmark', type=int, default=0, help='Number of tokens to use for benchmarking.' 
@@ -647,8 +653,18 @@ def main(args): ) args = parser.parse_args() - #results = PrettyTable() - - main(args) - + results = PrettyTable() + results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] + for n_bits in [2]: + ppls = [] + for dataset in ['wikitext2']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'opt-not-delta1.3b-%s-wbits%d.pt' % (dataset, n_bits) + ppl, n_params, comp_time = main(args) + # ppls.append(ppl) + # results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) + # print(results) + # with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: + # f.write(str(results)) print('finished.') diff --git a/quant.py b/quant.py index f8cc1b7..f57d6d7 100644 --- a/quant.py +++ b/quant.py @@ -131,8 +131,8 @@ def ready(self): try: import quant_cuda except: - print('CUDA extension not installed.') - + #print('CUDA extension not installed.') + pass # Assumes layer is perfectly divisible into 1024 * 1024 blocks class Quant3Linear(nn.Module): @@ -356,4 +356,4 @@ def make_quant(module, names, bits, groupsize, name=''): module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features) ) for name1, child in module.named_children(): - make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1) + make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7417000..321525d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ -transformers -loguru -datasets \ No newline at end of file +safetensors==0.3.0 +datasets==1.17.0 +sentencepiece +transformers==4.21.2 +ninja \ No newline at end of file diff --git a/src/fmzip b/src/fmzip new file mode 160000 index 0000000..b41e785 --- /dev/null +++ b/src/fmzip @@ -0,0 +1 @@ +Subproject commit b41e7856f092c80286577b2eb5e1294a764099d6