From 3b2997968a24050f00d5102ac32dbebb66ba0c78 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Tue, 9 May 2023 15:52:43 +0000
Subject: [PATCH 01/23] better decomposition

---
 cli.py              |  2 +-
 core_compression.py |  4 +--
 decomposition.py    | 60 +++++++++++++++++++++++++++++++++++++++++++++
 matq.py             | 13 +++++-----
 scripts/lr_quant.sh | 20 +++++++++++++++
 5 files changed, 90 insertions(+), 9 deletions(-)
 create mode 100644 decomposition.py

diff --git a/cli.py b/cli.py
index b1bf8df..96eae8b 100644
--- a/cli.py
+++ b/cli.py
@@ -81,6 +81,6 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s
     )
     if args.save:
         save_lr_tensors(lr_tensors, f"outputs/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors")
-    
+
     ppl = opt_eval(target_model, loader_enc, args, target_model.device)
     logger.info(f"Perplexity: {ppl}")
\ No newline at end of file
diff --git a/core_compression.py b/core_compression.py
index 2b445f7..f69cbba 100644
--- a/core_compression.py
+++ b/core_compression.py
@@ -1,9 +1,9 @@
 import torch
 import torch.nn as nn
-from loguru import logger
-from modelutils import find_layers
 from matq import TensorQ
+from loguru import logger
 from quant import Quantizer
+from modelutils import find_layers
 
 @torch.no_grad()
 def opt_delta_lr(
diff --git a/decomposition.py b/decomposition.py
new file mode 100644
index 0000000..ec531af
--- /dev/null
+++ b/decomposition.py
@@ -0,0 +1,60 @@
+import torch
+import time
+from loguru import logger
+
+def pca_decomposition(matrix, rank):
+    U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=5)
+    return U, torch.diag_embed(S) @ Vh.T
+def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5):
+    # Initialize random matrices U and V
+    m, n = matrix.shape
+    U = torch.rand(m, rank)
+    V = torch.rand(rank, n)
+    U.to(matrix.device)
+    V.to(matrix.device)
+    tick = time.time()
+    early_stop = False
+    for i in range(max_iterations):
+        # Calculate the difference between the original and reconstructed matrices
+        difference = matrix - U @ V
+        
+        # Calculate the gradients
+        gradient_U = -2 * (difference @ V.T)
+        gradient_V = -2 * (U.T @ difference)
+        U -= learning_rate * gradient_U
+        V -= learning_rate * gradient_V
+        if torch.norm(difference) < tolerance:
+            early_stop = True
+            break
+    if not early_stop:
+        logger.warning(f"Low rank decomposition did not converge. Elapsed time: {time.time() - tick}")
+    else:
+        logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}")
+    return U, V
+
+if __name__=="__main__":
+    matrix = torch.rand((2048,2048))
+    print("Original matrix:")
+    print(matrix)
+    rank = 32
+    U, V = low_rank_decomposition(
+        matrix,
+        rank,
+        learning_rate=1e-6,
+        max_iterations=100000,
+    )
+    U_pca, V_pca = pca_decomposition(matrix, rank)
+    reconstructed_matrix_pca = U_pca @ V_pca
+
+    # print("U:")
+    # print(U)
+    # print("V:")
+    # print(V)
+
+    reconstructed_matrix = U @ V
+    print("Reconstructed matrix:")
+    print(reconstructed_matrix)
+
+    print("difference:")
+    print(torch.norm(matrix - reconstructed_matrix))
+    print(torch.norm(matrix - reconstructed_matrix_pca))
\ No newline at end of file
diff --git a/matq.py b/matq.py
index 21ab6bc..030d06f 100644
--- a/matq.py
+++ b/matq.py
@@ -5,7 +5,7 @@
 import transformers
 from loguru import logger
 from quant import quantize
-
+from decomposition import low_rank_decomposition
 
 DEBUG = False 
 
@@ -84,11 +84,12 @@ def decompose(self):
         W = W.float()
         logger.info("starting decomposition")
         tick = time.time()
-        U, S, Vh = torch.pca_lowrank(W, q=self.rank, center=True, niter=5)
-        # let's say L = U
-        # and R = diag(S)*V.T
-        self.L = U
-        self.R = torch.diag_embed(S) @ Vh.T
+        # U, S, Vh = torch.pca_lowrank(W, q=self.rank, center=True, niter=5)
+        # # let's say L = U
+        # # and R = diag(S)*V.T
+        # self.L = U
+        # self.R = torch.diag_embed(S) @ Vh.T
+        self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-6, max_iterations=100000)
         logger.info(f"decomposition done. elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}")
 
     def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, rank=32):
diff --git a/scripts/lr_quant.sh b/scripts/lr_quant.sh
index ccfc5a5..9503a9b 100644
--- a/scripts/lr_quant.sh
+++ b/scripts/lr_quant.sh
@@ -6,4 +6,24 @@ python cli.py \
     --rank 32 \
     --save outputs/ \
     --nsamples 128 \
+    --wbits 8
+
+python cli.py \
+    --dataset wikitext2 \
+    --target-model lnair/opt-1.3b-wikitext2 \
+    --base-model facebook/opt-1.3b \
+    --delta \
+    --rank 128 \
+    --save outputs/ \
+    --nsamples 128 \
+    --wbits 8
+
+python cli.py \
+    --dataset wikitext2 \
+    --target-model lnair/opt-1.3b-wikitext2 \
+    --base-model facebook/opt-1.3b \
+    --delta \
+    --rank 256 \
+    --save outputs/ \
+    --nsamples 128 \
     --wbits 8
\ No newline at end of file

From f4dfd8292cc751e6c2447292aa9bf38efdda16b1 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Tue, 9 May 2023 18:36:42 +0000
Subject: [PATCH 02/23] gradient descent decomposition

---
 cli.py              |  2 +-
 core_compression.py |  5 +++--
 decomposition.py    | 14 ++++++--------
 matq.py             | 10 +++++-----
 4 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/cli.py b/cli.py
index 96eae8b..dece358 100644
--- a/cli.py
+++ b/cli.py
@@ -80,7 +80,7 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s
         args.nsamples
     )
     if args.save:
-        save_lr_tensors(lr_tensors, f"outputs/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors")
+        save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors")
 
     ppl = opt_eval(target_model, loader_enc, args, target_model.device)
     logger.info(f"Perplexity: {ppl}")
\ No newline at end of file
diff --git a/core_compression.py b/core_compression.py
index f69cbba..77098ea 100644
--- a/core_compression.py
+++ b/core_compression.py
@@ -55,6 +55,7 @@ def forward(self, inp, **kwargs):
         except ValueError:
             pass
     layers[0] = layers[0].module
+    
     model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
     model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
     if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
@@ -108,10 +109,10 @@ def temp(_, inp, out):
             outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
 
             original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-        
+
         for h in handles:
             h.remove()
-        
+
         for name in subset:
             logger.info(f"Quantizing {name}...")
             lr_gptq[name].lr_quant(
diff --git a/decomposition.py b/decomposition.py
index ec531af..d5068b8 100644
--- a/decomposition.py
+++ b/decomposition.py
@@ -3,15 +3,14 @@
 from loguru import logger
 
 def pca_decomposition(matrix, rank):
-    U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=5)
+    U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=500)
     return U, torch.diag_embed(S) @ Vh.T
+
 def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5):
     # Initialize random matrices U and V
     m, n = matrix.shape
-    U = torch.rand(m, rank)
-    V = torch.rand(rank, n)
-    U.to(matrix.device)
-    V.to(matrix.device)
+    U = torch.rand(m, rank, device=matrix.device)
+    V = torch.rand(rank, n, device=matrix.device)
     tick = time.time()
     early_stop = False
     for i in range(max_iterations):
@@ -26,6 +25,7 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500,
         if torch.norm(difference) < tolerance:
             early_stop = True
             break
+        
     if not early_stop:
         logger.warning(f"Low rank decomposition did not converge. Elapsed time: {time.time() - tick}")
     else:
@@ -33,9 +33,7 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500,
     return U, V
 
 if __name__=="__main__":
-    matrix = torch.rand((2048,2048))
-    print("Original matrix:")
-    print(matrix)
+    matrix = torch.rand((128,128))
     rank = 32
     U, V = low_rank_decomposition(
         matrix,
diff --git a/matq.py b/matq.py
index 030d06f..13dbbf5 100644
--- a/matq.py
+++ b/matq.py
@@ -92,14 +92,14 @@ def decompose(self):
         self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-6, max_iterations=100000)
         logger.info(f"decomposition done. elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}")
 
-    def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, rank=32):
-        self.lr_quant_R(blocksize, percdamp, groupsize, actorder, rank)
-        self.lr_quant_L(blocksize, percdamp, groupsize, actorder, rank)
+    def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
+        self.lr_quant_R(blocksize, percdamp, groupsize, actorder)
+        self.lr_quant_L(blocksize, percdamp, groupsize, actorder)
         # restored weight is L@R
         # but on disk we only save L, R
         self.layer.weight.data = (self.L @ self.R).reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
         
-    def lr_quant_R(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, rank=32):
+    def lr_quant_R(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
         R = self.R.data.clone()
         if isinstance(self.layer, nn.Conv2d):
             R = R.flatten(1)
@@ -186,7 +186,7 @@ def lr_quant_R(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False,
             Q_R = Q_R.t()
         self.R = Q_R.reshape(self.R.shape).to(self.R.dtype)
 
-    def lr_quant_L(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, rank=32):
+    def lr_quant_L(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
         L = self.L.data.clone()
         if isinstance(self.layer, nn.Conv2d):
             L = L.flatten(1)

From 5f00d37f4949a9fb3f99d4ec5e022dc44d72a29d Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Tue, 9 May 2023 18:43:31 +0000
Subject: [PATCH 03/23] updating decomposition

---
 decomposition.py | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/decomposition.py b/decomposition.py
index d5068b8..5c90042 100644
--- a/decomposition.py
+++ b/decomposition.py
@@ -2,26 +2,29 @@
 import time
 from loguru import logger
 
-def pca_decomposition(matrix, rank):
-    U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=500)
+def pca_decomposition(matrix, rank, niter=500):
+    U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=niter)
     return U, torch.diag_embed(S) @ Vh.T
 
 def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5):
     # Initialize random matrices U and V
-    m, n = matrix.shape
-    U = torch.rand(m, rank, device=matrix.device)
-    V = torch.rand(rank, n, device=matrix.device)
+    # m, n = matrix.shape
+    # let's choose a good start point?
+    # L, R = pca_decomposition(matrix, rank)
+    # random seems to work better generally
+    L = torch.rand((matrix.shape[0], rank))
+    R = torch.rand((rank, matrix.shape[1]))
     tick = time.time()
     early_stop = False
     for i in range(max_iterations):
         # Calculate the difference between the original and reconstructed matrices
-        difference = matrix - U @ V
+        difference = matrix - L @ R
         
         # Calculate the gradients
-        gradient_U = -2 * (difference @ V.T)
-        gradient_V = -2 * (U.T @ difference)
-        U -= learning_rate * gradient_U
-        V -= learning_rate * gradient_V
+        gradient_L = -2 * (difference @ R.T)
+        gradient_R = -2 * (L.T @ difference)
+        L -= learning_rate * gradient_L
+        R -= learning_rate * gradient_R
         if torch.norm(difference) < tolerance:
             early_stop = True
             break
@@ -30,26 +33,22 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500,
         logger.warning(f"Low rank decomposition did not converge. Elapsed time: {time.time() - tick}")
     else:
         logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}")
-    return U, V
+    return L, R
 
 if __name__=="__main__":
-    matrix = torch.rand((128,128))
-    rank = 32
-    U, V = low_rank_decomposition(
+    matrix = torch.rand((16,16))
+    rank = 4
+    L, R = low_rank_decomposition(
         matrix,
         rank,
         learning_rate=1e-6,
         max_iterations=100000,
     )
-    U_pca, V_pca = pca_decomposition(matrix, rank)
-    reconstructed_matrix_pca = U_pca @ V_pca
+    L_pca, R_pca = pca_decomposition(matrix, rank)
+    reconstructed_matrix_pca = L_pca @ R_pca
 
-    # print("U:")
-    # print(U)
-    # print("V:")
-    # print(V)
+    reconstructed_matrix = L @ R
 
-    reconstructed_matrix = U @ V
     print("Reconstructed matrix:")
     print(reconstructed_matrix)
 

From b080b3c6934e67d349a2d509ebd6b9a0287c699b Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Tue, 9 May 2023 20:16:25 +0000
Subject: [PATCH 04/23] testing on different decomposition

---
 core_compression.py          |   4 +-
 core_compression_parallel.py | 143 +++++++++++++++++++++++++++++++++++
 decomposition.py             |  17 +++--
 scripts/lr_quant.sh          |   8 +-
 scripts/playground.ipynb     |  53 ++++++++++++-
 to_hf.py                     |   2 +-
 6 files changed, 211 insertions(+), 16 deletions(-)
 create mode 100644 core_compression_parallel.py

diff --git a/core_compression.py b/core_compression.py
index 77098ea..7d6b1f8 100644
--- a/core_compression.py
+++ b/core_compression.py
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+from tqdm import tqdm
 from matq import TensorQ
 from loguru import logger
 from quant import Quantizer
@@ -72,7 +73,8 @@ def forward(self, inp, **kwargs):
     quantizers = {}
     l_quantizers = {}
     lr_tensors = {}
-    for i in range(len(delta_layers)):
+    # parallelize this to allocate to multiple GPUs
+    for i in tqdm(range(len(delta_layers))):
         layer = delta_layers[i].to(device)
         original_layer = layers[i].to(device)
 
diff --git a/core_compression_parallel.py b/core_compression_parallel.py
new file mode 100644
index 0000000..fca3c7d
--- /dev/null
+++ b/core_compression_parallel.py
@@ -0,0 +1,143 @@
+import torch
+import torch.nn as nn
+from tqdm import tqdm
+from matq import TensorQ
+from loguru import logger
+from quant import Quantizer
+from modelutils import find_layers
+import multiprocessing as mp
+@torch.no_grad()
+def opt_delta_lr(
+        model,
+        delta_model,
+        dataloader,
+        nsamples,
+        wbits,
+        sym,
+        trits,
+        rank,
+        args
+    ):
+    device = model.device
+    print("Starting LR quantizer initialization...")
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+    delta_layers = delta_model.model.decoder.layers
+
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(device)
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(device)
+
+    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(device)
+    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.to(device)
+    layers[0] = layers[0].to(device)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=device
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(device))
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+    
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
+    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
+    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    original_outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+    
+    logger.info("Ready, creating lr quantizers...")
+    quantizers = {}
+    l_quantizers = {}
+    lr_tensors = {}
+    # parallelize this to allocate to multiple GPUs
+    def process_layer(i, device):
+        layer = delta_layers[i].to(device)
+        original_layer = layers[i].to(device)
+        subset = find_layers(layer)
+        lr_gptq = {}
+        for name in subset:
+            lr_gptq[name] = TensorQ(subset[name], rank)
+            lr_gptq[name].quantizer = Quantizer()
+            lr_gptq[name].quantizer.configure(
+                wbits,
+                perchannel=True,
+                sym=sym,
+                mse=False,
+                trits = trits,
+            )
+            lr_gptq[name].l_quantizer = Quantizer()
+            lr_gptq[name].l_quantizer.configure(
+                wbits,
+                perchannel=True,
+                sym=sym,
+                mse=False,
+                trits = trits,
+            )
+        def add_batch(name):
+            def temp(_, inp, out):
+                lr_gptq[name].add_batch_lr(inp[0].data, out.data)
+            return temp
+        handles = []
+        for name in subset:
+            handles.append(subset[name].register_forward_hook(add_batch(name)))
+        for j in range(nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+
+            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        
+        for h in handles:
+            h.remove()
+        
+        for name in subset:
+            logger.info(f"Quantizing {name}...")
+            lr_gptq[name].lr_quant(
+                percdamp = args['percdamp'],
+                groupsize = args['groupsize'],
+                actorder = args['actorder'],
+            )
+            lr_tensors[f'<R>.model.decoder.layers.{i}.{name}'] = lr_gptq[name].R
+            lr_tensors[f'<L>.model.decoder.layers.{i}.{name}'] = lr_gptq[name].L
+            
+            quantizers[f'model.decoder.layers.{i}.{name}'] = lr_gptq[name].quantizer
+            l_quantizers[f'model.decoder.layers.{i}.{name}'] = lr_gptq[name].l_quantizer
+            lr_gptq[name].free()
+        
+        for j in range(nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        layers[i] = layer.cpu()
+        del layer
+        del lr_gptq
+        torch.cuda.empty_cache()
+        inps, outs = original_outs, inps
+    num_workers = torch.cuda.device_count()
+    logger.info(f"Using {num_workers} workers...")
+    with mp.Pool(num_workers) as p:
+        p.starmap(process_layer, [(i, f'cuda:{i}') for i in range(num_workers)])
+
+    model.config.use_cache = use_cache
+    return quantizers, l_quantizers, lr_tensors
\ No newline at end of file
diff --git a/decomposition.py b/decomposition.py
index 5c90042..b2df2ef 100644
--- a/decomposition.py
+++ b/decomposition.py
@@ -2,9 +2,9 @@
 import time
 from loguru import logger
 
-def pca_decomposition(matrix, rank, niter=500):
-    U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=niter)
-    return U, torch.diag_embed(S) @ Vh.T
+def svd_decomposition(matrix, rank, niter=500):
+    U, S, Vh = torch.svd_lowrank(matrix, q=rank)
+    return U @ torch.diag_embed(S),  Vh.T
 
 def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5):
     # Initialize random matrices U and V
@@ -36,21 +36,26 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500,
     return L, R
 
 if __name__=="__main__":
-    matrix = torch.rand((16,16))
-    rank = 4
+    matrix = torch.rand((1024,1024))
+    # matrix = torch.tensor([[1., 2., 3., 4.],[5., 6., 7., 8.],[9., 10., 11., 12.],[13., 14., 15., 16.]])
+    rank = 32
+    print("Original matrix:")
+    print(matrix)
     L, R = low_rank_decomposition(
         matrix,
         rank,
         learning_rate=1e-6,
         max_iterations=100000,
     )
-    L_pca, R_pca = pca_decomposition(matrix, rank)
+    L_pca, R_pca = svd_decomposition(matrix, rank)
     reconstructed_matrix_pca = L_pca @ R_pca
 
     reconstructed_matrix = L @ R
 
     print("Reconstructed matrix:")
     print(reconstructed_matrix)
+    print("Reconstructed matrix (pca):")
+    print(reconstructed_matrix_pca)
 
     print("difference:")
     print(torch.norm(matrix - reconstructed_matrix))
diff --git a/scripts/lr_quant.sh b/scripts/lr_quant.sh
index 9503a9b..6df8ed9 100644
--- a/scripts/lr_quant.sh
+++ b/scripts/lr_quant.sh
@@ -3,7 +3,7 @@ python cli.py \
     --target-model lnair/opt-1.3b-wikitext2 \
     --base-model facebook/opt-1.3b \
     --delta \
-    --rank 32 \
+    --rank 128 \
     --save outputs/ \
     --nsamples 128 \
     --wbits 8
@@ -13,7 +13,7 @@ python cli.py \
     --target-model lnair/opt-1.3b-wikitext2 \
     --base-model facebook/opt-1.3b \
     --delta \
-    --rank 128 \
+    --rank 256 \
     --save outputs/ \
     --nsamples 128 \
     --wbits 8
@@ -23,7 +23,7 @@ python cli.py \
     --target-model lnair/opt-1.3b-wikitext2 \
     --base-model facebook/opt-1.3b \
     --delta \
-    --rank 256 \
+    --rank 128 \
     --save outputs/ \
     --nsamples 128 \
-    --wbits 8
\ No newline at end of file
+    --wbits 4
\ No newline at end of file
diff --git a/scripts/playground.ipynb b/scripts/playground.ipynb
index 748f8cb..a6648c0 100644
--- a/scripts/playground.ipynb
+++ b/scripts/playground.ipynb
@@ -2,22 +2,67 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
+    "import sys\n",
     "seed=42\n",
     "target_model_name = \"lnair/opt-1.3b-wikitext2\"\n",
     "base_model_name = \"facebook/opt-1.3b\"\n",
     "n_samples = 128\n",
-    "dataset = 'wikitext2'"
+    "dataset = 'wikitext2'\n",
+    "sys.path.append('..')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/xiayao/miniconda3/envs/fmzip/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "OPTForCausalLM(\n",
+       "  (model): OPTModel(\n",
+       "    (decoder): OPTDecoder(\n",
+       "      (embed_tokens): Embedding(50272, 2048, padding_idx=1)\n",
+       "      (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)\n",
+       "      (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
+       "      (layers): ModuleList(\n",
+       "        (0-23): 24 x OPTDecoderLayer(\n",
+       "          (self_attn): OPTAttention(\n",
+       "            (k_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
+       "            (v_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
+       "            (q_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
+       "            (out_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
+       "          )\n",
+       "          (activation_fn): ReLU()\n",
+       "          (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
+       "          (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n",
+       "          (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n",
+       "          (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "  )\n",
+       "  (lm_head): Linear(in_features=2048, out_features=50272, bias=False)\n",
+       ")"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "from modelutils import get_opt\n",
     "base_model = get_opt(base_model_name)\n",
diff --git a/to_hf.py b/to_hf.py
index 10e0be6..e47318e 100644
--- a/to_hf.py
+++ b/to_hf.py
@@ -12,7 +12,7 @@
 from copy import deepcopy
 target_model = deepcopy(base_model)
 
-tensors = load_lr_tensors("outputs/model.safetensors")
+tensors = load_lr_tensors("outputs/lnair.opt-1.3b-wikitext2-r32-w8-lr.safetensors")
 
 target_layers = target_model.model.decoder.layers
 

From 02eee606d694b0f65798f9c89146577272d5fcc5 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Wed, 10 May 2023 06:58:44 +0000
Subject: [PATCH 05/23] now decomposition takes input into account

---
 decomposition.py      | 75 +++++++++++++++++++++++++++----------------
 lr_only.py            |  0
 scripts/lr_quant_2.sh | 29 +++++++++++++++++
 to_hf.py              |  8 ++---
 4 files changed, 81 insertions(+), 31 deletions(-)
 create mode 100644 lr_only.py
 create mode 100644 scripts/lr_quant_2.sh

diff --git a/decomposition.py b/decomposition.py
index b2df2ef..535f684 100644
--- a/decomposition.py
+++ b/decomposition.py
@@ -3,32 +3,47 @@
 from loguru import logger
 
 def svd_decomposition(matrix, rank, niter=500):
-    U, S, Vh = torch.svd_lowrank(matrix, q=rank)
+    U, S, Vh = torch.pca_lowrank(matrix, q=rank)
     return U @ torch.diag_embed(S),  Vh.T
 
-def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5):
+def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5, input_matrix = None):
     # Initialize random matrices U and V
     # m, n = matrix.shape
     # let's choose a good start point?
     # L, R = pca_decomposition(matrix, rank)
     # random seems to work better generally
-    L = torch.rand((matrix.shape[0], rank))
-    R = torch.rand((rank, matrix.shape[1]))
+    L = torch.rand((matrix.shape[0], rank), device=matrix.device)
+    R = torch.rand((rank, matrix.shape[1]), device=matrix.device)
+
     tick = time.time()
     early_stop = False
-    for i in range(max_iterations):
-        # Calculate the difference between the original and reconstructed matrices
-        difference = matrix - L @ R
-        
-        # Calculate the gradients
-        gradient_L = -2 * (difference @ R.T)
-        gradient_R = -2 * (L.T @ difference)
-        L -= learning_rate * gradient_L
-        R -= learning_rate * gradient_R
-        if torch.norm(difference) < tolerance:
-            early_stop = True
-            break
-        
+    if input_matrix is None:
+        for i in range(max_iterations):
+            # Calculate the difference between the original and reconstructed matrices
+            difference = matrix - L @ R
+
+            # Calculate the gradients
+            gradient_L = -2 * (difference @ R.T)
+            gradient_R = -2 * (L.T @ difference)
+            L -= learning_rate * gradient_L
+            R -= learning_rate * gradient_R
+            if torch.norm(difference) < tolerance:
+                early_stop = True
+                break
+    else:
+        W = matrix
+        X = input_matrix
+        for i in range(max_iterations):
+
+            gradient_L = -2 * (matrix@X - L @ (R @ X)) @ ((R@X).T)
+            # gradient_L = -2 * np.dot((WX - np.dot(U, np.dot(V, X))), np.dot(V, X).T)
+            gradient_R = -2 * (L.T @ (W@X - L @ (R @ X)))
+            # gradient_V = -2 * np.dot(U.T, (WX - np.dot(U, np.dot(V, X))))
+            L -= learning_rate * gradient_L
+            R -= learning_rate * gradient_R
+            if torch.norm(W@X - L @ (R @ X)) < tolerance:
+                early_stop = True
+                break
     if not early_stop:
         logger.warning(f"Low rank decomposition did not converge. Elapsed time: {time.time() - tick}")
     else:
@@ -38,25 +53,31 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500,
 if __name__=="__main__":
     matrix = torch.rand((1024,1024))
     # matrix = torch.tensor([[1., 2., 3., 4.],[5., 6., 7., 8.],[9., 10., 11., 12.],[13., 14., 15., 16.]])
-    rank = 32
-    print("Original matrix:")
+    input_matrix = torch.rand((1024,16))
+    
     print(matrix)
+    rank = 32
+    print("Original output:")
+    original_output = matrix @ input_matrix
+    print(original_output)
     L, R = low_rank_decomposition(
         matrix,
         rank,
         learning_rate=1e-6,
         max_iterations=100000,
     )
-    L_pca, R_pca = svd_decomposition(matrix, rank)
-    reconstructed_matrix_pca = L_pca @ R_pca
+    # L_pca, R_pca = svd_decomposition(matrix, rank)
+    # reconstructed_matrix_pca = L_pca @ R_pca
+    print(L.shape)
+    print(R.shape)
 
-    reconstructed_matrix = L @ R
+    reconstructed_matrix = L @ R @ input_matrix
 
-    print("Reconstructed matrix:")
+    print("Reconstructed output:")
     print(reconstructed_matrix)
-    print("Reconstructed matrix (pca):")
-    print(reconstructed_matrix_pca)
+    # print("Reconstructed matrix (pca):")
+    # print(reconstructed_matrix_pca)
 
     print("difference:")
-    print(torch.norm(matrix - reconstructed_matrix))
-    print(torch.norm(matrix - reconstructed_matrix_pca))
\ No newline at end of file
+    print(torch.norm(original_output - reconstructed_matrix))
+    #print(torch.norm(matrix - reconstructed_matrix_pca))
\ No newline at end of file
diff --git a/lr_only.py b/lr_only.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/lr_quant_2.sh b/scripts/lr_quant_2.sh
new file mode 100644
index 0000000..19865ec
--- /dev/null
+++ b/scripts/lr_quant_2.sh
@@ -0,0 +1,29 @@
+python cli.py \
+    --dataset wikitext2 \
+    --target-model lnair/opt-1.3b-wikitext2 \
+    --base-model facebook/opt-1.3b \
+    --delta \
+    --rank 512 \
+    --save outputs/ \
+    --nsamples 128 \
+    --wbits 8
+
+python cli.py \
+    --dataset wikitext2 \
+    --target-model lnair/opt-1.3b-wikitext2 \
+    --base-model facebook/opt-1.3b \
+    --delta \
+    --rank 512 \
+    --save outputs/ \
+    --nsamples 128 \
+    --wbits 4
+
+python cli.py \
+    --dataset wikitext2 \
+    --target-model lnair/opt-1.3b-wikitext2 \
+    --base-model facebook/opt-1.3b \
+    --delta \
+    --rank 1024 \
+    --save outputs/ \
+    --nsamples 128 \
+    --wbits 8
\ No newline at end of file
diff --git a/to_hf.py b/to_hf.py
index e47318e..155b349 100644
--- a/to_hf.py
+++ b/to_hf.py
@@ -11,8 +11,8 @@
 from modelutils import find_layers
 from copy import deepcopy
 target_model = deepcopy(base_model)
-
-tensors = load_lr_tensors("outputs/lnair.opt-1.3b-wikitext2-r32-w8-lr.safetensors")
+MODEL_ID = "lnair.opt-1.3b-wikitext2-r128-w8-lr"
+tensors = load_lr_tensors(f"outputs/{MODEL_ID}.safetensors")
 
 target_layers = target_model.model.decoder.layers
 
@@ -27,6 +27,6 @@
         layer[layer_id].weight.data = new_weight
 
 # save target model as HF
-target_model.save_pretrained("outputs/lnair-opt-1.3b-wikitext2-r32-w8")
+target_model.save_pretrained(f"outputs/{MODEL_ID}")
 tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-tokenizer.save_pretrained("outputs/lnair-opt-1.3b-wikitext2-r32-w8")
\ No newline at end of file
+tokenizer.save_pretrained(f"outputs/{MODEL_ID}")
\ No newline at end of file

From 1eb5a11bc7f1174f6640cbbafc8fb03e4e41fd20 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Wed, 10 May 2023 08:09:18 +0000
Subject: [PATCH 06/23] decomposition: fix input-aware R gradient, add autograd baseline

---
 core_compression.py |  2 +-
 decomposition.py    | 59 ++++++++++++++++++++++++++++++++-------------
 gptq.py             |  2 +-
 lr_only.py          | 42 ++++++++++++++++++++++++++++++++
 matq.py             | 15 +++++++-----
 scripts/lr_quant.sh | 24 ++----------------
 6 files changed, 97 insertions(+), 47 deletions(-)

diff --git a/core_compression.py b/core_compression.py
index 7d6b1f8..93fb548 100644
--- a/core_compression.py
+++ b/core_compression.py
@@ -73,7 +73,7 @@ def forward(self, inp, **kwargs):
     quantizers = {}
     l_quantizers = {}
     lr_tensors = {}
-    # parallelize this to allocate to multiple GPUs
+    # parallelize this to allocate to multiple GPUs?
     for i in tqdm(range(len(delta_layers))):
         layer = delta_layers[i].to(device)
         original_layer = layers[i].to(device)
diff --git a/decomposition.py b/decomposition.py
index 535f684..a183566 100644
--- a/decomposition.py
+++ b/decomposition.py
@@ -1,5 +1,6 @@
-import torch
 import time
+from tqdm import tqdm
+import torch
 from loguru import logger
 
 def svd_decomposition(matrix, rank, niter=500):
@@ -34,10 +35,10 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500,
         W = matrix
         X = input_matrix
         for i in range(max_iterations):
-
-            gradient_L = -2 * (matrix@X - L @ (R @ X)) @ ((R@X).T)
+            diff = W@X - L @ R @ X
+            gradient_L = -2 * (diff @ ((R@X).T))
             # gradient_L = -2 * np.dot((WX - np.dot(U, np.dot(V, X))), np.dot(V, X).T)
-            gradient_R = -2 * (L.T @ (W@X - L @ (R @ X)))
+            gradient_R = -2 * (L.T @ diff @ X.T)
             # gradient_V = -2 * np.dot(U.T, (WX - np.dot(U, np.dot(V, X))))
             L -= learning_rate * gradient_L
             R -= learning_rate * gradient_R
@@ -50,13 +51,26 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500,
         logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}")
     return L, R
 
+def torch_autograd(W, X, rank, lr, steps):
+    L = torch.rand((W.shape[0], rank), device=W.device, requires_grad=True)
+    R = torch.rand((rank, W.shape[1]), device=W.device, requires_grad=True)
+    optimizer = torch.optim.SGD([L, R], lr=lr)
+    for _ in tqdm(range(steps)):
+        optimizer.zero_grad()
+        output = L @ R @ X
+        target = W @ X
+        loss = torch.nn.functional.mse_loss(output, target)
+        loss.backward()
+        optimizer.step()
+    return L, R
+
 if __name__=="__main__":
-    matrix = torch.rand((1024,1024))
-    # matrix = torch.tensor([[1., 2., 3., 4.],[5., 6., 7., 8.],[9., 10., 11., 12.],[13., 14., 15., 16.]])
-    input_matrix = torch.rand((1024,16))
+    #matrix = torch.rand((1024,1024))
+    matrix = torch.rand((16, 16))
+    input_matrix = torch.rand((16,2))
     
     print(matrix)
-    rank = 32
+    rank = 4
     print("Original output:")
     original_output = matrix @ input_matrix
     print(original_output)
@@ -64,20 +78,31 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500,
         matrix,
         rank,
         learning_rate=1e-6,
-        max_iterations=100000,
+        max_iterations=1000,
+        input_matrix=input_matrix
     )
-    # L_pca, R_pca = svd_decomposition(matrix, rank)
-    # reconstructed_matrix_pca = L_pca @ R_pca
-    print(L.shape)
-    print(R.shape)
+    L_noinput, R_noinput = low_rank_decomposition(
+        matrix,
+        rank,
+        learning_rate=1e-6,
+        max_iterations=1000,
+    )
+    L_autograd, R_autograd = torch_autograd(matrix, input_matrix, rank, 1e-6, 1000)
+    # # L_pca, R_pca = svd_decomposition(matrix, rank)
+    # # reconstructed_matrix_pca = L_pca @ R_pca
 
-    reconstructed_matrix = L @ R @ input_matrix
 
+    reconstructed_matrix = L @ R @ input_matrix
+    reconstructed_matrix_pca = L_autograd @ R_autograd @ input_matrix
+    reconstructed_matrix_noinput = L_noinput @ R_noinput @ input_matrix
     print("Reconstructed output:")
     print(reconstructed_matrix)
-    # print("Reconstructed matrix (pca):")
-    # print(reconstructed_matrix_pca)
+    print("Reconstructed matrix (autograd):")
+    print(reconstructed_matrix_pca)
+    print("Reconstructed matrix (noinput):")
+    print(reconstructed_matrix_noinput)
 
     print("difference:")
     print(torch.norm(original_output - reconstructed_matrix))
-    #print(torch.norm(matrix - reconstructed_matrix_pca))
\ No newline at end of file
+    print(torch.norm(original_output - reconstructed_matrix_pca))
+    print(torch.norm(original_output - reconstructed_matrix_noinput))
\ No newline at end of file
diff --git a/gptq.py b/gptq.py
index 2477cac..8f719e1 100644
--- a/gptq.py
+++ b/gptq.py
@@ -152,4 +152,4 @@ def free(self):
         self.H = None
         self.Losses = None
         self.Trace = None
-        torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
\ No newline at end of file
diff --git a/lr_only.py b/lr_only.py
index e69de29..dc392c2 100644
--- a/lr_only.py
+++ b/lr_only.py
@@ -0,0 +1,42 @@
+import copy
+import torch
+import argparse
+import torch.nn as nn
+from loguru import logger
+from evaluation import opt_eval
+from datautils import get_loaders
+from core_compression import opt_delta_lr
+from modelutils import get_opt, find_layers
+from save_and_load import save_lr_tensors, load_lr_tensors
+
+@torch.no_grad()
+def lowrank_decomposition(model, rank, n_samples, data_loader=None):
+    lr_iopairs = {}
+    
+    def add_batch(name):
+        def temp(_, inp, out):
+            lr_iopairs[name] = (inp, out)
+        return temp
+    layers = model.model.decoder.layers
+    inps = torch.zeros(
+        (n_samples, model.seqlen, model.config.hidden_size), dtype=torch.fp16, device=model.device
+    )
+    handles = []
+    for i in range(len(layers)):
+        subset = find_layers(layers[i])
+        for name in subset:
+            handles.append(subset[name].register_forward_hook(add_batch(f"decoder.layers.{i}.{name}")))
+            layer_id = f"decoder.layers.{i}.{name}"
+            decomposing_layer = subset[name].weight
+            # decompose this into low rank matrices
+            
+if __name__=="__main__":
+    base_model = get_opt('facebook/opt-1.3b')
+    trainloader, loader_enc = get_loaders(
+        'wikitext2',
+        nsamples = 128,
+        seed=42,
+        model='facebook/opt-1.3b',
+        seqlen=base_model.seqlen,
+    )
+    lowrank_decomposition(base_model, 32, 128, trainloader)
\ No newline at end of file
diff --git a/matq.py b/matq.py
index 13dbbf5..f78aa35 100644
--- a/matq.py
+++ b/matq.py
@@ -22,19 +22,19 @@ def __init__(self, layer, rank=32):
         if isinstance(self.layer, transformers.Conv1D):
             W = W.t()
         self.rank = rank
-        self.decompose()
+        # self.decompose()
         self.rows = W.shape[0]
         self.columns = W.shape[1]
-        self.L_columns = self.L.shape[1]
+        self.L_columns = rank
         self.H = torch.zeros((self.columns, self.columns), device=self.dev)
         self.H_R = torch.zeros((self.columns, self.columns), device=self.dev)
         self.H_L = torch.zeros((self.L_columns, self.L_columns), device=self.dev)
         self.nsamples = 0
 
     def add_batch_lr(self, inp, out):
-        if DEBUG:
-            self.inp1 = inp
-            self.out1 = out
+        #if DEBUG:
+        # self.inp1 = inp
+        # self.out1 = out
         if len(inp.shape) == 2:
             inp = inp.unsqueeze(0)
         tmp = inp.shape[0]
@@ -54,8 +54,9 @@ def add_batch_lr(self, inp, out):
             inp = inp.flatten(1)
         self.H_R *= self.nsamples / (self.nsamples + tmp)
         self.nsamples += tmp
-        inp = math.sqrt(2 / self.nsamples) * inp.float()
 
+        inp = math.sqrt(2 / self.nsamples) * inp.float()
+        self.inp = inp
         self.H_R += inp.matmul(inp.t())
         # logger.info(f"self.H_R: {self.H_R.shape}")
         # for L, consider the input to be R@X
@@ -76,6 +77,7 @@ def free(self):
         torch.cuda.empty_cache()
 
     def decompose(self):
+        print(self.inp.shape)
         W = self.layer.weight.data.clone()
         if isinstance(self.layer, nn.Conv2d):
             W = W.flatten(1)
@@ -93,6 +95,7 @@ def decompose(self):
         logger.info(f"decomposition done. elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}")
 
     def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
+        self.decompose()
         self.lr_quant_R(blocksize, percdamp, groupsize, actorder)
         self.lr_quant_L(blocksize, percdamp, groupsize, actorder)
         # restored weight is L@R
diff --git a/scripts/lr_quant.sh b/scripts/lr_quant.sh
index 6df8ed9..b716288 100644
--- a/scripts/lr_quant.sh
+++ b/scripts/lr_quant.sh
@@ -3,27 +3,7 @@ python cli.py \
     --target-model lnair/opt-1.3b-wikitext2 \
     --base-model facebook/opt-1.3b \
     --delta \
-    --rank 128 \
+    --rank 16 \
     --save outputs/ \
     --nsamples 128 \
-    --wbits 8
-
-python cli.py \
-    --dataset wikitext2 \
-    --target-model lnair/opt-1.3b-wikitext2 \
-    --base-model facebook/opt-1.3b \
-    --delta \
-    --rank 256 \
-    --save outputs/ \
-    --nsamples 128 \
-    --wbits 8
-
-python cli.py \
-    --dataset wikitext2 \
-    --target-model lnair/opt-1.3b-wikitext2 \
-    --base-model facebook/opt-1.3b \
-    --delta \
-    --rank 128 \
-    --save outputs/ \
-    --nsamples 128 \
-    --wbits 4
\ No newline at end of file
+    --wbits 8
\ No newline at end of file

From 31e72198bb4a87160c7d0208ba67a42536f94baa Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Wed, 10 May 2023 10:16:27 +0000
Subject: [PATCH 07/23] add --decompose-only flag and input-sensitive decomposition option

---
 cli.py                | 12 ++++++---
 core_compression.py   |  8 +++---
 decomposition.py      | 10 ++------
 matq.py               | 58 ++++++++++++++++---------------------------
 scripts/lr_quant.sh   | 11 ++++++++
 scripts/lr_quant_2.sh | 15 +++--------
 to_hf.py              |  2 +-
 7 files changed, 51 insertions(+), 65 deletions(-)

diff --git a/cli.py b/cli.py
index dece358..2159fb5 100644
--- a/cli.py
+++ b/cli.py
@@ -10,7 +10,7 @@
 from core_compression import opt_delta_lr
 
 @torch.no_grad()
-def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_samples):
+def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_samples, decompose_only=False):
     # first do low rank approximation
     # then quantize
     original_finetuned_model = copy.deepcopy(target_model)
@@ -29,7 +29,8 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s
             'percdamp': 0.01,
             'groupsize': -1,
             'actorder': False,
-        }
+        },
+        decompose_only=decompose_only
     )
     
     target_model.to(base_model.device)
@@ -53,9 +54,11 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s
     argparser.add_argument('--save', type=str, default='', help='Path to save the quantized model')
     argparser.add_argument('--wbits', type=int, default=8, help='Number of bits to use for quantization')
     argparser.add_argument('--sym', action='store_true', default=True, help='Whether to use symmetric quantization')
+    argparser.add_argument('--decompose-only', action='store_true', default=False, help='Whether to use quantization')
     argparser.add_argument('--trits', action='store_true', default=False, help='Whether to use trits')
 
     args = argparser.parse_args()
+    print(args)
     seed = args.seed
     
     base_model = get_opt(args.base_model)
@@ -77,10 +80,11 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s
         trainloader,
         args.rank,
         args.wbits,
-        args.nsamples
+        args.nsamples,
+        args.decompose_only,
     )
     if args.save:
-        save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors")
+        save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-decompose.{args.decompose_only}-lr.safetensors")
 
     ppl = opt_eval(target_model, loader_enc, args, target_model.device)
     logger.info(f"Perplexity: {ppl}")
\ No newline at end of file
diff --git a/core_compression.py b/core_compression.py
index 93fb548..6e79ba4 100644
--- a/core_compression.py
+++ b/core_compression.py
@@ -16,7 +16,8 @@ def opt_delta_lr(
         sym,
         trits,
         rank,
-        args
+        args,
+        decompose_only=False,
     ):
     device = model.device
     print("Starting LR quantizer initialization...")
@@ -49,6 +50,7 @@ def forward(self, inp, **kwargs):
             cache['i'] += 1
             cache['attention_mask'] = kwargs['attention_mask']
             raise ValueError
+    
     layers[0] = Catcher(layers[0])
     for batch in dataloader:
         try:
@@ -64,7 +66,6 @@ def forward(self, inp, **kwargs):
     if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
         model.model.decoder.project_in = model.model.decoder.project_in.cpu()
     torch.cuda.empty_cache()
-
     outs = torch.zeros_like(inps)
     original_outs = torch.zeros_like(inps)
     attention_mask = cache['attention_mask']
@@ -81,7 +82,7 @@ def forward(self, inp, **kwargs):
         subset = find_layers(layer)
         lr_gptq = {}
         for name in subset:
-            lr_gptq[name] = TensorQ(subset[name], rank)
+            lr_gptq[name] = TensorQ(subset[name], rank, sensitive_decompose=True)
             lr_gptq[name].quantizer = Quantizer()
             lr_gptq[name].quantizer.configure(
                 wbits,
@@ -121,6 +122,7 @@ def temp(_, inp, out):
                 percdamp=args['percdamp'],
                 groupsize=args['groupsize'],
                 actorder=args['actorder'],
+                decompose_only=decompose_only,
             )
             lr_tensors[f'<R>.model.decoder.layers.{i}.{name}'] = lr_gptq[name].R
             lr_tensors[f'<L>.model.decoder.layers.{i}.{name}'] = lr_gptq[name].L
diff --git a/decomposition.py b/decomposition.py
index a183566..4099728 100644
--- a/decomposition.py
+++ b/decomposition.py
@@ -1,6 +1,6 @@
 import time
-from tqdm import tqdm
 import torch
+from tqdm import tqdm
 from loguru import logger
 
 def svd_decomposition(matrix, rank, niter=500):
@@ -22,7 +22,6 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500,
         for i in range(max_iterations):
             # Calculate the difference between the original and reconstructed matrices
             difference = matrix - L @ R
-
             # Calculate the gradients
             gradient_L = -2 * (difference @ R.T)
             gradient_R = -2 * (L.T @ difference)
@@ -37,18 +36,13 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500,
         for i in range(max_iterations):
             diff = W@X - L @ R @ X
             gradient_L = -2 * (diff @ ((R@X).T))
-            # gradient_L = -2 * np.dot((WX - np.dot(U, np.dot(V, X))), np.dot(V, X).T)
             gradient_R = -2 * (L.T @ diff @ X.T)
-            # gradient_V = -2 * np.dot(U.T, (WX - np.dot(U, np.dot(V, X))))
             L -= learning_rate * gradient_L
             R -= learning_rate * gradient_R
             if torch.norm(W@X - L @ (R @ X)) < tolerance:
                 early_stop = True
                 break
-    if not early_stop:
-        logger.warning(f"Low rank decomposition did not converge. Elapsed time: {time.time() - tick}")
-    else:
-        logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}")
+    logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {torch.norm(W@X - L @ (R @ X))}")
     return L, R
 
 def torch_autograd(W, X, rank, lr, steps):
diff --git a/matq.py b/matq.py
index f78aa35..9a40374 100644
--- a/matq.py
+++ b/matq.py
@@ -13,7 +13,7 @@
 torch.backends.cudnn.allow_tf32 = False
 
 class TensorQ:
-    def __init__(self, layer, rank=32):
+    def __init__(self, layer, rank=32, sensitive_decompose=False):
         self.layer = layer
         self.dev = self.layer.weight.device
         W = layer.weight.data.clone()
@@ -22,7 +22,8 @@ def __init__(self, layer, rank=32):
         if isinstance(self.layer, transformers.Conv1D):
             W = W.t()
         self.rank = rank
-        # self.decompose()
+        if not sensitive_decompose:
+            self.decompose()
         self.rows = W.shape[0]
         self.columns = W.shape[1]
         self.L_columns = rank
@@ -37,33 +38,24 @@ def add_batch_lr(self, inp, out):
         # self.out1 = out
         if len(inp.shape) == 2:
             inp = inp.unsqueeze(0)
-        tmp = inp.shape[0]
-        if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D):
+        self.tmp = inp.shape[0]
+        if isinstance(self.layer, nn.Linear):
             if len(inp.shape) == 3:
                 inp = inp.reshape((-1, inp.shape[-1]))
             inp = inp.t()
-        if isinstance(self.layer, nn.Conv2d):
-            unfold = nn.Unfold(
-                self.layer.kernel_size,
-                dilation=self.layer.dilation,
-                padding=self.layer.padding,
-                stride=self.layer.stride
-            )
-            inp = unfold(inp)
-            inp = inp.permute([1, 0, 2])
-            inp = inp.flatten(1)
-        self.H_R *= self.nsamples / (self.nsamples + tmp)
-        self.nsamples += tmp
-
+        
+        self.H_R *= self.nsamples / (self.nsamples + self.tmp)
+        self.nsamples += self.tmp
         inp = math.sqrt(2 / self.nsamples) * inp.float()
         self.inp = inp
-        self.H_R += inp.matmul(inp.t())
+
+    def calculate_hessian(self):
+        self.H_R += self.inp.matmul(self.inp.t())
         # logger.info(f"self.H_R: {self.H_R.shape}")
         # for L, consider the input to be R@X
-        inp = self.R @ inp
-        self.H_L *= self.nsamples / (self.nsamples + tmp)
-        self.H_L += inp.matmul(inp.t())
-        # logger.info(f"self.H_L: {self.H_L.shape}")
+        l_inp = self.R @ self.inp
+        self.H_L *= self.nsamples / (self.nsamples + self.tmp)
+        self.H_L += l_inp.matmul(l_inp.t())
 
     def free(self):
         if DEBUG:
@@ -77,28 +69,20 @@ def free(self):
         torch.cuda.empty_cache()
 
     def decompose(self):
-        print(self.inp.shape)
         W = self.layer.weight.data.clone()
-        if isinstance(self.layer, nn.Conv2d):
-            W = W.flatten(1)
-        if isinstance(self.layer, transformers.Conv1D):
-            W = W.t()
         W = W.float()
         logger.info("starting decomposition")
         tick = time.time()
-        # U, S, Vh = torch.pca_lowrank(W, q=self.rank, center=True, niter=5)
-        # # let's say L = U
-        # # and R = diag(S)*V.T
-        # self.L = U
-        # self.R = torch.diag_embed(S) @ Vh.T
-        self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-6, max_iterations=100000)
+        self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-6, max_iterations=10000, input_matrix=self.inp)
         logger.info(f"decomposition done. elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}")
 
-    def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
+    def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, decompose_only=False):
         self.decompose()
-        self.lr_quant_R(blocksize, percdamp, groupsize, actorder)
-        self.lr_quant_L(blocksize, percdamp, groupsize, actorder)
-        # restored weight is L@R
+        if not decompose_only:
+            self.calculate_hessian()
+            self.lr_quant_R(blocksize, percdamp, groupsize, actorder)
+            self.lr_quant_L(blocksize, percdamp, groupsize, actorder)
+        # restored weight is L@R, we overwrite the weight for evaluation if needed
         # but on disk we only save L, R
         self.layer.weight.data = (self.L @ self.R).reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
         
diff --git a/scripts/lr_quant.sh b/scripts/lr_quant.sh
index b716288..2665e7b 100644
--- a/scripts/lr_quant.sh
+++ b/scripts/lr_quant.sh
@@ -6,4 +6,15 @@ python cli.py \
     --rank 16 \
     --save outputs/ \
     --nsamples 128 \
+    --wbits 8
+
+python cli.py \
+    --dataset wikitext2 \
+    --target-model lnair/opt-1.3b-wikitext2 \
+    --base-model facebook/opt-1.3b \
+    --delta \
+    --rank 16 \
+    --save outputs/ \
+    --nsamples 128 \
+    --decompose-only \
     --wbits 8
\ No newline at end of file
diff --git a/scripts/lr_quant_2.sh b/scripts/lr_quant_2.sh
index 19865ec..2250e30 100644
--- a/scripts/lr_quant_2.sh
+++ b/scripts/lr_quant_2.sh
@@ -3,9 +3,10 @@ python cli.py \
     --target-model lnair/opt-1.3b-wikitext2 \
     --base-model facebook/opt-1.3b \
     --delta \
-    --rank 512 \
+    --rank 32 \
     --save outputs/ \
     --nsamples 128 \
+    --decompose-only \
     --wbits 8
 
 python cli.py \
@@ -13,17 +14,7 @@ python cli.py \
     --target-model lnair/opt-1.3b-wikitext2 \
     --base-model facebook/opt-1.3b \
     --delta \
-    --rank 512 \
-    --save outputs/ \
-    --nsamples 128 \
-    --wbits 4
-
-python cli.py \
-    --dataset wikitext2 \
-    --target-model lnair/opt-1.3b-wikitext2 \
-    --base-model facebook/opt-1.3b \
-    --delta \
-    --rank 1024 \
+    --rank 32 \
     --save outputs/ \
     --nsamples 128 \
     --wbits 8
\ No newline at end of file
diff --git a/to_hf.py b/to_hf.py
index 155b349..2deb4a5 100644
--- a/to_hf.py
+++ b/to_hf.py
@@ -11,7 +11,7 @@
 from modelutils import find_layers
 from copy import deepcopy
 target_model = deepcopy(base_model)
-MODEL_ID = "lnair.opt-1.3b-wikitext2-r128-w8-lr"
+MODEL_ID = "lnair.opt-1.3b-wikitext2-r256-w8-lr"
 tensors = load_lr_tensors(f"outputs/{MODEL_ID}.safetensors")
 
 target_layers = target_model.model.decoder.layers

From 22507fba91a67b6170d27cfc576643b61f222b62 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Wed, 10 May 2023 12:03:40 +0000
Subject: [PATCH 08/23] mse metric

---
 decomposition.py | 81 +++++++++++++++++++++---------------------------
 1 file changed, 35 insertions(+), 46 deletions(-)

diff --git a/decomposition.py b/decomposition.py
index 4099728..8d925be 100644
--- a/decomposition.py
+++ b/decomposition.py
@@ -2,47 +2,45 @@
 import torch
 from tqdm import tqdm
 from loguru import logger
+import torch.nn.functional as F
 
-def svd_decomposition(matrix, rank, niter=500):
+def svd_decomposition(matrix, rank):
     U, S, Vh = torch.pca_lowrank(matrix, q=rank)
     return U @ torch.diag_embed(S),  Vh.T
 
-def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5, input_matrix = None):
-    # Initialize random matrices U and V
-    # m, n = matrix.shape
-    # let's choose a good start point?
-    # L, R = pca_decomposition(matrix, rank)
-    # random seems to work better generally
-    L = torch.rand((matrix.shape[0], rank), device=matrix.device)
-    R = torch.rand((rank, matrix.shape[1]), device=matrix.device)
-
+def low_rank_decomposition(W, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5, X = None):
+    L = torch.rand((W.shape[0], rank), device=W.device)
+    R = torch.rand((rank, W.shape[1]), device=W.device)
     tick = time.time()
     early_stop = False
-    if input_matrix is None:
+    if X is None:
         for i in range(max_iterations):
             # Calculate the difference between the original and reconstructed matrices
-            difference = matrix - L @ R
+            diff_part1 = W
+            diff_part2 = L @ R
+            difference = W - L @ R
             # Calculate the gradients
             gradient_L = -2 * (difference @ R.T)
             gradient_R = -2 * (L.T @ difference)
             L -= learning_rate * gradient_L
             R -= learning_rate * gradient_R
-            if torch.norm(difference) < tolerance:
+            if F.mse_loss(diff_part1, diff_part2) < tolerance:
                 early_stop = True
                 break
+        logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(diff_part1, diff_part2)}")
     else:
-        W = matrix
-        X = input_matrix
         for i in range(max_iterations):
-            diff = W@X - L @ R @ X
+            diff_part1 = W@X
+            diff_part2 = L @ R @ X
+            diff = diff_part1 - diff_part2
             gradient_L = -2 * (diff @ ((R@X).T))
             gradient_R = -2 * (L.T @ diff @ X.T)
             L -= learning_rate * gradient_L
             R -= learning_rate * gradient_R
-            if torch.norm(W@X - L @ (R @ X)) < tolerance:
+            if F.mse_loss(diff_part1, diff_part2) < tolerance:
                 early_stop = True
                 break
-    logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {torch.norm(W@X - L @ (R @ X))}")
+        logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(diff_part1, diff_part2)}")
     return L, R
 
 def torch_autograd(W, X, rank, lr, steps):
@@ -59,44 +57,35 @@ def torch_autograd(W, X, rank, lr, steps):
     return L, R
 
 if __name__=="__main__":
-    #matrix = torch.rand((1024,1024))
-    matrix = torch.rand((16, 16))
-    input_matrix = torch.rand((16,2))
+    FULL_RANK = 128
+    LOW_RANK = 16
+    TARGET_SIZE = 2
+
+    W = torch.rand((FULL_RANK, FULL_RANK))
+    input_matrix = torch.rand((FULL_RANK, TARGET_SIZE))
+    output_matrix = W @ input_matrix
     
-    print(matrix)
-    rank = 4
     print("Original output:")
-    original_output = matrix @ input_matrix
-    print(original_output)
-    L, R = low_rank_decomposition(
-        matrix,
-        rank,
+    print(output_matrix)
+    L_sensitive, R_sensitive = low_rank_decomposition(
+        W,
+        LOW_RANK,
         learning_rate=1e-6,
         max_iterations=1000,
-        input_matrix=input_matrix
+        X=input_matrix
     )
     L_noinput, R_noinput = low_rank_decomposition(
-        matrix,
-        rank,
+        W,
+        LOW_RANK,
         learning_rate=1e-6,
         max_iterations=1000,
     )
-    L_autograd, R_autograd = torch_autograd(matrix, input_matrix, rank, 1e-6, 1000)
-    # # L_pca, R_pca = svd_decomposition(matrix, rank)
-    # # reconstructed_matrix_pca = L_pca @ R_pca
+    L_autograd, R_autograd = torch_autograd(W, input_matrix, LOW_RANK, 1e-6, 1000)
 
-
-    reconstructed_matrix = L @ R @ input_matrix
+    reconstructed_matrix = L_sensitive @ R_sensitive @ input_matrix
     reconstructed_matrix_pca = L_autograd @ R_autograd @ input_matrix
     reconstructed_matrix_noinput = L_noinput @ R_noinput @ input_matrix
-    print("Reconstructed output:")
-    print(reconstructed_matrix)
-    print("Reconstructed matrix (autograd):")
-    print(reconstructed_matrix_pca)
-    print("Reconstructed matrix (noinput):")
-    print(reconstructed_matrix_noinput)
-
     print("difference:")
-    print(torch.norm(original_output - reconstructed_matrix))
-    print(torch.norm(original_output - reconstructed_matrix_pca))
-    print(torch.norm(original_output - reconstructed_matrix_noinput))
\ No newline at end of file
+    print(f"gd: {F.mse_loss(output_matrix, reconstructed_matrix)}")
+    print(f"autograd: {F.mse_loss(output_matrix, reconstructed_matrix_pca)}")
+    print(f"noinput gd: {F.mse_loss(output_matrix, reconstructed_matrix_noinput)}")

From 005cb659edf1b086cb9267110245bcca6e9c914c Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Wed, 10 May 2023 14:30:36 +0000
Subject: [PATCH 09/23] more complex scheduler...

---
 decomposition.py         | 67 ++++++++++++++++++++--------------------
 matq.py                  |  4 +--
 scripts/lr_quant_350m.sh | 10 ++++++
 to_hf.py                 |  2 +-
 4 files changed, 47 insertions(+), 36 deletions(-)
 create mode 100644 scripts/lr_quant_350m.sh

diff --git a/decomposition.py b/decomposition.py
index 8d925be..5663594 100644
--- a/decomposition.py
+++ b/decomposition.py
@@ -2,6 +2,7 @@
 import torch
 from tqdm import tqdm
 from loguru import logger
+from torch.optim.lr_scheduler import ExponentialLR
 import torch.nn.functional as F
 
 def svd_decomposition(matrix, rank):
@@ -14,78 +15,78 @@ def low_rank_decomposition(W, rank, learning_rate=0.01, max_iterations=500, tole
     tick = time.time()
     early_stop = False
     if X is None:
-        for i in range(max_iterations):
-            # Calculate the difference between the original and reconstructed matrices
-            diff_part1 = W
-            diff_part2 = L @ R
+        for i in tqdm(range(max_iterations)):
             difference = W - L @ R
-            # Calculate the gradients
             gradient_L = -2 * (difference @ R.T)
             gradient_R = -2 * (L.T @ difference)
             L -= learning_rate * gradient_L
             R -= learning_rate * gradient_R
-            if F.mse_loss(diff_part1, diff_part2) < tolerance:
+            if F.mse_loss(W, L@R) < tolerance:
                 early_stop = True
                 break
-        logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(diff_part1, diff_part2)}")
+        logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(W, L@R)}")
     else:
-        for i in range(max_iterations):
-            diff_part1 = W@X
-            diff_part2 = L @ R @ X
-            diff = diff_part1 - diff_part2
+        for i in tqdm(range(max_iterations)):
+            diff = W @ X - L @ R @ X
             gradient_L = -2 * (diff @ ((R@X).T))
             gradient_R = -2 * (L.T @ diff @ X.T)
             L -= learning_rate * gradient_L
             R -= learning_rate * gradient_R
-            if F.mse_loss(diff_part1, diff_part2) < tolerance:
+            if F.mse_loss(W @ X, L @ R @ X) < tolerance:
                 early_stop = True
                 break
-        logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(diff_part1, diff_part2)}")
+            # print(F.mse_loss(W @ X, L @ R @ X))
+        logger.info(f"[With Input] Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(W@X, L@R@X)}")
     return L, R
 
 def torch_autograd(W, X, rank, lr, steps):
     L = torch.rand((W.shape[0], rank), device=W.device, requires_grad=True)
     R = torch.rand((rank, W.shape[1]), device=W.device, requires_grad=True)
-    optimizer = torch.optim.SGD([L, R], lr=lr)
-    for _ in tqdm(range(steps)):
+    optimizer = torch.optim.SGD([L, R], lr=lr, momentum=0.9)
+    scheduler = ExponentialLR(optimizer, gamma=0.9)
+    for j in tqdm(range(steps)):
         optimizer.zero_grad()
         output = L @ R @ X
         target = W @ X
         loss = torch.nn.functional.mse_loss(output, target)
         loss.backward()
         optimizer.step()
+        if j % 200 == 0:
+            scheduler.step()
     return L, R
 
 if __name__=="__main__":
-    FULL_RANK = 128
-    LOW_RANK = 16
+    FULL_RANK = 2048
+    FULL_RANK_H = 1024
+    FULL_RANK_W = 4096
+    LOW_RANK = 32
     TARGET_SIZE = 2
 
-    W = torch.rand((FULL_RANK, FULL_RANK))
-    input_matrix = torch.rand((FULL_RANK, TARGET_SIZE))
+    W = torch.rand((FULL_RANK_W, FULL_RANK_H))
+    input_matrix = torch.rand((FULL_RANK_H, TARGET_SIZE))
     output_matrix = W @ input_matrix
     
-    print("Original output:")
-    print(output_matrix)
     L_sensitive, R_sensitive = low_rank_decomposition(
         W,
         LOW_RANK,
-        learning_rate=1e-6,
-        max_iterations=1000,
+        learning_rate=1e-9,
+        max_iterations=2000,
         X=input_matrix
     )
+    reconstructed_matrix = L_sensitive @ R_sensitive @ input_matrix
+    print(f"reconstructed mse: gd: {F.mse_loss(output_matrix, reconstructed_matrix)}")
+
     L_noinput, R_noinput = low_rank_decomposition(
         W,
         LOW_RANK,
-        learning_rate=1e-6,
-        max_iterations=1000,
+        learning_rate=1e-9,
+        max_iterations=2000,
     )
-    L_autograd, R_autograd = torch_autograd(W, input_matrix, LOW_RANK, 1e-6, 1000)
-
-    reconstructed_matrix = L_sensitive @ R_sensitive @ input_matrix
-    reconstructed_matrix_pca = L_autograd @ R_autograd @ input_matrix
     reconstructed_matrix_noinput = L_noinput @ R_noinput @ input_matrix
-    print("difference:")
-    print(f"gd: {F.mse_loss(output_matrix, reconstructed_matrix)}")
-    print(f"autograd: {F.mse_loss(output_matrix, reconstructed_matrix_pca)}")
-    print(f"noinput gd: {F.mse_loss(output_matrix, reconstructed_matrix_noinput)}")
+    print(f"reconstructed mse: gd. noinput gd: {F.mse_loss(output_matrix, reconstructed_matrix_noinput)}")
+    
+    L_autograd, R_autograd = torch_autograd(W, input_matrix, LOW_RANK, 1e-9, 2000)
+    reconstructed_matrix_pca = L_autograd @ R_autograd @ input_matrix
+    print(f"reconstructed mse: autograd: {F.mse_loss(output_matrix, reconstructed_matrix_pca)}")
+    
+    
diff --git a/matq.py b/matq.py
index 9a40374..11d80ac 100644
--- a/matq.py
+++ b/matq.py
@@ -7,7 +7,7 @@
 from quant import quantize
 from decomposition import low_rank_decomposition
 
-DEBUG = False 
+DEBUG = False
 
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False
@@ -73,7 +73,7 @@ def decompose(self):
         W = W.float()
         logger.info("starting decomposition")
         tick = time.time()
-        self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-6, max_iterations=10000, input_matrix=self.inp)
+        self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-10, max_iterations=5000, X=self.inp)
         logger.info(f"decomposition done. elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}")
 
     def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, decompose_only=False):
diff --git a/scripts/lr_quant_350m.sh b/scripts/lr_quant_350m.sh
new file mode 100644
index 0000000..4710196
--- /dev/null
+++ b/scripts/lr_quant_350m.sh
@@ -0,0 +1,10 @@
+python cli.py \
+    --dataset wikitext2 \
+    --target-model lnair/opt-350m-wikitext2 \
+    --base-model facebook/opt-350m \
+    --delta \
+    --rank 32 \
+    --save outputs/ \
+    --nsamples 128 \
+    --decompose-only \
+    --wbits 8
diff --git a/to_hf.py b/to_hf.py
index 2deb4a5..f450de4 100644
--- a/to_hf.py
+++ b/to_hf.py
@@ -11,7 +11,7 @@
 from modelutils import find_layers
 from copy import deepcopy
 target_model = deepcopy(base_model)
-MODEL_ID = "lnair.opt-1.3b-wikitext2-r256-w8-lr"
+MODEL_ID = "lnair.opt-1.3b-wikitext2-r32-w8-decompose.True-lr"
 tensors = load_lr_tensors(f"outputs/{MODEL_ID}.safetensors")
 
 target_layers = target_model.model.decoder.layers

From 6440eb03aa6445605d80b6664fc488e88f7da15b Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Thu, 11 May 2023 00:42:48 +0000
Subject: [PATCH 10/23] add queuing jobs helper

---
 cli.py                |   2 +-
 opt_delta.py          |  18 +-
 opt_delta_fork.py     | 662 ------------------------------------------
 scripts/gptq_delta.sh |  68 +++++
 submit.py             |  29 ++
 5 files changed, 112 insertions(+), 667 deletions(-)
 delete mode 100644 opt_delta_fork.py
 create mode 100644 scripts/gptq_delta.sh
 create mode 100644 submit.py

diff --git a/cli.py b/cli.py
index eb1850d..dece358 100644
--- a/cli.py
+++ b/cli.py
@@ -81,6 +81,6 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s
     )
     if args.save:
         save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors")
-    
+
     ppl = opt_eval(target_model, loader_enc, args, target_model.device)
     logger.info(f"Perplexity: {ppl}")
\ No newline at end of file
diff --git a/opt_delta.py b/opt_delta.py
index f9f6bbf..bb55f7f 100644
--- a/opt_delta.py
+++ b/opt_delta.py
@@ -7,7 +7,7 @@
 from gptq import *
 from modelutils import *
 from quant import *
-
+from transformers import AutoTokenizer, AutoModel
 import copy
 #from prettytable import PrettyTable
 
@@ -425,7 +425,6 @@ def forward(self, *inp, **kwargs):
 def benchmark(model, input_ids, check=False):
     input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
     torch.cuda.synchronize()
-
     cache = {'past': None}
     def clear_past(i):
         def tmp(layer, inp, out):
@@ -545,11 +544,18 @@ def main(args):
     if args.rank > 0:
         print("Number of params without low rank ", num_params)
         print("Number of params with low rank", num_params - num_params_saved_lr)
-    if args.save:
+    if args.save_hf:
+        if args.delta:
+            hf_path = f"outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz"
+        else:
+            hf_path = f"outputs/{args.model.replace('/', '.')}_{args.wbits}bits"
+        model.save_pretrained(hf_path)
+        tokenizer = AutoTokenizer.from_pretrained(args.model)
+        tokenizer.save_pretrained(hf_path)
+    else:
         opt_pack3(model, quantizers)
         torch.save(model.state_dict(), args.save) 
 
-
 if __name__ == '__main__':
     import argparse
     from datautils import *
@@ -636,6 +642,10 @@ def main(args):
         '--sparsify_hard_threshold', action='store_true',
         help='Whether to add sparsity'
     )
+    parser.add_argument(
+        '--save-hf', action='store_true', default=False,
+        help='Whether to save a huggingface model'
+    )
     parser.add_argument(
         '--fraction_of_zero', type=float, default=0.99,
         help='Sparsity ratio'
diff --git a/opt_delta_fork.py b/opt_delta_fork.py
deleted file mode 100644
index 9a7710c..0000000
--- a/opt_delta_fork.py
+++ /dev/null
@@ -1,662 +0,0 @@
-import time
-
-import torch
-import torch.nn as nn
-
-from gptq import *
-from modelutils import *
-from quant import *
-import json
-import pickle
-import copy
-#from prettytable import PrettyTable
-
-def get_opt(model):
-    import torch
-    def skip(*args, **kwargs):
-        pass
-    torch.nn.init.kaiming_uniform_ = skip
-    torch.nn.init.uniform_ = skip
-    torch.nn.init.normal_ = skip
-    from transformers import OPTForCausalLM
-    # model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto')
-    model = OPTForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
-    model.seqlen = model.config.max_position_embeddings
-    return model
-
-def hard_threshold(x, fraction_of_zero=0.1):
-    y, _ = torch.sort(x.view(-1).abs().clone())
-    num_params = torch.numel(x)
-    thresh_index = int(num_params * fraction_of_zero)
-    threshold = y[thresh_index]
-    mask = x.abs().clone().gt(threshold).type(torch.FloatTensor)
-    return mask * x
-
-@torch.no_grad()
-def opt_sequential_delta(model, delta_model, dataloader, dev):
-    print('Starting ...')
-
-    use_cache = model.config.use_cache
-    model.config.use_cache = False
-    layers = model.model.decoder.layers
-    delta_layers = delta_model.model.decoder.layers
-
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) 
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
-    layers[0] = layers[0].to(dev)
-
-    dtype = next(iter(model.parameters())).dtype
-    inps = torch.zeros(
-        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
-    )
-    cache = {'i': 0, 'attention_mask': None}
-
-    class Catcher(nn.Module):
-        def __init__(self, module):
-            super().__init__()
-            self.module = module
-        def forward(self, inp, **kwargs):
-            inps[cache['i']] = inp
-            cache['i'] += 1
-            cache['attention_mask'] = kwargs['attention_mask']
-            raise ValueError
-    layers[0] = Catcher(layers[0])
-    for batch in dataloader:
-        try:
-            model(batch[0].to(dev))
-        except ValueError:
-            pass
-    layers[0] = layers[0].module
-
-    layers[0] = layers[0].cpu()
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
-    torch.cuda.empty_cache()
-
-    outs = torch.zeros_like(inps)
-    original_outs = torch.zeros_like(inps)
-    attention_mask = cache['attention_mask']
-
-    print('Ready.')
-
-    quantizers = {}
-    for i in range(len(delta_layers)):
-        layer = delta_layers[i].to(dev)
-        original_layer = layers[i].to(dev)
-
-        subset = find_layers(layer)
-        gptq = {}
-        for name in subset:
-            gptq[name] = GPTQ(subset[name])
-            gptq[name].quantizer = Quantizer()
-            gptq[name].quantizer.configure(
-                args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits
-            )
-
-        def add_batch(name):
-            def tmp(_, inp, out):
-                gptq[name].add_batch(inp[0].data, out.data)
-            return tmp
-        handles = []
-        for name in subset:
-            handles.append(subset[name].register_forward_hook(add_batch(name)))
-        
-        for j in range(args.nsamples):
-            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-            
-            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-        for h in handles:
-            h.remove()
-
-        for name in subset:
-            print(i, name)
-            print('Quantizing ...')
-            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
-            quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer
-            gptq[name].free()
-        for j in range(args.nsamples):
-            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-
-        layers[i] = layer.cpu()
-        del layer
-        del gptq 
-        torch.cuda.empty_cache()
-
-        inps, outs = original_outs, inps
-
-    model.config.use_cache = use_cache
-
-    return quantizers
-
-@torch.no_grad()
-def opt_sequential(model, dataloader, dev):
-    print('Starting ...')
-
-    use_cache = model.config.use_cache
-    model.config.use_cache = False
-    layers = model.model.decoder.layers
-
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) 
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
-    layers[0] = layers[0].to(dev)
-
-    dtype = next(iter(model.parameters())).dtype
-    inps = torch.zeros(
-        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
-    )
-    cache = {'i': 0, 'attention_mask': None}
-
-    class Catcher(nn.Module):
-        def __init__(self, module):
-            super().__init__()
-            self.module = module
-        def forward(self, inp, **kwargs):
-            inps[cache['i']] = inp
-            cache['i'] += 1
-            cache['attention_mask'] = kwargs['attention_mask']
-            raise ValueError
-    layers[0] = Catcher(layers[0])
-    for batch in dataloader:
-        try:
-            model(batch[0].to(dev))
-        except ValueError:
-            pass
-    layers[0] = layers[0].module
-
-    layers[0] = layers[0].cpu()
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
-    torch.cuda.empty_cache()
-
-    outs = torch.zeros_like(inps)
-    attention_mask = cache['attention_mask']
-
-    print('Ready.')
-
-    quantizers = {}
-    for i in range(len(layers)):
-        layer = layers[i].to(dev)
-
-        subset = find_layers(layer)
-        gptq = {}
-        for name in subset:
-            gptq[name] = GPTQ(subset[name])
-            gptq[name].quantizer = Quantizer()
-            gptq[name].quantizer.configure(
-                args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits
-            )
-
-        def add_batch(name):
-            def tmp(_, inp, out):
-                gptq[name].add_batch(inp[0].data, out.data)
-            return tmp
-        handles = []
-        for name in subset:
-            handles.append(subset[name].register_forward_hook(add_batch(name)))
-        for j in range(args.nsamples):
-            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-        for h in handles:
-            h.remove()
-
-        for name in subset:
-            print(i, name)
-            print('Quantizing ...')
-            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
-            quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer
-            gptq[name].free()
-        for j in range(args.nsamples):
-            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-
-        layers[i] = layer.cpu()
-        del layer
-        del gptq 
-        torch.cuda.empty_cache()
-
-        inps, outs = outs, inps
-
-    model.config.use_cache = use_cache
-    
-    return quantizers
-
-
-@torch.no_grad()
-def opt_eval(model, testenc, dev):
-    print('Evaluating ...')
-
-    testenc = testenc.input_ids
-    nsamples = testenc.numel() // model.seqlen
-
-    use_cache = model.config.use_cache
-    model.config.use_cache = False
-    layers = model.model.decoder.layers
-
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
-    layers[0] = layers[0].to(dev)
-
-    dtype = next(iter(model.parameters())).dtype
-    inps = torch.zeros(
-        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
-    )
-    cache = {'i': 0, 'attention_mask': None}
-
-    class Catcher(nn.Module):
-        def __init__(self, module):
-            super().__init__()
-            self.module = module
-        def forward(self, inp, **kwargs):
-            inps[cache['i']] = inp
-            cache['i'] += 1
-            cache['attention_mask'] = kwargs['attention_mask']
-            raise ValueError
-    layers[0] = Catcher(layers[0])
-    for i in range(nsamples):
-        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
-        try:
-            model(batch)
-        except ValueError:
-            pass
-    layers[0] = layers[0].module
-
-    layers[0] = layers[0].cpu()
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
-    torch.cuda.empty_cache()
-
-    outs = torch.zeros_like(inps)
-    attention_mask = cache['attention_mask']
-
-    for i in range(len(layers)):
-        # print(i)
-        layer = layers[i].to(dev)
-
-        if args.nearest:
-            subset = find_layers(layer)
-            for name in subset:
-                quantizer = Quantizer()
-                quantizer.configure(
-                    args.wbits, perchannel=True, sym=args.sym, mse=False
-                )
-                W = subset[name].weight.data
-                quantizer.find_params(W, weight=True)
-                subset[name].weight.data = quantize(
-                    W, quantizer.scale, quantizer.zero, quantizer.maxq
-                ).to(next(iter(layer.parameters())).dtype)
-
-        for j in range(nsamples):
-            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-        layers[i] = layer.cpu()
-        del layer
-        torch.cuda.empty_cache()
-        inps, outs = outs, inps
-
-    if model.model.decoder.final_layer_norm is not None:
-        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
-    if model.model.decoder.project_out is not None:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
-    model.lm_head = model.lm_head.to(dev)
-
-    testenc = testenc.to(dev)
-    nlls = []
-    for i in range(nsamples):
-        hidden_states = inps[i].unsqueeze(0)
-        if model.model.decoder.final_layer_norm is not None:
-            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
-        if model.model.decoder.project_out is not None:
-            hidden_states = model.model.decoder.project_out(hidden_states)
-        lm_logits = model.lm_head(hidden_states)
-        shift_logits = lm_logits[:, :-1, :].contiguous()
-        shift_labels = testenc[
-            :, (i * model.seqlen):((i + 1) * model.seqlen)
-        ][:, 1:]
-        loss_fct = nn.CrossEntropyLoss()
-        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-        neg_log_likelihood = loss.float() * model.seqlen
-        nlls.append(neg_log_likelihood)
-    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
-    print(ppl.item())
-
-    model.config.use_cache = use_cache
-    return ppl.item()
-
-# TODO: perform packing on GPU
-def opt_pack3(model, quantizers):
-    layers = find_layers(model)
-    layers = {n: layers[n] for n in quantizers}
-    make_quant3(model, quantizers, faster=args.faster_kernel)
-    qlayers = find_layers(model, [Quant3Linear])
-    print('Packing ...')
-    for name in qlayers:
-        print(name)
-        quantizers[name] = quantizers[name].cpu()
-        qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero)
-    print('Done.')
-    return model
-
-def load_quant3(model, checkpoint):
-    from transformers import OPTConfig, OPTForCausalLM 
-    config = OPTConfig.from_pretrained(model)
-    def noop(*args, **kwargs):
-        pass
-    torch.nn.init.kaiming_uniform_ = noop 
-    torch.nn.init.uniform_ = noop 
-    torch.nn.init.normal_ = noop 
-
-    torch.set_default_dtype(torch.half)
-    transformers.modeling_utils._init_weights = False
-    torch.set_default_dtype(torch.half)
-    model = OPTForCausalLM(config)
-    torch.set_default_dtype(torch.float)
-    model = model.eval()
-    layers = find_layers(model)
-    for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']:
-        if name in layers:
-            del layers[name]
-    make_quant3(model, layers, faster=args.faster_kernel)
-
-    print('Loading model ...')
-    model.load_state_dict(torch.load(checkpoint))
-    model.seqlen = model.config.max_position_embeddings
-    print('Done.')
-
-    return model
-
-def opt_multigpu(model, gpus):
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0])
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0])
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0])
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1])
-    if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm:
-        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1])
-    import copy
-    model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
-
-    cache = {'mask': None}
-
-    class MoveModule(nn.Module):
-        def __init__(self, module):
-            super().__init__()
-            self.module = module
-            self.dev = next(iter(self.module.parameters())).device
-        def forward(self, *inp, **kwargs):
-            inp = list(inp)
-            if inp[0].device != self.dev:
-                inp[0] = inp[0].to(self.dev)
-            if cache['mask'] is None or cache['mask'].device != self.dev:
-                cache['mask'] = kwargs['attention_mask'].to(self.dev)
-            kwargs['attention_mask'] = cache['mask']
-            tmp = self.module(*inp, **kwargs)
-            return tmp
-
-    layers = model.model.decoder.layers
-    pergpu = math.ceil(len(layers) / len(gpus))
-    for i in range(len(layers)):
-        layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
-
-    model.gpus = gpus
-
-def benchmark(model, input_ids, check=False):
-    input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
-    torch.cuda.synchronize()
-
-    cache = {'past': None}
-    def clear_past(i):
-        def tmp(layer, inp, out):
-            if cache['past']:
-                cache['past'][i] = None
-        return tmp
-    for i, layer in enumerate(model.model.decoder.layers):
-        layer.register_forward_hook(clear_past(i))
-
-    print('Benchmarking ...')
-
-    if check:
-        loss = nn.CrossEntropyLoss()
-        tot = 0.
-
-    def sync():
-        if hasattr(model, 'gpus'):
-            for gpu in model.gpus:
-                torch.cuda.synchronize(gpu)
-        else:
-            torch.cuda.synchronize()
-    with torch.no_grad():
-        attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
-        times = []
-        for i in range(input_ids.numel()):
-            tick = time.time()
-            out = model(
-                input_ids[:, i].reshape(-1),
-                past_key_values=cache['past'],
-                attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1))
-            )
-            sync()
-            times.append(time.time() - tick)
-            print(i, times[-1])
-            if check and i != input_ids.numel() - 1:
-                tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
-            cache['past'] = list(out.past_key_values)
-            del out
-        sync()
-        import numpy as np
-        print('Median:', np.median(times))
-        if check:
-            print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
-
-
-def main(args):
-    print(args)
-    num_params_saved_lr = 0
-    num_params = 0
-    if args.load:
-        model = load_quant3(args.model, args.load)
-    else:
-        if args.delta and args.wbits<16:
-            model = get_opt(args.model)
-            model.eval()
-            base_model = get_opt(args.base_model)
-            base_model.eval()
-            dataloader, testloader = get_loaders(
-        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
-    )
-            original_finetuned_model = copy.deepcopy(model)
-            for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
-                finetuned_p.data = (finetuned_p.data-base_p.data).clone()
-        else:
-            model = get_opt(args.model)
-            model.eval()
-
-    dataloader, testloader = get_loaders(
-        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
-    )
-
-    if args.wbits < 16 and not args.nearest:
-        if args.delta:
-            tick = time.time()
-            quantizers = opt_sequential_delta(original_finetuned_model, model, dataloader, DEV)
-
-            comp_time = time.time()-tick
-        else:
-            quantizers = opt_sequential(model, dataloader, DEV)
-    
-    if args.delta and args.wbits<16:
-        for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
-            # don't hard threshold for now
-            # if args.sparsify_hard_threshold:
-            #     print('Hard Thresholding...')
-            #     W = finetuned_p.data
-            #     finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero)
-            if args.rank>0 and len(finetuned_p.shape) == 2:
-                print('Finding Low Rank Approximation...')
-                A = finetuned_p.data.float()
-                U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5)
-                # let's say L = U
-                # and R = diag(S)*V.T
-                L = U
-                R = torch.diag_embed(S) @ Vh.T
-                # now quantize R
-                
-                A  = L @ R
-            
-                finetuned_p.data =  A.half()
-                num_params_saved_lr += torch.numel(A) -  (torch.numel(U) + torch.numel(S) + torch.numel(Vh))
-            num_params += torch.numel(finetuned_p.data)
-            finetuned_p.data = (base_p.data + finetuned_p.data).clone()
-
-    if args.benchmark:
-        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
-        if len(gpus) > 1:
-            opt_multigpu(model, gpus)
-        else:
-            model = model.to(DEV)
-        if args.benchmark:
-            input_ids = next(iter(dataloader))[0][:, :args.benchmark]
-            benchmark(model, input_ids, check=args.check)
-    if args.load:
-        exit()
-
-    dataset = args.dataset 
-    dataloader, testloader = get_loaders(
-        dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
-    )
-    
-    ppl = opt_eval(model, testloader, DEV)
-    print(ppl)
-
-    if args.rank > 0:
-        print("Number of params without low rank ", num_params)
-        print("Number of params with low rank", num_params - num_params_saved_lr)
-    if args.save:
-        opt_pack3(model, quantizers)
-        torch.save(model.state_dict(), args.save) 
-
-
-if __name__ == '__main__':
-    import argparse
-    from datautils import *
-
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        '--model', type=str, default='lnair/opt-1.3b-wikitext2',
-        help='OPT model to load; pass `facebook/opt-X`.'
-    )
-    parser.add_argument(
-        '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], default='wikitext2',
-        help='Where to extract calibration data from.'
-    )
-    parser.add_argument(
-        '--base-model', type=str, default='facebook/opt-1.3b',
-        help='base OPT model to load'
-    )
-    parser.add_argument(
-        '--seed',
-        type=int, default=0, help='Seed for sampling the calibration data.'
-    )
-    parser.add_argument(
-        '--nsamples', type=int, default=128,
-        help='Number of calibration data samples.'
-    )
-    parser.add_argument(
-        '--percdamp', type=float, default=.01,
-        help='Percent of the average Hessian diagonal to use for dampening.'
-    )
-    parser.add_argument(
-        '--nearest', action='store_true',
-        help='Whether to run the RTN baseline.'
-    ) 
-    parser.add_argument(
-        '--wbits', type=int, default=2, choices=[2, 3, 4, 16],
-        help='#bits to use for quantization; use 16 for evaluating base model.'
-    )
-    parser.add_argument(
-        '--trits', action='store_true',
-        help='Whether to use trits for quantization.'
-    )
-    parser.add_argument(
-        '--groupsize', type=int, default=-1,
-        help='Groupsize to use for quantization; default uses full row.'
-    )
-    parser.add_argument(
-        '--sym', action='store_true',
-        help='Whether to perform symmetric quantization.'
-    )
-    parser.add_argument(
-        '--save', type=str, default='',
-        help='Save quantized checkpoint under this name.'
-    )
-    parser.add_argument(
-        '--load', type=str, default='',
-        help='Load quantized model.'
-    )
-    parser.add_argument(
-        '--benchmark', type=int, default=0,
-        help='Number of tokens to use for benchmarking.'
-    )
-    parser.add_argument(
-        '--check', action='store_true',
-        help='Whether to compute perplexity during benchmarking for verification.'
-    )
-    parser.add_argument(
-        '--new-eval', action='store_true',
-        help='Whether to use the new PTB and C4 eval.'
-    )
-    parser.add_argument(
-        '--faster-kernel', action='store_true',
-        help='Whether to use the new faster kernel for benchmarking.'
-    )
-    parser.add_argument(
-        '--act-order', action='store_true',
-        help='Whether to apply the activation order GPTQ heuristic'
-    )
-    parser.add_argument(
-        '--delta', action='store_true',
-        help='Whether to use delta compression'
-    )
-    parser.add_argument(
-        '--sparsify_hard_threshold', action='store_true',
-        help='Whether to add sparsity'
-    )
-    parser.add_argument(
-        '--fraction_of_zero', type=float, default=0.99,
-        help='Sparsity ratio'
-    )
-
-    parser.add_argument(
-        '--rank', type=int, default=0,
-        help='The rank to use for decomposing each matrices'
-    )
-    args = parser.parse_args()
-
-    #results = PrettyTable()
-
-    main(args)
-    
-    print('finished.')
diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh
new file mode 100644
index 0000000..e73d717
--- /dev/null
+++ b/scripts/gptq_delta.sh
@@ -0,0 +1,68 @@
+ts -S 8
+CUDA_VISIBLE_DEVICES=0 python opt_delta.py \
+    --dataset wikitext2 \
+    --wbits 2 \
+    --delta \
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.95 \
+    --save-hf \
+    --groupsize 1024 &
+
+CUDA_VISIBLE_DEVICES=1 python opt_delta.py \
+    --dataset wikitext2 \
+    --wbits 3 \
+    --delta \
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.95 \
+    --save-hf \
+    --groupsize 1024 &
+
+CUDA_VISIBLE_DEVICES=2 python opt_delta.py \
+    --dataset wikitext2 \
+    --wbits 4 \
+    --delta \
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.95 \
+    --save-hf \
+    --groupsize 1024 &
+
+CUDA_VISIBLE_DEVICES=3 python opt_delta.py \
+    --dataset wikitext2 \
+    --wbits 2 \
+    --delta \
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.99 \
+    --save-hf \
+    --groupsize 1024 &
+
+CUDA_VISIBLE_DEVICES=4 python opt_delta.py \
+    --dataset wikitext2 \
+    --wbits 3 \
+    --delta \
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.99 \
+    --save-hf \
+    --groupsize 1024 &
+
+CUDA_VISIBLE_DEVICES=5 python opt_delta.py \
+    --dataset wikitext2 \
+    --wbits 4 \
+    --delta \
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.99 \
+    --save-hf \
+    --groupsize 1024 &
+
+CUDA_VISIBLE_DEVICES=6 python opt_delta.py \
+    --dataset wikitext2 \
+    --wbits 3 \
+    --delta \
+    --save-hf \
+    --groupsize 1024 &
+
+CUDA_VISIBLE_DEVICES=7 python opt_delta.py \
+    --dataset wikitext2 \
+    --wbits 4 \
+    --delta \
+    --save-hf \
+    --groupsize 1024 &
\ No newline at end of file
diff --git a/submit.py b/submit.py
new file mode 100644
index 0000000..075e8cb
--- /dev/null
+++ b/submit.py
@@ -0,0 +1,29 @@
+import os
+model_relations = {
+    # 'facebook/opt-350m': ['lnair/opt-350m-wikitext2'],
+    # 'facebook/opt-1.3b': ['lnair/opt-1.3b-wikitext2'],
+    # 'facebook/opt-2.7b': ['lnair/opt-2.7b-wikitext2'],
+    'facebook/opt-6.7b': ['KoboldAI/OPT-6.7B-Erebus'],
+    'facebook/opt-13b': ['KoboldAI/OPT-13B-Erebus'],
+    'facebook/opt-30b': ['KoboldAI/OPT-30B-Erebus']
+}
+
+wbits_settings = [2,3,4]
+
+sparsity_settings = [0, 0.95, 0.99]
+os.system("ts -S 8")
+for model in model_relations.keys():
+    for target_model in model_relations[model]:
+        for wbits in wbits_settings:
+            for sparsity in sparsity_settings:
+                if sparsity == 0:
+                    cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --delta --wbits {wbits} --model {target_model} --base-model {model} --save-hf --groupsize 1024"
+                else:
+                    cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --delta --wbits {wbits} --model {target_model} --base-model {model} --sparsify_hard_threshold --fraction_of_zero {sparsity} --save-hf --groupsize 1024"
+                os.system(cmd)
+
+for model in model_relations.keys():
+    for target_model in model_relations[model]:
+        for wbits in wbits_settings:
+            cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --model {target_model} --base-model {model} --save-hf --groupsize 1024"
+            os.system(cmd)
\ No newline at end of file

From ce294310c97cc6c6e464ccee673d12718f66629c Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Thu, 11 May 2023 12:54:30 +0000
Subject: [PATCH 11/23] minor

---
 .gitignore               |  4 ++-
 modelutils.py            |  2 +-
 opt_delta.py             | 19 ++++++++------
 scripts/playground.ipynb | 53 ++++++++++++++--------------------------
 submit.py                |  7 +++---
 5 files changed, 37 insertions(+), 48 deletions(-)

diff --git a/.gitignore b/.gitignore
index dbd6338..e7f3a29 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,6 @@ dist/
 .idea
 *.egg-info/
 *.safetensors
-outputs/
\ No newline at end of file
+outputs/
+outputs_past/
+packed_delta
\ No newline at end of file
diff --git a/modelutils.py b/modelutils.py
index c93410d..f9436c7 100644
--- a/modelutils.py
+++ b/modelutils.py
@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-from transformers import OPTForCausalLM
+from transformers import OPTForCausalLM, AutoModel, AutoTokenizer
 DEV = torch.device('cuda:0')
 
 def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
diff --git a/opt_delta.py b/opt_delta.py
index bb55f7f..401c88b 100644
--- a/opt_delta.py
+++ b/opt_delta.py
@@ -472,6 +472,7 @@ def sync():
 
 def main(args):
     print(args)
+    packed_delta = None
     num_params_saved_lr = 0
     num_params = 0
     if args.load:
@@ -506,19 +507,21 @@ def main(args):
             quantizers = opt_sequential(model, dataloader, DEV)
     
     if args.delta and args.wbits<16:
-        for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+        for idx, (base_p, finetuned_p) in enumerate(zip(base_model.parameters(), model.parameters())):
             if args.sparsify_hard_threshold:
                 print('Hard Thresholding...')
                 W = finetuned_p.data
                 finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero)
-            if args.rank>0 and len(finetuned_p.shape) == 2:
-                print('Finding Low Rank Approximation...')
-                A = finetuned_p.data.float()
-                U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5)
-                A  = U @ torch.diag_embed(S) @ Vh.T
-                finetuned_p.data =  A.half()
-                num_params_saved_lr += torch.numel(A) -  (torch.numel(U) + torch.numel(S) + torch.numel(Vh))
+            # if args.rank>0 and len(finetuned_p.shape) == 2:
+            #     print('Finding Low Rank Approximation...')
+            #     A = finetuned_p.data.float()
+            #     U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5)
+            #     A  = U @ torch.diag_embed(S) @ Vh.T
+            #     finetuned_p.data =  A.half()
+            #     num_params_saved_lr += torch.numel(A) -  (torch.numel(U) + torch.numel(S) + torch.numel(Vh))
             num_params += torch.numel(finetuned_p.data)
+            # here we save a copy to pack, and save the delta only on disk
+            packed_delta = copy.deepcopy(finetuned_p.data)
             finetuned_p.data = (base_p.data + finetuned_p.data).clone()
 
     if args.benchmark:
diff --git a/scripts/playground.ipynb b/scripts/playground.ipynb
index 88e2175..114a004 100644
--- a/scripts/playground.ipynb
+++ b/scripts/playground.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -17,7 +17,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -25,42 +25,25 @@
      "output_type": "stream",
      "text": [
       "/home/xiayao/miniconda3/envs/fmzip/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "Downloading (…)lve/main/config.json: 100%|██████████| 930/930 [00:00<00:00, 143kB/s]\n",
+      "You are using a model of type gptj to instantiate a model of type opt. This is not supported for all configurations of models and can yield errors.\n",
+      "Downloading pytorch_model.bin: 100%|██████████| 24.2G/24.2G [02:56<00:00, 137MB/s] \n"
      ]
     },
     {
-     "data": {
-      "text/plain": [
-       "OPTForCausalLM(\n",
-       "  (model): OPTModel(\n",
-       "    (decoder): OPTDecoder(\n",
-       "      (embed_tokens): Embedding(50272, 2048, padding_idx=1)\n",
-       "      (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)\n",
-       "      (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-       "      (layers): ModuleList(\n",
-       "        (0-23): 24 x OPTDecoderLayer(\n",
-       "          (self_attn): OPTAttention(\n",
-       "            (k_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "            (v_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "            (q_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "            (out_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "          )\n",
-       "          (activation_fn): ReLU()\n",
-       "          (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-       "          (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n",
-       "          (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n",
-       "          (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "  )\n",
-       "  (lm_head): Linear(in_features=2048, out_features=50272, bias=False)\n",
-       ")"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
+     "ename": "ValueError",
+     "evalue": "The state dictionary of the model you are trying to load is corrupted. Are you sure it was properly saved?",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mmodelutils\u001b[39;00m \u001b[39mimport\u001b[39;00m get_opt\n\u001b[0;32m----> 2\u001b[0m base_model \u001b[39m=\u001b[39m get_opt(base_model_name)\n\u001b[1;32m      3\u001b[0m target_model \u001b[39m=\u001b[39m get_opt(target_model_name)\n\u001b[1;32m      4\u001b[0m base_model\u001b[39m.\u001b[39mto(\u001b[39m'\u001b[39m\u001b[39mcuda\u001b[39m\u001b[39m'\u001b[39m)\n",
+      "File \u001b[0;32m~/project/fmzip/scripts/../modelutils.py:24\u001b[0m, in \u001b[0;36mget_opt\u001b[0;34m(model)\u001b[0m\n\u001b[1;32m     21\u001b[0m torch\u001b[39m.\u001b[39mnn\u001b[39m.\u001b[39minit\u001b[39m.\u001b[39mnormal_ \u001b[39m=\u001b[39m skip\n\u001b[1;32m     23\u001b[0m \u001b[39m# model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto')\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m model \u001b[39m=\u001b[39m OPTForCausalLM\u001b[39m.\u001b[39;49mfrom_pretrained(model, torch_dtype\u001b[39m=\u001b[39;49mtorch\u001b[39m.\u001b[39;49mfloat16)\n\u001b[1;32m     25\u001b[0m model\u001b[39m.\u001b[39mseqlen \u001b[39m=\u001b[39m model\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39mmax_position_embeddings\n\u001b[1;32m     26\u001b[0m \u001b[39mreturn\u001b[39;00m model\n",
+      "File \u001b[0;32m~/miniconda3/envs/fmzip/lib/python3.9/site-packages/transformers/modeling_utils.py:2795\u001b[0m, in \u001b[0;36mPreTrainedModel.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m   2785\u001b[0m     \u001b[39mif\u001b[39;00m dtype_orig \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m   2786\u001b[0m         torch\u001b[39m.\u001b[39mset_default_dtype(dtype_orig)\n\u001b[1;32m   2788\u001b[0m     (\n\u001b[1;32m   2789\u001b[0m         model,\n\u001b[1;32m   2790\u001b[0m         missing_keys,\n\u001b[1;32m   2791\u001b[0m         unexpected_keys,\n\u001b[1;32m   2792\u001b[0m         mismatched_keys,\n\u001b[1;32m   2793\u001b[0m         offload_index,\n\u001b[1;32m   2794\u001b[0m         error_msgs,\n\u001b[0;32m-> 2795\u001b[0m     ) \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39;49m\u001b[39m.\u001b[39;49m_load_pretrained_model(\n\u001b[1;32m   2796\u001b[0m         model,\n\u001b[1;32m   2797\u001b[0m         state_dict,\n\u001b[1;32m   2798\u001b[0m         loaded_state_dict_keys,  \u001b[39m# XXX: rename?\u001b[39;49;00m\n\u001b[1;32m   2799\u001b[0m         resolved_archive_file,\n\u001b[1;32m   2800\u001b[0m         pretrained_model_name_or_path,\n\u001b[1;32m   2801\u001b[0m         ignore_mismatched_sizes\u001b[39m=\u001b[39;49mignore_mismatched_sizes,\n\u001b[1;32m   2802\u001b[0m         sharded_metadata\u001b[39m=\u001b[39;49msharded_metadata,\n\u001b[1;32m   2803\u001b[0m         _fast_init\u001b[39m=\u001b[39;49m_fast_init,\n\u001b[1;32m   2804\u001b[0m         low_cpu_mem_usage\u001b[39m=\u001b[39;49mlow_cpu_mem_usage,\n\u001b[1;32m   2805\u001b[0m         device_map\u001b[39m=\u001b[39;49mdevice_map,\n\u001b[1;32m   2806\u001b[0m         offload_folder\u001b[39m=\u001b[39;49moffload_folder,\n\u001b[1;32m   2807\u001b[0m         offload_state_dict\u001b[39m=\u001b[39;49moffload_state_dict,\n\u001b[1;32m   2808\u001b[0m   
      dtype\u001b[39m=\u001b[39;49mtorch_dtype,\n\u001b[1;32m   2809\u001b[0m         load_in_8bit\u001b[39m=\u001b[39;49mload_in_8bit,\n\u001b[1;32m   2810\u001b[0m         keep_in_fp32_modules\u001b[39m=\u001b[39;49mkeep_in_fp32_modules,\n\u001b[1;32m   2811\u001b[0m     )\n\u001b[1;32m   2813\u001b[0m model\u001b[39m.\u001b[39mis_loaded_in_8bit \u001b[39m=\u001b[39m load_in_8bit\n\u001b[1;32m   2815\u001b[0m \u001b[39m# make sure token embedding weights are still tied if needed\u001b[39;00m\n",
+      "File \u001b[0;32m~/miniconda3/envs/fmzip/lib/python3.9/site-packages/transformers/modeling_utils.py:3008\u001b[0m, in \u001b[0;36mPreTrainedModel._load_pretrained_model\u001b[0;34m(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, load_in_8bit, keep_in_fp32_modules)\u001b[0m\n\u001b[1;32m   3006\u001b[0m base_model_expected_keys \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(model_to_load\u001b[39m.\u001b[39mstate_dict()\u001b[39m.\u001b[39mkeys())\n\u001b[1;32m   3007\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39many\u001b[39m(key \u001b[39min\u001b[39;00m expected_keys_not_prefixed \u001b[39mand\u001b[39;00m key \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m base_model_expected_keys \u001b[39mfor\u001b[39;00m key \u001b[39min\u001b[39;00m loaded_keys):\n\u001b[0;32m-> 3008\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m   3009\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mThe state dictionary of the model you are trying to load is corrupted. Are you sure it was \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m   3010\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mproperly saved?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m   3011\u001b[0m     )\n\u001b[1;32m   3012\u001b[0m \u001b[39mif\u001b[39;00m device_map \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m   3013\u001b[0m     device_map \u001b[39m=\u001b[39m {k\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mcls\u001b[39m\u001b[39m.\u001b[39mbase_model_prefix\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m): v \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m device_map\u001b[39m.\u001b[39mitems()}\n",
+      "\u001b[0;31mValueError\u001b[0m: The state dictionary of the model you are trying to load is corrupted. Are you sure it was properly saved?"
+     ]
     }
    ],
    "source": [
diff --git a/submit.py b/submit.py
index 075e8cb..ce84f1f 100644
--- a/submit.py
+++ b/submit.py
@@ -3,9 +3,10 @@
     # 'facebook/opt-350m': ['lnair/opt-350m-wikitext2'],
     # 'facebook/opt-1.3b': ['lnair/opt-1.3b-wikitext2'],
     # 'facebook/opt-2.7b': ['lnair/opt-2.7b-wikitext2'],
-    'facebook/opt-6.7b': ['KoboldAI/OPT-6.7B-Erebus'],
-    'facebook/opt-13b': ['KoboldAI/OPT-13B-Erebus'],
-    'facebook/opt-30b': ['KoboldAI/OPT-30B-Erebus']
+    'facebook/opt-6.7b': ['mit-han-lab/opt-6.7b-smoothquant'],
+    # 'facebook/opt-13b': ['KoboldAI/OPT-13B-Erebus'],
+    # 'facebook/opt-30b': ['KoboldAI/OPT-30B-Erebus']
+    # 'facebook/opt-1.3b': ['facebook/opt-iml-1.3b', 'facebook/opt-iml-max-1.3b', 'mit-han-lab/opt-1.3b-smoothquant', 'pszemraj/opt-peter-1.3B', 'opentensor/bt-opt-1.3b']
 }
 
 wbits_settings = [2,3,4]

From d75ff123a8b402961fb9d31cbebc6390704d4342 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Thu, 11 May 2023 17:33:47 +0000
Subject: [PATCH 12/23] pack utils

---
 pack_utils.py      | 129 +++++++++++++++++++++++++++++++++++++++++++++
 pack_utils_test.py |  35 ++++++++++++
 quant.py           |   2 +-
 requirements.txt   |   3 +-
 4 files changed, 167 insertions(+), 2 deletions(-)
 create mode 100644 pack_utils.py
 create mode 100644 pack_utils_test.py

diff --git a/pack_utils.py b/pack_utils.py
new file mode 100644
index 0000000..1305c05
--- /dev/null
+++ b/pack_utils.py
@@ -0,0 +1,129 @@
+import math
+import torch
+import numpy as np
+from typing import Any
+from quant import Quantizer
+from safetensors import safe_open
+from safetensors.torch import save_file
+
+def pack_to_bits(
+        weight: torch.Tensor,
+        quantizer:Quantizer,
+        bits: int,
+        groupsize = 1024
+    ):
+    if bits not in [2,3,4,8]:
+        raise ValueError("bits must be one of [2,3,4,8]")
+    scales = quantizer.scale.t().contiguous()
+    zeros = quantizer.zero.t().contiguous()
+    scale_zeros = zeros * scales
+    intweight = []
+    for idx in range(weight.shape[0]):
+        g_idx = idx // groupsize
+        intweight.append(torch.round((weight[:,idx] + scale_zeros[g_idx]) / scales[g_idx]).to(torch.int)[:,None])
+    intweight = torch.cat(intweight, dim=1)
+    intweight = intweight.t().contiguous()
+    intweight = intweight.numpy().astype(np.uint32)
+    qweight = np.zeros(
+            (intweight.shape[0] // 256 * (bits * 8), intweight.shape[1]), dtype=np.uint32
+    )
+    i = 0
+    row = 0
+    while row < qweight.shape[0]:
+        if bits in [2,4,8]:
+            for j in range(i, i + (32//bits)):
+                qweight[row] |= intweight[j] << (bits * (j - i))
+            i += 32//bits
+            row += 1
+        elif bits == 3:
+            for j in range(i, i + 10):
+                qweight[row] |= intweight[j] << (3 * (j - i))
+            i += 10
+            qweight[row] |= intweight[i] << 30
+            row += 1
+            qweight[row] |= (intweight[i] >> 2) & 1
+            i += 1
+            for j in range(i, i + 10):
+                qweight[row] |= intweight[j] << (3 * (j - i) + 1)
+            i += 10
+            qweight[row] |= intweight[i] << 31
+            row += 1
+            qweight[row] |= (intweight[i] >> 1) & 0x3
+            i += 1
+            for j in range(i, i + 10):
+                qweight[row] |= intweight[j] << (3 * (j - i) + 2)
+            i += 10
+            row += 1
+    
+    qweight = qweight.astype(np.int32)
+    qweight = torch.from_numpy(qweight)
+    zeros -= 1;
+    zeros = zeros.numpy().astype(np.uint32)
+    qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 256 * (bits * 8)), dtype=np.uint32)
+    i = 0
+    col = 0
+    while col < qzeros.shape[1]:
+        if bits in [2,4,8]:
+            for j in range(i, i + (32//bits)):
+                qzeros[:, col] |= zeros[:, j] << (bits * (j - i))
+            i += 32//bits
+            col += 1
+        elif bits == 3:
+            for j in range(i, i + 10):
+                qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
+            i += 10
+            qzeros[:, col] |= zeros[:, i] << 30
+            col += 1
+            qzeros[:, col] |= (zeros[:, i] >> 2) & 1
+            i += 1
+            for j in range(i, i + 10):
+                qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
+            i += 10
+            qzeros[:, col] |= zeros[:, i] << 31
+            col += 1
+            qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
+            i += 1
+            for j in range(i, i + 10):
+                qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
+            i += 10
+            col += 1
+    qzeros = qzeros.astype(np.int32)
+    qzeros = torch.from_numpy(qzeros) 
+    return qweight, qzeros
+
+class SparseTensor():
+    def __init__(self, m: torch.Tensor, format: str, packing_bits: None) -> None:
+        self.m = m
+        self.size = m.size()
+        self.packing_bits = packing_bits
+        self.format = format
+        self._convert()
+
+    def _convert(self):
+        # flatten the matrix
+        self.m = self.m.flatten()
+        # get the indices of the non-zero elements
+        indices = torch.nonzero(self.m)
+        # get the non-zero elements
+        values = self.m[indices]
+        self.payload = {
+            'indices': indices,
+            'values': values
+        }
+
+    def restore(self):
+        # restore the matrix from the self.payload
+        self.m = torch.zeros(math.prod(self.size), dtype=self.payload['values'].dtype)
+        self.m[self.payload['indices']] = self.payload['values']
+        self.m = self.m.reshape(self.size)
+
+    def to_disk(self, path):
+        save_file(self.payload, path)
+
+    def from_disk(self, path):
+        tensors = {}
+        with safe_open(path, framework='pt', device='cpu') as f:
+            for key in f.keys():
+                tensors[key] = f.get_tensor(key)
+        self.payload = tensors
+        self.restore()
\ No newline at end of file
diff --git a/pack_utils_test.py b/pack_utils_test.py
new file mode 100644
index 0000000..483c91e
--- /dev/null
+++ b/pack_utils_test.py
@@ -0,0 +1,35 @@
+import torch
+from quant import quantize, Quantizer
+from safetensors import safe_open
+from pack_utils import SparseTensor, pack_to_bits, unpack_from_bits
+from safetensors.torch import save_file
+
+QUANTIZED_BITS = 3
+
+
+if __name__=="__main__":
+    torch.set_printoptions(precision=12)
+    b = torch.rand((1024, 1024), dtype=torch.float16)
+
+    quantizer = Quantizer()
+    quantizer.configure(
+        QUANTIZED_BITS, perchannel=True, sym=False, mse=False
+    )
+    quantizer.find_params(b, weight=True)
+    b_q = quantizer.quantize(b)
+
+    # count how many zeroes 
+    print(b_q)
+    # sparsification
+    
+    # now pack it
+    q_weight, qzero = pack_to_bits(b_q, quantizer, QUANTIZED_BITS)
+    unpacked_weight = unpack_from_bits(
+        qweight=q_weight,
+        qzeros=qzero,
+        quantizer=quantizer,
+        bits=QUANTIZED_BITS,
+        groupsize=1024,
+    )
+    print(unpacked_weight)
+    # count how many zeroes
diff --git a/quant.py b/quant.py
index f8cc1b7..f23099a 100644
--- a/quant.py
+++ b/quant.py
@@ -287,7 +287,7 @@ def pack(self, linear, scales, zeros):
                 raise NotImplementedError("Only 2,3,4,8 bits are supported.")
                 
         qweight = qweight.astype(np.int32)
-        self.qweight = torch.from_numpy(qweight) 
+        self.qweight = torch.from_numpy(qweight)
         
         zeros -= 1;
         zeros = zeros.numpy().astype(np.uint32)
diff --git a/requirements.txt b/requirements.txt
index 7417000..79d456c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 transformers
 loguru
-datasets
\ No newline at end of file
+datasets
+safetensors
\ No newline at end of file

From 23e35487f92294f2aa0a5e88e746debc25bee737 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Thu, 11 May 2023 20:11:05 +0000
Subject: [PATCH 13/23] minor

---
 .gitignore            |   3 +-
 .vscode/settings.json |   6 ++
 pack_utils.py         |  80 ++++++++++++++----------
 pack_utils_test.py    |  31 +++++----
 playground.py         | 142 ++++++++++++++++++++++++++++++++++++++++++
 quant_cuda.cpp        |   2 +-
 6 files changed, 213 insertions(+), 51 deletions(-)
 create mode 100644 .vscode/settings.json
 create mode 100644 playground.py

diff --git a/.gitignore b/.gitignore
index e7f3a29..6eb567c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ dist/
 *.safetensors
 outputs/
 outputs_past/
-packed_delta
\ No newline at end of file
+packed_delta
+.cache
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..d99f2f3
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter"
+    },
+    "python.formatting.provider": "none"
+}
\ No newline at end of file
diff --git a/pack_utils.py b/pack_utils.py
index 1305c05..307449f 100644
--- a/pack_utils.py
+++ b/pack_utils.py
@@ -21,6 +21,7 @@ def pack_to_bits(
     for idx in range(weight.shape[0]):
         g_idx = idx // groupsize
         intweight.append(torch.round((weight[:,idx] + scale_zeros[g_idx]) / scales[g_idx]).to(torch.int)[:,None])
+
     intweight = torch.cat(intweight, dim=1)
     intweight = intweight.t().contiguous()
     intweight = intweight.numpy().astype(np.uint32)
@@ -35,6 +36,7 @@ def pack_to_bits(
                 qweight[row] |= intweight[j] << (bits * (j - i))
             i += 32//bits
             row += 1
+            
         elif bits == 3:
             for j in range(i, i + 10):
                 qweight[row] |= intweight[j] << (3 * (j - i))
@@ -57,49 +59,61 @@ def pack_to_bits(
     
     qweight = qweight.astype(np.int32)
     qweight = torch.from_numpy(qweight)
-    zeros -= 1;
-    zeros = zeros.numpy().astype(np.uint32)
-    qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 256 * (bits * 8)), dtype=np.uint32)
+    return qweight
+
+def unpack_from_bits(
+        qweight: torch.Tensor,
+        quantizer:Quantizer,
+        bits: int,
+        groupsize = 1024
+    ):
+    if bits not in [2,3,4,8]:
+        raise ValueError("bits must be one of [2,3,4,8]")
+
+    scales = quantizer.scale.t().contiguous()
+    zeros = quantizer.zero.t().contiguous()
+    scale_zeros = zeros * scales
+    qweight = qweight.numpy().astype(np.uint32)
+    
+    intweight = np.zeros(
+            (qweight.shape[0] // (bits * 8) * 256, qweight.shape[1]), dtype=np.uint32
+    )
     i = 0
-    col = 0
-    while col < qzeros.shape[1]:
+    row = 0
+    while row < qweight.shape[0]:
         if bits in [2,4,8]:
-            for j in range(i, i + (32//bits)):
-                qzeros[:, col] |= zeros[:, j] << (bits * (j - i))
-            i += 32//bits
-            col += 1
-        elif bits == 3:
-            for j in range(i, i + 10):
-                qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
-            i += 10
-            qzeros[:, col] |= zeros[:, i] << 30
-            col += 1
-            qzeros[:, col] |= (zeros[:, i] >> 2) & 1
-            i += 1
-            for j in range(i, i + 10):
-                qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
-            i += 10
-            qzeros[:, col] |= zeros[:, i] << 31
-            col += 1
-            qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
-            i += 1
-            for j in range(i, i + 10):
-                qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
-            i += 10
-            col += 1
-    qzeros = qzeros.astype(np.int32)
-    qzeros = torch.from_numpy(qzeros) 
-    return qweight, qzeros
+            for j in range(i + 32//bits, i):
+                intweight[j] = (qweight[row] >> (bits * (j - i))) & ((1 << bits) - 1)
+            i -= 32//bits
+            row += 1
+
+    intweight = intweight.astype(np.int32)
+    intweight = torch.from_numpy(intweight).t().contiguous()
+
+    weight = []
+    for idx in range(intweight.shape[0]):
+        g_idx = idx // groupsize
+        weight.append((intweight[:,idx] * scales[g_idx] - scale_zeros[g_idx]).to(torch.float32)[:,None])
+
+    weight = torch.cat(weight, dim=1)
+    return weight
 
 class SparseTensor():
-    def __init__(self, m: torch.Tensor, format: str, packing_bits: None) -> None:
+    def __init__(self, m: torch.Tensor, format: str, minifloats: int=-1) -> None:
         self.m = m
         self.size = m.size()
-        self.packing_bits = packing_bits
+        self.minifloats = minifloats
         self.format = format
         self._convert()
 
     def _convert(self):
+        if self.minifloats>=2:
+            quantizer = Quantizer()
+            quantizer.configure(
+                self.minifloats, perchannel=True, sym=False, mse=False
+            )
+            quantizer.find_params(self.m, weight=True)
+            self.m = quantizer.quantize(self.m)
         # flatten the matrix
         self.m = self.m.flatten()
         # get the indices of the non-zero elements
diff --git a/pack_utils_test.py b/pack_utils_test.py
index 483c91e..1e1bce6 100644
--- a/pack_utils_test.py
+++ b/pack_utils_test.py
@@ -3,33 +3,32 @@
 from safetensors import safe_open
 from pack_utils import SparseTensor, pack_to_bits, unpack_from_bits
 from safetensors.torch import save_file
+from opt_delta import hard_threshold
 
-QUANTIZED_BITS = 3
+QUANTIZED_BITS = 2
 
 
 if __name__=="__main__":
-    torch.set_printoptions(precision=12)
-    b = torch.rand((1024, 1024), dtype=torch.float16)
-
+    torch.set_printoptions(precision=4)
+    b = torch.rand((2048, 2048), dtype=torch.float32)
+    # save b
+    save_file({'wb1': b}, '.cache/b.safetensor')
     quantizer = Quantizer()
     quantizer.configure(
         QUANTIZED_BITS, perchannel=True, sym=False, mse=False
     )
     quantizer.find_params(b, weight=True)
     b_q = quantizer.quantize(b)
-
-    # count how many zeroes 
-    print(b_q)
+    # sparsed_b_q = hard_threshold(b_q, 0.99)
+    # count how many zeroes
     # sparsification
-    
     # now pack it
-    q_weight, qzero = pack_to_bits(b_q, quantizer, QUANTIZED_BITS)
-    unpacked_weight = unpack_from_bits(
-        qweight=q_weight,
-        qzeros=qzero,
-        quantizer=quantizer,
-        bits=QUANTIZED_BITS,
-        groupsize=1024,
-    )
+    q_weight = pack_to_bits(b_q, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0])
+    unpacked_weight = unpack_from_bits(q_weight, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0])
+    # check if it's the same
+    print(b)
     print(unpacked_weight)
+    print(b-unpacked_weight)
     # count how many zeroes
+    sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1)
+    sparse_t.to_disk('.cache/sparse_b.safetensor')
\ No newline at end of file
diff --git a/playground.py b/playground.py
new file mode 100644
index 0000000..67818cb
--- /dev/null
+++ b/playground.py
@@ -0,0 +1,142 @@
+import torch
+import numpy as np
+import torch.nn as nn
+
+def quantize(x, scale, zero, maxq):
+    if maxq < 0:
+        return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
+    q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
+    return scale * (q - zero)
+
+class Quantizer(nn.Module):
+
+    def __init__(self, shape=1):
+        super(Quantizer, self).__init__()
+        self.register_buffer('maxq', torch.tensor(0))
+        self.register_buffer('scale', torch.zeros(shape))
+        self.register_buffer('zero', torch.zeros(shape))
+
+    def configure(
+        self,
+        bits, perchannel=False, sym=True, 
+        mse=False, norm=2.4, grid=100, maxshrink=.8,
+    ):
+        self.maxq = torch.tensor(2 ** bits - 1)
+        self.perchannel = perchannel
+        self.sym = sym
+        self.mse = mse
+        self.norm = norm
+        self.grid = grid
+        self.maxshrink = maxshrink 
+
+    def find_params(self, x, weight=False):
+        dev = x.device
+        self.maxq = self.maxq.to(dev)
+        shape = x.shape
+        if self.perchannel:
+            if weight:
+                x = x.flatten(1)
+
+        tmp = torch.zeros(x.shape[0], device=dev)
+        xmin = torch.minimum(x.min(1)[0], tmp)
+        xmax = torch.maximum(x.max(1)[0], tmp)
+
+        if self.sym:
+            xmax = torch.maximum(torch.abs(xmin), xmax)
+            tmp = xmin < 0
+            if torch.any(tmp):
+                xmin[tmp] = -xmax[tmp]
+        tmp = (xmin == 0) & (xmax == 0)
+        xmin[tmp] = -1
+        xmax[tmp] = +1
+
+        if self.maxq < 0:
+          self.scale = xmax
+          self.zero = xmin
+        else:
+          self.scale = (xmax - xmin) / self.maxq
+          if self.sym:
+              self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
+          else:
+              self.zero = torch.round(-xmin / self.scale)
+        
+        if weight:
+            shape = [-1] + [1] * (len(shape) - 1)
+            self.scale = self.scale.reshape(shape)
+            self.zero = self.zero.reshape(shape)
+            print(shape)
+            print(self.scale)
+            print(self.zero)
+            return
+
+    def quantize(self, x):
+        if self.ready():
+            return quantize(x, self.scale, self.zero, self.maxq)
+        return x
+
+    def enabled(self):
+        return self.maxq > 0
+
+    def ready(self):
+        return torch.all(self.scale != 0)
+
+def _rounding(x, stochastic=False, minimum_stochastic_distance=0.2):
+    if stochastic:
+        x_floor = x.floor()
+        th = x - x_floor
+        if minimum_stochastic_distance > 0:
+            th[th<minimum_stochastic_distance] = 0.
+            th[th>1-minimum_stochastic_distance] = 1.
+        pr = torch.rand_like(x)
+        x_floor += (pr < th)
+        return x_floor
+    else:
+        return x.round()
+
+def _compress_nbits(x, bits, scale_method='max', scale_dims=(0,1), 
+                    stochastic=False, minimum_stochastic_distance=0.2):
+    
+    fbits = bits - 1
+    
+    if scale_method == 'max':
+        # issue: sensitive to outlier points
+        scale = x.abs().amax(scale_dims, keepdims=True)
+    elif scale_method == 'l2':
+        # ~95% confidence interval for normal distribution
+        scale = x.pow(2).mean(scale_dims, keepdims=True).sqrt() * 2 
+    else:
+        raise Exception('unkonwn scale method.')
+    # fp16 should be enough
+    scale = scale.half()
+    x = x / (scale + 1e-6)
+    
+    x = x.ldexp(torch.tensor(fbits))
+    clip_min = -(1<<fbits)
+    clip_max = (1<<fbits)-1
+
+    x = _rounding(x, stochastic=stochastic, minimum_stochastic_distance=minimum_stochastic_distance)
+    x = x.clip(clip_min, clip_max)
+    
+    x = x - clip_min
+    x = x.type(torch.uint8)
+    
+    return x, scale
+
+def compress_4bit(x, scale_method='max', scale_dims=(0,1)):
+
+    x, scale = _compress_nbits(x, bits=4, scale_method=scale_method, scale_dims=scale_dims)
+    
+    x0, x1 = x.chunk(2, -1)
+    x = (x0 << 4) + x1
+    
+    return x, scale
+
+q = torch.tensor([[1.0]]).float()
+# quantizer = Quantizer()
+# quantizer.configure(
+#     3, perchannel=True, sym=False, mse=False
+# )
+# quantizer.find_params(q, weight=True)
+# b_q = quantizer.quantize(q)
+two_bits = compress_4bit(1)
+print(two_bits)
\ No newline at end of file
diff --git a/quant_cuda.cpp b/quant_cuda.cpp
index 1bf0894..ff97571 100644
--- a/quant_cuda.cpp
+++ b/quant_cuda.cpp
@@ -10,7 +10,7 @@ void vecquant3matmul_cuda(
 void vecquant3matmul_faster_cuda(
   torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
   torch::Tensor scales, torch::Tensor zeros
-); 
+);
 
 void vecquant3matmul(
   torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,

From f764d61275e06ebe8b9dfa4f0b3db127207bd23f Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Thu, 11 May 2023 21:47:05 +0000
Subject: [PATCH 14/23] packing utils

---
 pack_utils.py      | 23 ++++++++++++--
 pack_utils_test.py |  6 ++--
 playground.py      | 74 +++++++---------------------------------------
 3 files changed, 35 insertions(+), 68 deletions(-)

diff --git a/pack_utils.py b/pack_utils.py
index 307449f..9f24ab9 100644
--- a/pack_utils.py
+++ b/pack_utils.py
@@ -82,9 +82,28 @@ def unpack_from_bits(
     row = 0
     while row < qweight.shape[0]:
         if bits in [2,4,8]:
-            for j in range(i + 32//bits, i):
+            for j in range(i, i+ 32 // bits):
                 intweight[j] = (qweight[row] >> (bits * (j - i))) & ((1 << bits) - 1)
-            i -= 32//bits
+            i += 32//bits
+            row += 1
+        elif bits == 3:
+            for j in range(i, i+10):
+                intweight[j] = (qweight[row] >> (3 * (j - i))) & 7
+            i += 10
+            intweight[i] = (qweight[row] >> 30) & 1
+            row += 1
+            intweight[i] |= (qweight[row] & 1) << 2
+            i += 1
+            for j in range(i, i+10):
+                intweight[j] = (qweight[row] >> (3 * (j - i) + 1)) & 7
+            i += 10
+            intweight[i] = (qweight[row] >> 31) & 1
+            row += 1
+            intweight[i] |= (qweight[row] & 3) << 1
+            i += 1
+            for j in range(i, i+10):
+                intweight[j] = (qweight[row] >> (3 * (j - i) + 2)) & 7
+            i += 10
             row += 1
 
     intweight = intweight.astype(np.int32)
diff --git a/pack_utils_test.py b/pack_utils_test.py
index 1e1bce6..0af7e2d 100644
--- a/pack_utils_test.py
+++ b/pack_utils_test.py
@@ -5,7 +5,7 @@
 from safetensors.torch import save_file
 from opt_delta import hard_threshold
 
-QUANTIZED_BITS = 2
+QUANTIZED_BITS = 3
 
 
 if __name__=="__main__":
@@ -30,5 +30,5 @@
     print(unpacked_weight)
     print(b-unpacked_weight)
     # count how many zeroes
-    sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1)
-    sparse_t.to_disk('.cache/sparse_b.safetensor')
\ No newline at end of file
+    # sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1)
+    # sparse_t.to_disk('.cache/sparse_b.safetensor')
\ No newline at end of file
diff --git a/playground.py b/playground.py
index 67818cb..ebeecac 100644
--- a/playground.py
+++ b/playground.py
@@ -64,9 +64,6 @@ def find_params(self, x, weight=False):
             shape = [-1] + [1] * (len(shape) - 1)
             self.scale = self.scale.reshape(shape)
             self.zero = self.zero.reshape(shape)
-            print(shape)
-            print(self.scale)
-            print(self.zero)
             return
 
     def quantize(self, x):
@@ -80,63 +77,14 @@ def enabled(self):
     def ready(self):
         return torch.all(self.scale != 0)
 
-def _rounding(x, stochastic=False, minimum_stochastic_distance=0.2):
-    if stochastic:
-        x_floor = x.floor()
-        th = x - x_floor
-        if minimum_stochastic_distance > 0:
-            th[th<minimum_stochastic_distance] = 0.
-            th[th>1-minimum_stochastic_distance] = 1.
-        pr = torch.rand_like(x)
-        x_floor += (pr < th)
-        return x_floor
-    else:
-        return x.round()
-
-def _compress_nbits(x, bits, scale_method='max', scale_dims=(0,1), 
-                    stochastic=False, minimum_stochastic_distance=0.2):
-    
-    fbits = bits - 1
-    
-    if scale_method == 'max':
-        # issue: sensitive to outlier points
-        scale = x.abs().amax(scale_dims, keepdims=True)
-    elif scale_method == 'l2':
-        # ~95% confidence interval for normal distribution
-        scale = x.pow(2).mean(scale_dims, keepdims=True).sqrt() * 2 
-    else:
-        raise Exception('unkonwn scale method.')
-    # fp16 should be enough
-    scale = scale.half()
-    x = x / (scale + 1e-6)
-    
-    x = x.ldexp(torch.tensor(fbits))
-    clip_min = -(1<<fbits)
-    clip_max = (1<<fbits)-1
-
-    x = _rounding(x, stochastic=stochastic, minimum_stochastic_distance=minimum_stochastic_distance)
-    x = x.clip(clip_min, clip_max)
-    
-    x = x - clip_min
-    x = x.type(torch.uint8)
-    
-    return x, scale
-
-def compress_4bit(x, scale_method='max', scale_dims=(0,1)):
-
-    x, scale = _compress_nbits(x, bits=4, scale_method=scale_method, scale_dims=scale_dims)
-    
-    x0, x1 = x.chunk(2, -1)
-    x = (x0 << 4) + x1
-    
-    return x, scale
-
-q = torch.tensor([[1.0]]).float()
-# quantizer = Quantizer()
-# quantizer.configure(
-#     3, perchannel=True, sym=False, mse=False
-# )
-# quantizer.find_params(q, weight=True)
-# b_q = quantizer.quantize(q)
-two_bits = compress_4bit(1)
-print(two_bits)
\ No newline at end of file
+q = torch.tensor([[1,2,3,4,5,6,7,8]]).float()
+quantizer = Quantizer()
+quantizer.configure(
+    3, perchannel=True, sym=False, mse=False
+)
+quantizer.find_params(q, weight=True)
+b_q = quantizer.quantize(q)
+# now since b_q is 8 3-bit floats, we can pack them into 3 8-bit integers
+packed_b_q = torch.zeros(3, dtype=torch.uint8)
+for i in range(3):
+    packed_b_q[i] = b_q[0][i*8:(i+1)*8].byte().sum()

From b5015ad52cfcfadcde35d536b5cc53425a331758 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Thu, 11 May 2023 22:47:02 +0000
Subject: [PATCH 15/23] packing

---
 pack_utils.py      | 2 +-
 pack_utils_test.py | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/pack_utils.py b/pack_utils.py
index 9f24ab9..cda817c 100644
--- a/pack_utils.py
+++ b/pack_utils.py
@@ -84,7 +84,7 @@ def unpack_from_bits(
         if bits in [2,4,8]:
             for j in range(i, i+ 32 // bits):
                 intweight[j] = (qweight[row] >> (bits * (j - i))) & ((1 << bits) - 1)
-            i += 32//bits
+            i += 32 // bits
             row += 1
         elif bits == 3:
             for j in range(i, i+10):
diff --git a/pack_utils_test.py b/pack_utils_test.py
index 0af7e2d..70dbb67 100644
--- a/pack_utils_test.py
+++ b/pack_utils_test.py
@@ -7,8 +7,11 @@
 
 QUANTIZED_BITS = 3
 
-
 if __name__=="__main__":
+    """
+    
+    """
+
     torch.set_printoptions(precision=4)
     b = torch.rand((2048, 2048), dtype=torch.float32)
     # save b

From c06f73d177f716a954bfa6fbabedbf50f3424729 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Thu, 11 May 2023 23:03:17 +0000
Subject: [PATCH 16/23] packing utils

---
 pack_utils.py      | 22 ++++++++++++++++------
 pack_utils_test.py | 37 ++++++++++++++++++++++---------------
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/pack_utils.py b/pack_utils.py
index cda817c..a68407b 100644
--- a/pack_utils.py
+++ b/pack_utils.py
@@ -123,8 +123,7 @@ def __init__(self, m: torch.Tensor, format: str, minifloats: int=-1) -> None:
         self.size = m.size()
         self.minifloats = minifloats
         self.format = format
-        self._convert()
-
+        
     def _convert(self):
         if self.minifloats>=2:
             quantizer = Quantizer()
@@ -141,7 +140,8 @@ def _convert(self):
         values = self.m[indices]
         self.payload = {
             'indices': indices,
-            'values': values
+            'values': values,
+            'size': torch.tensor(self.size),
         }
 
     def restore(self):
@@ -151,12 +151,22 @@ def restore(self):
         self.m = self.m.reshape(self.size)
 
     def to_disk(self, path):
+        self._convert()
         save_file(self.payload, path)
 
-    def from_disk(self, path):
+    @classmethod
+    def from_disk(cls, path):
         tensors = {}
         with safe_open(path, framework='pt', device='cpu') as f:
             for key in f.keys():
                 tensors[key] = f.get_tensor(key)
-        self.payload = tensors
-        self.restore()
\ No newline at end of file
+        m = torch.zeros(math.prod(tensors['size']), dtype=tensors['values'].dtype)
+        m[tensors['indices']] = tensors['values']
+        tensors['size'] = tensors['size'].tolist()
+        print(tensors['size'])
+        m = m.reshape(tensors['size'])
+        return cls(m, 'sparse', minifloats=-1)
+    
+    @property
+    def tensor(self):
+        return self.m
\ No newline at end of file
diff --git a/pack_utils_test.py b/pack_utils_test.py
index 70dbb67..a6befb4 100644
--- a/pack_utils_test.py
+++ b/pack_utils_test.py
@@ -9,29 +9,36 @@
 
 if __name__=="__main__":
     """
-    
+    The process:
+    1. Given a weight, quantize it first
+    2. Then do sparsification
+
+    To test our pack/unpack, we need to do the following:
+    1. After the sparsification, we pack the weight and store on disk
+    2. Compare the original weight with the unpacked weight
     """
 
     torch.set_printoptions(precision=4)
     b = torch.rand((2048, 2048), dtype=torch.float32)
     # save b
-    save_file({'wb1': b}, '.cache/b.safetensor')
+    save_file({'wb1': b}, '.cache/original_b.safetensor')
     quantizer = Quantizer()
     quantizer.configure(
         QUANTIZED_BITS, perchannel=True, sym=False, mse=False
     )
     quantizer.find_params(b, weight=True)
     b_q = quantizer.quantize(b)
-    # sparsed_b_q = hard_threshold(b_q, 0.99)
-    # count how many zeroes
-    # sparsification
-    # now pack it
-    q_weight = pack_to_bits(b_q, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0])
-    unpacked_weight = unpack_from_bits(q_weight, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0])
-    # check if it's the same
-    print(b)
-    print(unpacked_weight)
-    print(b-unpacked_weight)
-    # count how many zeroes
-    # sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1)
-    # sparse_t.to_disk('.cache/sparse_b.safetensor')
\ No newline at end of file
+    sparsed_b_q = hard_threshold(b_q, 0.99)
+
+    q_weight = pack_to_bits(sparsed_b_q, quantizer, QUANTIZED_BITS, groupsize=sparsed_b_q.shape[0])
+    sparse_t = SparseTensor(sparsed_b_q, 'wb1', minifloats=-1)
+    sparse_t.to_disk('.cache/sparse_b.safetensor')
+    # now load it back
+    restored_sparse_t = SparseTensor.from_disk('.cache/sparse_b.safetensor')
+    restored_weight = restored_sparse_t.tensor
+    print(f"Original weight: {sparsed_b_q}")
+    print(f"Restored weight: {restored_weight}")
+    print(torch.allclose(sparsed_b_q, restored_weight))
+    # count the number of non-zero elements
+    print(f"Original weight: {sparsed_b_q.nonzero().shape[0]}")
+    print(f"Restored weight: {restored_weight.nonzero().shape[0]}")
\ No newline at end of file

From afd99db7cf582559aa79bba59728d324a22900a5 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Fri, 12 May 2023 06:12:55 +0000
Subject: [PATCH 17/23] packing

---
 .gitignore                   |   3 +-
 cli.py                       |   2 +-
 core_compression_parallel.py | 143 -----------------------------------
 opt_delta.py                 |  15 ++--
 pack_utils.py                |   9 +--
 pack_utils_test.py           |   7 +-
 scripts/gptq_delta.sh        | 107 +++++++++++++-------------
 tensorio.py                  |  54 +++++++++++++
 8 files changed, 127 insertions(+), 213 deletions(-)
 delete mode 100644 core_compression_parallel.py
 create mode 100644 tensorio.py

diff --git a/.gitignore b/.gitignore
index 6eb567c..87c07aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ dist/
 outputs/
 outputs_past/
 packed_delta
-.cache
\ No newline at end of file
+.cache
+delta_outputs/
\ No newline at end of file
diff --git a/cli.py b/cli.py
index 17fef43..b0757b0 100644
--- a/cli.py
+++ b/cli.py
@@ -6,8 +6,8 @@
 from modelutils import get_opt
 from evaluation import opt_eval
 from datautils import get_loaders
-from save_and_load import save_lr_tensors, load_lr_tensors
 from core_compression import opt_delta_lr
+from save_and_load import save_lr_tensors, load_lr_tensors
 
 @torch.no_grad()
 def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_samples, decompose_only=False):
diff --git a/core_compression_parallel.py b/core_compression_parallel.py
deleted file mode 100644
index fca3c7d..0000000
--- a/core_compression_parallel.py
+++ /dev/null
@@ -1,143 +0,0 @@
-import torch
-import torch.nn as nn
-from tqdm import tqdm
-from matq import TensorQ
-from loguru import logger
-from quant import Quantizer
-from modelutils import find_layers
-import multiprocessing as mp
-@torch.no_grad()
-def opt_delta_lr(
-        model,
-        delta_model,
-        dataloader,
-        nsamples,
-        wbits,
-        sym,
-        trits,
-        rank,
-        args
-    ):
-    device = model.device
-    print("Starting LR quantizer initialization...")
-    use_cache = model.config.use_cache
-    model.config.use_cache = False
-    layers = model.model.decoder.layers
-    delta_layers = delta_model.model.decoder.layers
-
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(device)
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(device)
-
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(device)
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(device)
-    layers[0] = layers[0].to(device)
-
-    dtype = next(iter(model.parameters())).dtype
-    inps = torch.zeros(
-        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=device
-    )
-    cache = {'i': 0, 'attention_mask': None}
-
-    class Catcher(nn.Module):
-        def __init__(self, module):
-            super().__init__()
-            self.module = module
-        def forward(self, inp, **kwargs):
-            inps[cache['i']] = inp
-            cache['i'] += 1
-            cache['attention_mask'] = kwargs['attention_mask']
-            raise ValueError
-    layers[0] = Catcher(layers[0])
-    for batch in dataloader:
-        try:
-            model(batch[0].to(device))
-        except ValueError:
-            pass
-    layers[0] = layers[0].module
-    
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
-    torch.cuda.empty_cache()
-
-    outs = torch.zeros_like(inps)
-    original_outs = torch.zeros_like(inps)
-    attention_mask = cache['attention_mask']
-    
-    logger.info("Ready, creating lr quantizers...")
-    quantizers = {}
-    l_quantizers = {}
-    lr_tensors = {}
-    # parallelize this to allocate to multiple GPUs
-    def process_layer(i, device):
-        layer = delta_layers[i].to(device)
-        original_layer = layers[i].to(device)
-        subset = find_layers(layer)
-        lr_gptq = {}
-        for name in subset:
-            lr_gptq[name] = TensorQ(subset[name], rank)
-            lr_gptq[name].quantizer = Quantizer()
-            lr_gptq[name].quantizer.configure(
-                wbits,
-                perchannel=True,
-                sym=sym,
-                mse=False,
-                trits = trits,
-            )
-            lr_gptq[name].l_quantizer = Quantizer()
-            lr_gptq[name].l_quantizer.configure(
-                wbits,
-                perchannel=True,
-                sym=sym,
-                mse=False,
-                trits = trits,
-            )
-        def add_batch(name):
-            def temp(_, inp, out):
-                lr_gptq[name].add_batch_lr(inp[0].data, out.data)
-            return temp
-        handles = []
-        for name in subset:
-            handles.append(subset[name].register_forward_hook(add_batch(name)))
-        for j in range(nsamples):
-            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-
-            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-        
-        for h in handles:
-            h.remove()
-        
-        for name in subset:
-            logger.info(f"Quantizing {name}...")
-            lr_gptq[name].lr_quant(
-                percdamp = args['percdamp'],
-                groupsize = args['groupsize'],
-                actorder = args['actorder'],
-            )
-            lr_tensors[f'<R>.model.decoder.layers.{i}.{name}'] = lr_gptq[name].R
-            lr_tensors[f'<L>.model.decoder.layers.{i}.{name}'] = lr_gptq[name].L
-            
-            quantizers[f'model.decoder.layers.{i}.{name}'] = lr_gptq[name].quantizer
-            l_quantizers[f'model.decoder.layers.{i}.{name}'] = lr_gptq[name].l_quantizer
-            lr_gptq[name].free()
-        
-        for j in range(nsamples):
-            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-        layers[i] = layer.cpu()
-        del layer
-        del lr_gptq
-        torch.cuda.empty_cache()
-        inps, outs = original_outs, inps
-    num_workers = torch.cuda.device_count()
-    logger.info(f"Using {num_workers} workers...")
-    with mp.Pool(num_workers) as p:
-        p.starmap(process_layer, [(i, f'cuda:{i}') for i in range(num_workers)])
-
-    model.config.use_cache = use_cache
-    return quantizers, l_quantizers, lr_tensors
\ No newline at end of file
diff --git a/opt_delta.py b/opt_delta.py
index 401c88b..8195606 100644
--- a/opt_delta.py
+++ b/opt_delta.py
@@ -3,12 +3,13 @@
 import torch
 import pickle
 import torch.nn as nn
-
+from pack_utils import pack_to_bits, unpack_from_bits
 from gptq import *
 from modelutils import *
 from quant import *
 from transformers import AutoTokenizer, AutoModel
 import copy
+from tensorio import TensorIO, model_packing
 #from prettytable import PrettyTable
 
 def get_opt(model):
@@ -472,7 +473,7 @@ def sync():
 
 def main(args):
     print(args)
-    packed_delta = None
+    tensor_io = TensorIO('sparse')
     num_params_saved_lr = 0
     num_params = 0
     if args.load:
@@ -505,7 +506,7 @@ def main(args):
             comp_time = time.time()-tick
         else:
             quantizers = opt_sequential(model, dataloader, DEV)
-    
+    print(quantizers)
     if args.delta and args.wbits<16:
         for idx, (base_p, finetuned_p) in enumerate(zip(base_model.parameters(), model.parameters())):
             if args.sparsify_hard_threshold:
@@ -521,9 +522,10 @@ def main(args):
             #     num_params_saved_lr += torch.numel(A) -  (torch.numel(U) + torch.numel(S) + torch.numel(Vh))
             num_params += torch.numel(finetuned_p.data)
             # here we save a copy to pack, and save the delta only on disk
-            packed_delta = copy.deepcopy(finetuned_p.data)
             finetuned_p.data = (base_p.data + finetuned_p.data).clone()
-
+    if args.save_delta:
+        new_weights = model_packing(model, quantizers, bits=args.wbits)
+        torch.save(new_weights, f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz")
     if args.benchmark:
         gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
         if len(gpus) > 1:
@@ -649,6 +651,9 @@ def main(args):
         '--save-hf', action='store_true', default=False,
         help='Whether to save a huggingface model'
     )
+    parser.add_argument(
+        '--save-delta', action='store_true', default=False,
+    )
     parser.add_argument(
         '--fraction_of_zero', type=float, default=0.99,
         help='Sparsity ratio'
diff --git a/pack_utils.py b/pack_utils.py
index a68407b..5c6ace9 100644
--- a/pack_utils.py
+++ b/pack_utils.py
@@ -12,6 +12,8 @@ def pack_to_bits(
         bits: int,
         groupsize = 1024
     ):
+    if groupsize == -1:
+        groupsize = weight.shape[0]
     if bits not in [2,3,4,8]:
         raise ValueError("bits must be one of [2,3,4,8]")
     scales = quantizer.scale.t().contiguous()
@@ -125,13 +127,6 @@ def __init__(self, m: torch.Tensor, format: str, minifloats: int=-1) -> None:
         self.format = format
         
     def _convert(self):
-        if self.minifloats>=2:
-            quantizer = Quantizer()
-            quantizer.configure(
-                self.minifloats, perchannel=True, sym=False, mse=False
-            )
-            quantizer.find_params(self.m, weight=True)
-            self.m = quantizer.quantize(self.m)
         # flatten the matrix
         self.m = self.m.flatten()
         # get the indices of the non-zero elements
diff --git a/pack_utils_test.py b/pack_utils_test.py
index a6befb4..a4130e3 100644
--- a/pack_utils_test.py
+++ b/pack_utils_test.py
@@ -28,17 +28,18 @@
     )
     quantizer.find_params(b, weight=True)
     b_q = quantizer.quantize(b)
-    sparsed_b_q = hard_threshold(b_q, 0.99)
+    sparsed_b_q = hard_threshold(b_q, 0.01)
 
     q_weight = pack_to_bits(sparsed_b_q, quantizer, QUANTIZED_BITS, groupsize=sparsed_b_q.shape[0])
-    sparse_t = SparseTensor(sparsed_b_q, 'wb1', minifloats=-1)
+    sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1)
     sparse_t.to_disk('.cache/sparse_b.safetensor')
     # now load it back
     restored_sparse_t = SparseTensor.from_disk('.cache/sparse_b.safetensor')
     restored_weight = restored_sparse_t.tensor
+    # this is what we restored from disk
+    restored_weight = unpack_from_bits(restored_weight, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0])
     print(f"Original weight: {sparsed_b_q}")
     print(f"Restored weight: {restored_weight}")
-    print(torch.allclose(sparsed_b_q, restored_weight))
     # count the number of non-zero elements
     print(f"Original weight: {sparsed_b_q.nonzero().shape[0]}")
     print(f"Restored weight: {restored_weight.nonzero().shape[0]}")
\ No newline at end of file
diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh
index e73d717..f397417 100644
--- a/scripts/gptq_delta.sh
+++ b/scripts/gptq_delta.sh
@@ -5,64 +5,65 @@ CUDA_VISIBLE_DEVICES=0 python opt_delta.py \
     --delta \
     --sparsify_hard_threshold \
     --fraction_of_zero 0.95 \
+    --save-delta \
     --save-hf \
-    --groupsize 1024 &
+    --groupsize 1024
 
-CUDA_VISIBLE_DEVICES=1 python opt_delta.py \
-    --dataset wikitext2 \
-    --wbits 3 \
-    --delta \
-    --sparsify_hard_threshold \
-    --fraction_of_zero 0.95 \
-    --save-hf \
-    --groupsize 1024 &
+# CUDA_VISIBLE_DEVICES=1 python opt_delta.py \
+#     --dataset wikitext2 \
+#     --wbits 3 \
+#     --delta \
+#     --sparsify_hard_threshold \
+#     --fraction_of_zero 0.95 \
+#     --save-hf \
+#     --groupsize 1024 &
 
-CUDA_VISIBLE_DEVICES=2 python opt_delta.py \
-    --dataset wikitext2 \
-    --wbits 4 \
-    --delta \
-    --sparsify_hard_threshold \
-    --fraction_of_zero 0.95 \
-    --save-hf \
-    --groupsize 1024 &
+# CUDA_VISIBLE_DEVICES=2 python opt_delta.py \
+#     --dataset wikitext2 \
+#     --wbits 4 \
+#     --delta \
+#     --sparsify_hard_threshold \
+#     --fraction_of_zero 0.95 \
+#     --save-hf \
+#     --groupsize 1024 &
 
-CUDA_VISIBLE_DEVICES=3 python opt_delta.py \
-    --dataset wikitext2 \
-    --wbits 2 \
-    --delta \
-    --sparsify_hard_threshold \
-    --fraction_of_zero 0.99 \
-    --save-hf \
-    --groupsize 1024 &
+# CUDA_VISIBLE_DEVICES=3 python opt_delta.py \
+#     --dataset wikitext2 \
+#     --wbits 2 \
+#     --delta \
+#     --sparsify_hard_threshold \
+#     --fraction_of_zero 0.99 \
+#     --save-hf \
+#     --groupsize 1024 &
 
-CUDA_VISIBLE_DEVICES=4 python opt_delta.py \
-    --dataset wikitext2 \
-    --wbits 3 \
-    --delta \
-    --sparsify_hard_threshold \
-    --fraction_of_zero 0.99 \
-    --save-hf \
-    --groupsize 1024 &
+# CUDA_VISIBLE_DEVICES=4 python opt_delta.py \
+#     --dataset wikitext2 \
+#     --wbits 3 \
+#     --delta \
+#     --sparsify_hard_threshold \
+#     --fraction_of_zero 0.99 \
+#     --save-hf \
+#     --groupsize 1024 &
 
-CUDA_VISIBLE_DEVICES=5 python opt_delta.py \
-    --dataset wikitext2 \
-    --wbits 4 \
-    --delta \
-    --sparsify_hard_threshold \
-    --fraction_of_zero 0.99 \
-    --save-hf \
-    --groupsize 1024 &
+# CUDA_VISIBLE_DEVICES=5 python opt_delta.py \
+#     --dataset wikitext2 \
+#     --wbits 4 \
+#     --delta \
+#     --sparsify_hard_threshold \
+#     --fraction_of_zero 0.99 \
+#     --save-hf \
+#     --groupsize 1024 &
 
-CUDA_VISIBLE_DEVICES=6 python opt_delta.py \
-    --dataset wikitext2 \
-    --wbits 3 \
-    --delta \
-    --save-hf \
-    --groupsize 1024 &
+# CUDA_VISIBLE_DEVICES=6 python opt_delta.py \
+#     --dataset wikitext2 \
+#     --wbits 3 \
+#     --delta \
+#     --save-hf \
+#     --groupsize 1024 &
 
-CUDA_VISIBLE_DEVICES=7 python opt_delta.py \
-    --dataset wikitext2 \
-    --wbits 4 \
-    --delta \
-    --save-hf \
-    --groupsize 1024 &
\ No newline at end of file
+# CUDA_VISIBLE_DEVICES=7 python opt_delta.py \
+#     --dataset wikitext2 \
+#     --wbits 4 \
+#     --delta \
+#     --save-hf \
+#     --groupsize 1024 &
\ No newline at end of file
diff --git a/tensorio.py b/tensorio.py
new file mode 100644
index 0000000..26b5fab
--- /dev/null
+++ b/tensorio.py
@@ -0,0 +1,54 @@
+import math
+import torch
+from safetensors import safe_open
+from safetensors.torch import save_model
+from modelutils import find_layers
+from pack_utils import pack_to_bits
+
+class TensorIO():
+    def __init__(self, format: str, tensors=None) -> None:
+        self.format = format
+        if tensors is None:
+            self.tensors = {}
+        else:
+            self.tensors = tensors
+    def add_tensor(self, idx, tensor):
+        tensor = tensor.flatten()
+        # assume that the tensor is sparse
+        indices = torch.nonzero(tensor)
+        values = tensor[indices]
+        self.tensors[f"{idx}_indices"] = indices
+        self.tensors[f"{idx}_values"] = values
+        self.tensors[f"{idx}_size"] = torch.tensor(tensor.size())
+    
+    def to_disk(self, path):
+        torch.save(self.tensors, path)
+
+    @classmethod
+    def from_disk(cls, path):
+        tensors = {}
+        with safe_open(path, framework='pt', device='cpu') as f:
+            for key in f.keys():
+                tensors[key] = f.get_tensor(key)
+        # restore the tensors
+        for key in tensors.keys():
+            m = torch.zeros(math.prod(tensors[f"{key}_size"]), dtype=tensors[f'{key}_values'].dtype)
+    
+            m[tensors[f"{key}_indices"]] = tensors[f"{key}_values"]
+            tensors[f"{key}_size"] = tensors[f"{key}_size"].tolist()
+            m = m.reshape(tensors[f"{key}_size"])
+
+            tensors[key] = m
+        return cls('sparse', tensors=tensors)
+
+def model_packing(model, quantizers, bits):
+    layers = find_layers(model)
+    layers = {n: layers[n] for n in quantizers}
+    qlayers = find_layers(model, )
+    print('Packing ...')
+    new_weights = {}
+    for name in qlayers:
+        quantizers[name] = quantizers[name].cpu()
+        new_weights[name] = pack_to_bits(layers[name].weight.data, quantizers[name], bits, groupsize=-1)
+    print('Done.')
+    return new_weights
\ No newline at end of file

From 750dcb018753b5c1c290a8a108f32714c7409c00 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Fri, 12 May 2023 13:07:03 +0000
Subject: [PATCH 18/23] autotuning

---
 compress_utils.py     | 351 +++++++++++++++++++++++
 gptq.py               |  44 +--
 opt_delta.py          |  22 +-
 opt_delta_autotune.py | 632 ++++++++++++++++++++++++++++++++++++++++++
 pack_utils_test.py    |   2 +-
 playground.py         | 110 ++------
 scripts/gptq_delta.sh |   2 +-
 tensorio.py           |  18 +-
 8 files changed, 1053 insertions(+), 128 deletions(-)
 create mode 100644 compress_utils.py
 create mode 100644 opt_delta_autotune.py

diff --git a/compress_utils.py b/compress_utils.py
new file mode 100644
index 0000000..20e20f1
--- /dev/null
+++ b/compress_utils.py
@@ -0,0 +1,351 @@
+import cupy
+import math
+import torch
+import numpy as np
+from torch.utils.dlpack import to_dlpack, from_dlpack
+from quant import Quantizer
+def cupy_to_tensor(x):  # convert a CuPy array to a torch tensor through DLPack
+    return from_dlpack(x.toDlpack())
+
+def tensor_to_cupy(x):  # convert a torch tensor to a CuPy array through DLPack
+    return cupy.fromDlpack(to_dlpack(x))
+
+def pack_uint8_tensor(x):
+    """Pack the elements of `x` (nonzero -> 1) into the bits of a flat uint8 tensor."""
+    on_cpu = x.device == torch.device('cpu')
+    if on_cpu:
+        # host tensors go through NumPy
+        return torch.from_numpy(np.packbits(x.numpy()))
+    # everything else is routed through CuPy via DLPack
+    packed = cupy.packbits(tensor_to_cupy(x))
+    return cupy_to_tensor(packed)
+
+def unpack_uint8_tensor(x):
+    """Expand every uint8 element of `x` into its 8 bits, returned as a flat 0/1 uint8 tensor."""
+    on_cpu = x.device == torch.device('cpu')
+    if on_cpu:
+        # host tensors go through NumPy
+        return torch.from_numpy(np.unpackbits(x.numpy()))
+    # everything else is routed through CuPy via DLPack
+    bits = cupy.unpackbits(tensor_to_cupy(x))
+    return cupy_to_tensor(bits)
+
+def pack_low_bit_tensor(x, bits):
+    """Keep the `bits` low-order bits of each uint8 element of `x` and pack them back to back.
+
+    Returns a flat uint8 tensor (the final byte is zero-padded by packbits).
+    """
+    if x.device == torch.device('cpu'):
+        # NumPy path: expand every byte to 8 bits, keep the trailing `bits`, re-pack
+        kept = np.unpackbits(x.numpy()).reshape(*x.shape, 8)[..., -bits:]
+        return torch.from_numpy(np.packbits(kept))
+    # CuPy path, same steps for non-CPU tensors
+    assert x.dtype == torch.uint8
+    expanded = cupy.unpackbits(tensor_to_cupy(x))
+    kept = expanded.reshape(*x.shape, 8)[..., -bits:]
+    packed = cupy.packbits(kept)
+    return cupy_to_tensor(packed)
+
+def unpack_low_bit_tensor(x, bits, original_shape):
+    """Inverse of pack_low_bit_tensor: rebuild a uint8 tensor of shape `original_shape`.
+
+    `x` holds `bits`-wide values packed back to back; trailing padding bits are dropped.
+    """
+    # number of payload bits; anything beyond it is packbits padding
+    n_bits = np.prod(original_shape)*bits
+    # left-pad every `bits`-wide group with zeros so it fills one byte again
+    widths = ((0,0), (8-bits, 0))
+    if x.device == torch.device('cpu'):
+        # NumPy path for host tensors
+        groups = np.unpackbits(x.numpy())[:n_bits].reshape(-1, bits)
+        restored = torch.from_numpy(np.packbits(np.pad(groups, widths)))
+    else:
+        # CuPy path for everything else
+        groups = cupy.unpackbits(tensor_to_cupy(x))[:n_bits].reshape(-1, bits)
+        restored = cupy_to_tensor(cupy.packbits(cupy.pad(groups, widths)))
+    return restored.view(original_shape)
+
+
+def pin_memory(array):  # copy `array` into pinned host memory allocated through CuPy
+    mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
+    ret = np.frombuffer(mem, array.dtype, array.size).reshape(array.shape)  # NumPy view over the pinned buffer
+    ret[...] = array
+    return ret
+
+
+def _rounding(x, stochastic=False, minimum_stochastic_distance=0.2):
+    """Round `x` to integer values: to nearest by default, or randomly up/down by its fraction."""
+    if not stochastic:
+        return x.round()
+    lower = x.floor()
+    frac = x - lower
+    if minimum_stochastic_distance > 0:
+        # fractions within the margin of 0 or 1 are snapped, making them deterministic
+        frac[frac<minimum_stochastic_distance] = 0.
+        frac[frac>1-minimum_stochastic_distance] = 1.
+    lower += (torch.rand_like(x) < frac)
+    return lower
+
+
+def _compress_nbits(x, bits, scale_method='max', scale_dims=(0,1),
+                    stochastic=False, minimum_stochastic_distance=0.2):
+    """Quantize `x` to unsigned `bits`-bit codes stored in uint8.
+
+    Returns (codes, scale): `scale` is fp16, reduced over `scale_dims` with keepdims.
+    Raises ValueError for an unknown `scale_method` ('max' and 'l2' are supported).
+    """
+    fbits = bits - 1
+    if scale_method == 'max':
+        # issue: sensitive to outlier points
+        scale = x.abs().amax(scale_dims, keepdims=True)
+    elif scale_method == 'l2':
+        # ~95% confidence interval for normal distribution
+        scale = x.pow(2).mean(scale_dims, keepdims=True).sqrt() * 2
+    else:
+        raise ValueError(f'unknown scale method: {scale_method!r}')
+    # fp16 should be enough
+    scale = scale.half()
+    x = x / (scale + 1e-6)
+    # scale to the signed integer range [-2**fbits, 2**fbits - 1]
+    x = x.ldexp(torch.tensor(fbits))
+    clip_min = -(1<<fbits)
+    clip_max = (1<<fbits)-1
+    x = _rounding(x, stochastic=stochastic, minimum_stochastic_distance=minimum_stochastic_distance)
+    x = x.clip(clip_min, clip_max)
+    # shift so the codes are non-negative and fit an unsigned byte
+    x = (x - clip_min).type(torch.uint8)
+    return x, scale
+
+
+def _decompress_nbits(x, scale, bits):
+    """Turn unsigned `bits`-bit codes back into floats scaled by `scale`."""
+    fbits = bits - 1
+    # compression added 2**fbits to make the codes unsigned; the same value
+    # is also the number of levels per unit of scale
+    half_range = 1 << fbits
+    # undo the unsigned shift
+    signed = x.float() - half_range
+    # valid codes now lie in [-half_range, half_range - 1]; normalise and
+    # re-apply the scale
+    restored = signed / half_range * scale
+    return restored
+
+
+def compress_8bit(x, scale_method='max', scale_dims=(0,1)):
+    """Quantize `x` to 8-bit codes; returns the (codes, scale) pair from _compress_nbits."""
+    # one code per byte, so nothing needs to be packed afterwards
+    codes, scale = _compress_nbits(x, bits=8, scale_method=scale_method, scale_dims=scale_dims)
+    return codes, scale
+
+
+def decompress_8bit(x, scale):
+    """Invert compress_8bit: map the 8-bit codes in `x` back to floats using `scale`."""
+    # codes are stored one per byte, so there is no unpacking step
+    restored = _decompress_nbits(x, scale, bits=8)
+    return restored
+
+def compress_4bit(x, scale_method='max', scale_dims=(0,1)):
+    """Quantize `x` to 4-bit codes and store two codes per byte along the last dim."""
+    codes, scale = _compress_nbits(x, bits=4, scale_method=scale_method, scale_dims=scale_dims)
+    # first half of the last dim -> high nibble, second half -> low nibble
+    high, low = codes.chunk(2, -1)
+    packed = (high << 4) + low
+    # NOTE(review): presumably requires an even last dim so both halves match -- confirm
+    return packed, scale
+
+
+def decompress_4bit(x, scale):
+    """Invert compress_4bit: split each byte into two 4-bit codes and rescale to floats.
+
+    The high nibbles come first along the last dim, followed by the low nibbles.
+    """
+    bitmask = 15
+    high = (x >> 4)
+    low = (x & bitmask)
+    # same order in which compress_4bit chunked the codes
+    codes = torch.cat([high, low], -1)
+    restored = _decompress_nbits(codes, scale, bits=4)
+    return restored
+
+
+def compress_2bit(x, scale_method='max', scale_dims=(0,1)):
+    """Quantize `x` to 2-bit codes and store four codes per byte along the last dim."""
+    codes, scale = _compress_nbits(x, bits=2, scale_method=scale_method, scale_dims=scale_dims)
+    # quarter k of the last dim lands in bit pair k, counted from the most significant pair
+    q0, q1, q2, q3 = codes.chunk(4, -1)
+    packed = (q0 << 6) + (q1 << 4) + (q2 << 2) + q3
+    # NOTE(review): presumably requires the last dim to be a multiple of 4 -- confirm
+    return packed, scale
+
+
+def decompress_2bit(x, scale):
+    """Invert compress_2bit: split each byte into four 2-bit codes and rescale to floats."""
+    bitmask = 3
+    # most significant pair first, i.e. the order in which compress_2bit
+    # chunked the codes along the last dim
+    planes = [
+        (x >> 6),
+        (x >> 4) & bitmask,
+        (x >> 2) & bitmask,
+        x & bitmask,
+    ]
+    codes = torch.cat(planes, -1)
+    return _decompress_nbits(codes, scale, bits=2)
+
+
+
+def compress_flexible_nbits(x, bits, scale_method='max', scale_dims=(0,1)):
+    """Quantize `x` to `bits`-bit codes and bit-pack them; returns (packed, scale)."""
+    # support any bits
+    # CUDA only
+    codes, scale = _compress_nbits(x, bits=bits, scale_method=scale_method, scale_dims=scale_dims)
+    # squeeze the codes together so each one occupies exactly `bits` bits
+    packed = pack_low_bit_tensor(codes, bits)
+    # the packed tensor is flat; the shape is needed again to decompress
+    return packed, scale
+
+
+def decompress_flexible_nbits(x, scale, bits, original_shape):
+    """Invert compress_flexible_nbits; `original_shape` is the shape before packing."""
+    # support any bits, but need to know original_shape
+    # CUDA only
+    codes = unpack_low_bit_tensor(x, bits, original_shape)
+    # codes are one per byte again; map them back to floats
+    restored = _decompress_nbits(codes, scale, bits=bits)
+    # floats, scaled by `scale`
+    return restored
+
+
+
+def compress_nbits(x, bits, scale_method='max', scale_dims=(0,1)):
+    """Compress `x` with the fixed-width packer for `bits` in (8, 4, 2); returns (packed, scale).
+
+    Raises ValueError for other widths (previously an UnboundLocalError on `scale`).
+    """
+    if bits not in (8, 4, 2):
+        raise ValueError(f'unsupported bits: {bits} (expected 8, 4 or 2)')
+    packers = {8: compress_8bit, 4: compress_4bit, 2: compress_2bit}
+    return packers[bits](x, scale_method=scale_method, scale_dims=scale_dims)
+
+
+def decompress_nbits(x, scale, bits):
+    """Invert compress_nbits for `bits` in (8, 4, 2).
+
+    Raises ValueError for other widths (previously an UnboundLocalError on `y`).
+    """
+    if bits not in (8, 4, 2):
+        raise ValueError(f'unsupported bits: {bits} (expected 8, 4 or 2)')
+    unpackers = {8: decompress_8bit, 4: decompress_4bit, 2: decompress_2bit}
+    return unpackers[bits](x, scale)
+
+def _compress_nbits_by_bucket(x, bits, scale_method='max', bucket_size=512,
+                              stochastic=False, minimum_stochastic_distance=0.2):
+    """Quantize `x` to `bits`-bit codes with one scale per column of `x.view(bucket_size, -1)`.
+
+    Returns (codes, scale) with uint8 codes of shape (bucket_size, -1) and an fp16 scale.
+    `bits == 1` keeps only the sign of each element and uses the column RMS as scale.
+    Raises ValueError for an unknown `scale_method` ('max' and 'l2' are supported).
+    """
+    x = x.view(bucket_size, -1)
+    if bits == 1:
+        # RMS of each column
+        scale = (x.norm(dim=0) / (bucket_size**0.5)).half()
+        # 1 for non-negative entries, 0 for negative ones
+        x = (x >= 0)
+        x = x.type(torch.uint8)
+        return x, scale
+
+    fbits = bits - 1
+
+    if scale_method == 'max':
+        # issue: sensitive to outlier points
+        scale = x.abs().amax([0], keepdims=True)
+    elif scale_method == 'l2':
+        # ~95% confidence interval for normal distribution
+        scale = x.pow(2).mean([0], keepdims=True).sqrt() * 2
+    else:
+        raise ValueError(f'unknown scale method: {scale_method!r}')
+    # fp16 should be enough
+    scale = scale.half()
+    x = x / (scale + 1e-6)
+
+    # scale to the signed integer range [-2**fbits, 2**fbits - 1]
+    x = x.ldexp(torch.tensor(fbits))
+    clip_min = -(1<<fbits)
+    clip_max = (1<<fbits)-1
+
+    x = _rounding(x, stochastic=stochastic, minimum_stochastic_distance=minimum_stochastic_distance)
+    x = x.clip(clip_min, clip_max)
+
+    # shift so the codes are non-negative and fit an unsigned byte
+    x = x - clip_min
+    x = x.type(torch.uint8)
+    return x, scale
+
+
+def compress_flexible_nbits_by_bucket(x, bits, scale_method='max', bucket_size=512,
+                                      stochastic=False, minimum_stochastic_distance=0.2):
+    """Bucketed variant of compress_flexible_nbits; returns (bit-packed codes, scale)."""
+    # support any bits
+    # CUDA only
+
+    # a tensor smaller than one bucket is treated as a single bucket
+    bucket_size = min(bucket_size, x.numel())
+
+    codes, scale = _compress_nbits_by_bucket(
+        x, bits=bits, scale_method=scale_method, bucket_size=bucket_size,
+        stochastic=stochastic, minimum_stochastic_distance=minimum_stochastic_distance)
+    # keep `bits` bits per code and pack them back to back
+    packed = pack_low_bit_tensor(codes, bits)
+    return packed, scale
+
+
+def decompress_flexible_nbits_by_bucket(x, scale, bits, original_shape, bucket_size=512):
+    """Invert compress_flexible_nbits_by_bucket and return a tensor of `original_shape`.
+
+    Args:
+        x: bit-packed codes.
+        scale: scale returned by the compression call.
+        bits: code width used for compression.
+        original_shape: shape of the tensor that was compressed.
+        bucket_size: value used for compression; capped at the element count here too.
+    """
+    # support any bits, but need to know original_shape
+    # CUDA only
+    numel = math.prod(original_shape)
+    bucket_size = min(bucket_size, numel)
+
+    # back to one code per byte, then to the (bucket_size, -1) layout of the scales
+    codes = unpack_low_bit_tensor(x, bits, original_shape)
+    codes = codes.view(bucket_size, -1)
+
+    if bits == 1:
+        # map the stored {0, 1} to {-1, +1}, then apply the per-column scale
+        signs = (codes.half() - 0.5)*2
+        values = signs * scale.unsqueeze(0)
+    else:
+        values = _decompress_nbits(codes, scale, bits=bits)
+
+    # callers get the shape they passed in, not the bucket layout
+    values = values.view(original_shape)
+    return values
+
+if __name__=="__main__":
+    # Manual round-trip smoke test; x is created on 'cuda', so a CUDA device is needed.
+    x = torch.randn((512, 512), dtype=torch.float32, device='cuda')
+    # Quantize x with a 4-bit per-channel asymmetric Quantizer first.
+    print("original")
+    print(x)
+    quantizer = Quantizer()
+    quantizer.configure(
+        4, perchannel=True, sym=False, mse=False
+    )
+    quantizer.find_params(x, weight=True)
+    b_q = quantizer.quantize(x)
+    # Pack the quantizer output to 4 bits and unpack it again.
+    packed_x, scale = compress_flexible_nbits(b_q, 4)
+    unpacked_x = decompress_flexible_nbits(packed_x,scale=scale, bits=4, original_shape=x.shape)
+    # NOTE(review): the check compares against x, not b_q (the tensor that was packed) -- confirm.
+    print("unpacked")
+    print(unpacked_x)
+    print(f"are they equal? {torch.allclose(x, unpacked_x)}")
\ No newline at end of file
diff --git a/gptq.py b/gptq.py
index 8f719e1..f57edb0 100644
--- a/gptq.py
+++ b/gptq.py
@@ -11,9 +11,18 @@
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False
 
+def hard_threshold(x, fraction_of_zero=0.1):
+    """Zero the entries of `x` whose magnitude is not above the `fraction_of_zero` quantile."""
+    magnitudes = x.abs()
+    ordered, _ = torch.sort(magnitudes.view(-1))
+    # clamp so fraction_of_zero == 1.0 no longer indexes past the end
+    threshold = ordered[min(int(x.numel() * fraction_of_zero), x.numel() - 1)]
+    return magnitudes.gt(threshold).to(x.dtype) * x  # mask stays on x's device and dtype
+
 class GPTQ:
     def __init__(self, layer):
         self.layer = layer
+        self.original_weight = layer.weight.data.clone()
         self.dev = self.layer.weight.device
         W = layer.weight.data.clone()
         if isinstance(self.layer, nn.Conv2d):
@@ -26,26 +35,16 @@ def __init__(self, layer):
         self.nsamples = 0
 
     def add_batch(self, inp, out):
-        if DEBUG:
-            self.inp1 = inp
-            self.out1 = out
+        self.inp1 = inp
+        self.out1 = out
         if len(inp.shape) == 2:
             inp = inp.unsqueeze(0)
         tmp = inp.shape[0]
-        if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D):
+        if isinstance(self.layer, nn.Linear):
             if len(inp.shape) == 3:
                 inp = inp.reshape((-1, inp.shape[-1]))
             inp = inp.t()
-        if isinstance(self.layer, nn.Conv2d):
-            unfold = nn.Unfold(
-                self.layer.kernel_size,
-                dilation=self.layer.dilation,
-                padding=self.layer.padding,
-                stride=self.layer.stride
-            )
-            inp = unfold(inp)
-            inp = inp.permute([1, 0, 2])
-            inp = inp.flatten(1)
+        
         self.H *= self.nsamples / (self.nsamples + tmp)
         self.nsamples += tmp
         # inp = inp.float()
@@ -54,7 +53,7 @@ def add_batch(self, inp, out):
         self.H += inp.matmul(inp.t())
 
     def fasterquant(
-        self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False
+        self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=-1
     ):
         W = self.layer.weight.data.clone()
         if isinstance(self.layer, nn.Conv2d):
@@ -69,7 +68,8 @@ def fasterquant(
             self.quantizer.find_params(W, weight=True)
 
         H = self.H
-        del self.H
+        if write:
+            del self.H
         dead = torch.diag(H) == 0
         H[dead, dead] = 1
         W[:, dead] = 0
@@ -141,9 +141,15 @@ def fasterquant(
 
         if isinstance(self.layer, transformers.Conv1D):
             Q = Q.t()
-        self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
-        if DEBUG:
-            print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
+        # here report the loss of the quantized layer vs. the original layer
+        new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype)
+        if sparsity >= 0:
+            sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=sparsity)
+        else:
+            sparsed_new_weight = new_weight
+        if write:
+            self.layer.weight.data = sparsed_new_weight
+        return torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
 
     def free(self):
         if DEBUG:
diff --git a/opt_delta.py b/opt_delta.py
index 8195606..38e3547 100644
--- a/opt_delta.py
+++ b/opt_delta.py
@@ -42,7 +42,7 @@ def opt_sequential_delta(model, delta_model, dataloader, dev):
     layers = model.model.decoder.layers
     delta_layers = delta_model.model.decoder.layers
 
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) 
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
     model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
     
     if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
@@ -506,13 +506,13 @@ def main(args):
             comp_time = time.time()-tick
         else:
             quantizers = opt_sequential(model, dataloader, DEV)
-    print(quantizers)
+    
     if args.delta and args.wbits<16:
         for idx, (base_p, finetuned_p) in enumerate(zip(base_model.parameters(), model.parameters())):
-            if args.sparsify_hard_threshold:
-                print('Hard Thresholding...')
-                W = finetuned_p.data
-                finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero)
+            # if args.sparsify_hard_threshold:
+            #     print('Hard Thresholding...')
+            #     W = finetuned_p.data
+            #     finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero)
             # if args.rank>0 and len(finetuned_p.shape) == 2:
             #     print('Finding Low Rank Approximation...')
             #     A = finetuned_p.data.float()
@@ -521,11 +521,15 @@ def main(args):
             #     finetuned_p.data =  A.half()
             #     num_params_saved_lr += torch.numel(A) -  (torch.numel(U) + torch.numel(S) + torch.numel(Vh))
             num_params += torch.numel(finetuned_p.data)
-            # here we save a copy to pack, and save the delta only on disk
             finetuned_p.data = (base_p.data + finetuned_p.data).clone()
+    
     if args.save_delta:
-        new_weights = model_packing(model, quantizers, bits=args.wbits)
-        torch.save(new_weights, f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz")
+        new_weights, scale = model_packing(model, quantizers, bits=args.wbits)
+        torch.save({
+            'weight': new_weights,
+            'scale': scale,
+        }, f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz")
+
     if args.benchmark:
         gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
         if len(gpus) > 1:
diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py
new file mode 100644
index 0000000..db2b8a3
--- /dev/null
+++ b/opt_delta_autotune.py
@@ -0,0 +1,632 @@
+import copy
+import time
+import json
+import torch
+import pickle
+from gptq import *
+from quant import *
+import torch.nn as nn
+from modelutils import *
+from loguru import logger
+from tensorio import TensorIO, model_packing
+from transformers import AutoTokenizer, AutoModel
+# from prettytable import PrettyTable
+
+def get_opt(model):
+    """Load the OPT checkpoint `model` in fp16 and attach its max position count as `seqlen`."""
+    import torch
+
+    def _noop(*args, **kwargs):
+        pass
+
+    # turn the random initialisers into no-ops before the model is constructed
+    for init_name in ("kaiming_uniform_", "uniform_", "normal_"):
+        setattr(torch.nn.init, init_name, _noop)
+    from transformers import OPTForCausalLM
+
+    loaded = OPTForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
+    loaded.seqlen = loaded.config.max_position_embeddings
+    return loaded
+
+
+def hard_threshold(x, fraction_of_zero=0.1):
+    """Zero the entries of `x` whose magnitude is not above the `fraction_of_zero` quantile."""
+    magnitudes = x.abs()
+    ordered, _ = torch.sort(magnitudes.view(-1))
+    # clamp so fraction_of_zero == 1.0 no longer indexes past the end
+    threshold = ordered[min(int(x.numel() * fraction_of_zero), x.numel() - 1)]
+    return magnitudes.gt(threshold).to(x.dtype) * x  # mask stays on x's device and dtype
+
+
+@torch.no_grad()
+def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.1):
+    """Quantize `delta_model` layer by layer, picking wbits/sparsity per sub-layer.
+
+    Each sub-layer is quantized with every (wbit, sparsity) candidate without writing
+    back; the candidate with the highest compression rate whose loss is <= `tol` is
+    then applied (the lowest-compression candidate if none qualifies).
+
+    Returns (quantizers, tuned_configs): the applied quantizer per full layer name, and
+    the per-candidate losses plus the final 'choice' for every sub-layer.
+    """
+    search_space = {
+        "wbits": [2,3,4],
+        "sparsities": [-1, 0.33, 0.5, 0.67, 0.9]
+    }
+    base_floats = 16
+    # candidate key -> (wbit, sparsity), and candidate key -> compression rate
+    candidates = {}
+    compression_rates = {}
+    for wbit in search_space['wbits']:
+        for sparsity in search_space['sparsities']:
+            candidates[f'wbit.{wbit}_sparsity.{sparsity}'] = (wbit, sparsity)
+            compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity) if sparsity >=0 else base_floats / wbit
+    # Rank once, highest compression first. Sorting inside the layer loop rebound
+    # `compression_rates` to a list, so `.items()` failed on the second sub-layer.
+    ranked_configs = sorted(compression_rates, key=compression_rates.get, reverse=True)
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+    delta_layers = delta_model.model.decoder.layers
+
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
+
+    if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
+    if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {"i": 0, "attention_mask": None}
+
+    # Stand-in for layer 0 that records its input and aborts the forward pass.
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+
+        def forward(self, inp, **kwargs):
+            inps[cache["i"]] = inp
+            cache["i"] += 1
+            cache["attention_mask"] = kwargs["attention_mask"]
+            raise ValueError
+
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
+    if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
+    if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    original_outs = torch.zeros_like(inps)
+    attention_mask = cache["attention_mask"]
+
+    print("Ready.")
+    tuned_params = {}
+    tuned_configs = {}
+    quantizers = {}
+    for i in range(len(delta_layers)):
+        layer = delta_layers[i].to(dev)
+        original_layer = layers[i].to(dev)
+        subset = find_layers(layer)
+        for name in subset:
+            tuned_params[f'{i}_{name}'] = {}
+            tuned_configs[f'{i}_{name}'] = {}
+            for config, (wbit, sparsity) in candidates.items():
+                entry = {'gptq': GPTQ(subset[name])}
+                entry['gptq'].quantizer = Quantizer()
+                entry['gptq'].quantizer.configure(
+                    wbit, perchannel=True, sym=args.sym, mse=False, trits=args.trits
+                )
+                tuned_params[f'{i}_{name}'][config] = entry
+
+        def add_batch(name):
+            def tmp(_, inp, out):
+                for config in candidates:
+                    tuned_params[f'{i}_{name}'][config]['gptq'].add_batch(inp[0].data, out.data)
+            return tmp
+
+        handles = []
+        for name in subset:
+            handles.append(subset[name].register_forward_hook(add_batch(name)))
+
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(
+                inps[j].unsqueeze(0), attention_mask=attention_mask
+            )[0]
+
+        for h in handles:
+            h.remove()
+
+        for name in subset:
+            logger.info(f"Quantizing {i}.{name} ...")
+            for config, (wbit, sparsity) in candidates.items():
+                # NOTE(review): `sparsity` is not passed to fasterquant, so the sparsity
+                # candidates of one wbit run with the same settings -- confirm intent.
+                loss=tuned_params[f'{i}_{name}'][config]['gptq'].fasterquant(
+                    percdamp=args.percdamp,
+                    groupsize=args.groupsize,
+                    actorder=args.act_order,
+                    write=False
+                )
+                tuned_configs[f'{i}_{name}'][config] = {
+                    'loss': loss.item()
+                }
+                logger.info(f"wbit: {wbit}; sparsity: {sparsity}; loss: {loss}")
+            # Within the tol, pick the highest compression rate; otherwise fall back to
+            # the lowest one. Candidate keys are reused as-is: re-parsing them turned
+            # the int sparsity -1 into -1.0 and produced keys that were never stored.
+            losses = tuned_configs[f'{i}_{name}']
+            best_config = next(
+                (c for c in ranked_configs if losses[c]['loss'] <= tol), ranked_configs[-1]
+            )
+            best_wbit, best_sparsity = candidates[best_config]
+            best_loss = losses[best_config]['loss']
+            # redo the actual work
+            logger.info(f"Applying wbit={best_wbit}, sparsity={best_sparsity} ...")
+            best = tuned_params[f'{i}_{name}'][best_config]
+            best['gptq'].fasterquant(
+                percdamp=args.percdamp,
+                groupsize=args.groupsize,
+                actorder=args.act_order,
+                write=True
+            )
+            # the quantizer of the applied candidate (was keyed by the stale `sparsity`)
+            quantizers["model.decoder.layers.%d.%s" % (i, name)] = best['gptq'].quantizer
+            best['gptq'].free()
+            tuned_configs[f'{i}_{name}']['choice'] = {
+                'best_wbit': best_wbit,
+                'best_sparsity': best_sparsity,
+                'best_loss': best_loss
+            }
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(
+                inps[j].unsqueeze(0), attention_mask=attention_mask
+            )[0]
+
+        # NOTE(review): original_layer is not moved back off `dev` here -- confirm.
+        layers[i] = layer.cpu()
+        del layer
+        for key in tuned_params.keys():
+            if key.startswith(f'{i}_'):
+                for config in candidates:
+                    del tuned_params[key][config]['gptq']
+        torch.cuda.empty_cache()
+
+        # the next layer consumes the outputs computed by original_layer
+        inps, outs = original_outs, inps
+
+    model.config.use_cache = use_cache
+
+    return quantizers, tuned_configs
+
+@torch.no_grad()
+def opt_eval(model, testenc, dev):  # returns the perplexity of `model` on `testenc` as a Python float
+    print("Evaluating ...")
+    # The token stream is cut into nsamples windows of model.seqlen tokens each.
+    testenc = testenc.input_ids
+    nsamples = testenc.numel() // model.seqlen
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+    # Only the embeddings, optional projections and the first layer go to dev for the capture.
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
+    if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
+    if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {"i": 0, "attention_mask": None}
+    # Catcher stores the input of the first layer and aborts the forward pass with ValueError.
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+
+        def forward(self, inp, **kwargs):
+            inps[cache["i"]] = inp
+            cache["i"] += 1
+            cache["attention_mask"] = kwargs["attention_mask"]
+            raise ValueError
+
+    layers[0] = Catcher(layers[0])
+    for i in range(nsamples):
+        batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(dev)
+        try:
+            model(batch)
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+    # Capture finished: move the modules used for it back to the CPU.
+    layers[0] = layers[0].cpu()
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
+    if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
+    if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache["attention_mask"]
+    # Run the captured activations through the decoder with one layer on dev at a time.
+    for i in range(len(layers)):
+        # print(i)
+        layer = layers[i].to(dev)
+        for j in range(nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        layers[i] = layer.cpu()
+        del layer
+        torch.cuda.empty_cache()
+        inps, outs = outs, inps
+    # After the swap above `inps` holds the output of the last layer.
+    if model.model.decoder.final_layer_norm is not None:
+        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(
+            dev
+        )
+    if model.model.decoder.project_out is not None:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
+    model.lm_head = model.lm_head.to(dev)
+
+    testenc = testenc.to(dev)
+    nlls = []
+    for i in range(nsamples):
+        hidden_states = inps[i].unsqueeze(0)
+        if model.model.decoder.final_layer_norm is not None:
+            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
+        if model.model.decoder.project_out is not None:
+            hidden_states = model.model.decoder.project_out(hidden_states)
+        lm_logits = model.lm_head(hidden_states)
+        shift_logits = lm_logits[:, :-1, :].contiguous()
+        shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:]
+        loss_fct = nn.CrossEntropyLoss()
+        loss = loss_fct(
+            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
+        )
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
+    print(ppl.item())
+
+    model.config.use_cache = use_cache
+    return ppl.item()
+
+
+# TODO: perform packing on GPU
+def opt_pack3(model, quantizers):  # convert `model` with make_quant3, pack every Quant3Linear layer, return the model
+    layers = find_layers(model)
+    layers = {n: layers[n] for n in quantizers}  # keep only layers that have a quantizer
+    make_quant3(model, quantizers, faster=args.faster_kernel)  # `args` is a module-level global
+    qlayers = find_layers(model, [Quant3Linear])
+    print("Packing ...")
+    for name in qlayers:
+        print(name)
+        quantizers[name] = quantizers[name].cpu()
+        qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero)  # packs from the layer captured before make_quant3
+    print("Done.")
+    return model
+
+
+def load_quant3(model, checkpoint):
+    """Build an OPT model from the config of `model`, convert it with make_quant3 and load `checkpoint`."""
+    from transformers import OPTConfig, OPTForCausalLM
+    config = OPTConfig.from_pretrained(model)
+
+    def noop(*args, **kwargs):
+        pass
+
+    # the weights are loaded from the checkpoint below, so initialisers become no-ops
+    torch.nn.init.kaiming_uniform_ = noop
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
+    transformers.modeling_utils._init_weights = False
+
+    torch.set_default_dtype(torch.half)
+    try:
+        model = OPTForCausalLM(config)
+    finally:
+        # restore the process-wide default even if model construction fails
+        torch.set_default_dtype(torch.float)
+    model = model.eval()
+    layers = find_layers(model)
+    for name in ["model.decoder.project_out", "model.decoder.project_in", "lm_head"]:
+        layers.pop(name, None)
+    make_quant3(model, layers, faster=args.faster_kernel)
+    print("Loading model ...")
+    # NOTE: torch.load unpickles the file -- only load checkpoints from trusted sources
+    model.load_state_dict(torch.load(checkpoint))
+    model.seqlen = model.config.max_position_embeddings
+    print("Done."); return model
+
+def benchmark(model, input_ids, check=False):  # time token-by-token generation; with check=True also print a perplexity
+    input_ids = input_ids.to(model.gpus[0] if hasattr(model, "gpus") else DEV)
+    torch.cuda.synchronize()
+    cache = {"past": None}
+    # Forward hook factory: after layer i has run, its entry in the cached past is cleared.
+    def clear_past(i):
+        def tmp(layer, inp, out):
+            if cache["past"]:
+                cache["past"][i] = None
+
+        return tmp
+
+    for i, layer in enumerate(model.model.decoder.layers):
+        layer.register_forward_hook(clear_past(i))
+    # NOTE(review): the hooks registered above are never removed -- confirm this is intended.
+    print("Benchmarking ...")
+
+    if check:
+        loss = nn.CrossEntropyLoss()
+        tot = 0.0
+
+    def sync():
+        if hasattr(model, "gpus"):
+            for gpu in model.gpus:
+                torch.cuda.synchronize(gpu)
+        else:
+            torch.cuda.synchronize()
+    # Feed one token per step with the cached key/values and time every step.
+    with torch.no_grad():
+        attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
+        times = []
+        for i in range(input_ids.numel()):
+            tick = time.time()
+            out = model(
+                input_ids[:, i].reshape(-1),
+                past_key_values=cache["past"],
+                attention_mask=attention_mask[:, : (i + 1)].reshape((1, -1)),
+            )
+            sync()
+            times.append(time.time() - tick)
+            print(i, times[-1])
+            if check and i != input_ids.numel() - 1:
+                tot += loss(
+                    out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)
+                ).float()
+            cache["past"] = list(out.past_key_values)
+            del out
+        sync()
+        import numpy as np
+
+        print("Median:", np.median(times))
+        if check:
+            print("PPL:", torch.exp(tot / (input_ids.numel() - 1)).item())
+
+
+def main(args):
+    """Quantize the fine-tuned model's delta, then benchmark, evaluate and save per `args`."""
+    print(args)
+    num_params = 0
+    if args.load:
+        model = load_quant3(args.model, args.load)
+    else:
+        if args.delta and args.wbits < 16:
+            model = get_opt(args.model)
+            model.eval()
+            base_model = get_opt(args.base_model)
+            base_model.eval()
+            dataloader, testloader = get_loaders(
+                args.dataset,
+                nsamples=args.nsamples,
+                seed=args.seed,
+                model=args.model,
+                seqlen=model.seqlen,
+            )
+            original_finetuned_model = copy.deepcopy(model)
+            for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+                finetuned_p.data = (finetuned_p.data - base_p.data).clone()  # keep only the delta
+        else:
+            model = get_opt(args.model)
+            model.eval()
+
+    dataloader, testloader = get_loaders(
+        args.dataset,
+        nsamples=args.nsamples,
+        seed=args.seed,
+        model=args.model,
+        seqlen=model.seqlen,
+    )
+
+    if args.wbits < 16:
+        if args.delta:
+            tick = time.time()
+            quantizers, tuned_params = opt_sequential_delta(
+                original_finetuned_model, model, dataloader, DEV
+            )
+            with open(".cache/tuned_params.json", "w+") as f:
+                json.dump(tuned_params, f)
+            comp_time = time.time() - tick
+        else:
+            raise NotImplementedError  # only the delta path is implemented in this script
+    if args.delta and args.wbits < 16:
+        for idx, (base_p, finetuned_p) in enumerate(
+            zip(base_model.parameters(), model.parameters())
+        ):
+            num_params += torch.numel(finetuned_p.data)
+            finetuned_p.data = (base_p.data + finetuned_p.data).clone()  # restore: base + delta
+    # NOTE(review): runs after the base weights were added back, so `model` is no longer the delta here.
+    if args.save_delta:
+        new_weights, scale = model_packing(model, quantizers, bits=args.wbits)
+        torch.save(
+            {
+                "weight": new_weights,
+                "scale": scale,
+            },
+            f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz",
+        )
+
+    if args.benchmark:
+        model = model.to(DEV)
+        input_ids = next(iter(dataloader))[0][:, : args.benchmark]
+        benchmark(model, input_ids, check=args.check)
+    if args.load:
+        exit()
+
+    dataset = args.dataset
+    dataloader, testloader = get_loaders(
+        dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    ppl = opt_eval(model, testloader, DEV)
+    print(ppl)
+
+    if args.save_hf:
+        if args.delta:
+            hf_path = f"outputs/{args.model.replace('/', '.')}_delta_autotune"
+        else:
+            hf_path = f"outputs/{args.model.replace('/', '.')}_{args.wbits}bits"
+        model.save_pretrained(hf_path)
+        tokenizer = AutoTokenizer.from_pretrained(args.model)
+        tokenizer.save_pretrained(hf_path)
+    elif args.save:  # --save defaults to "": only pack and save when a path was given
+        opt_pack3(model, quantizers)
+        torch.save(model.state_dict(), args.save)
+
+
+if __name__ == "__main__":
+    import argparse
+    from datautils import *  # NOTE(review): presumably supplies get_loaders used by main() - confirm
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="lnair/opt-1.3b-wikitext2",
+        help="OPT model to load; pass `facebook/opt-X`.",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        choices=["wikitext2", "ptb", "c4"],
+        default="wikitext2",
+        help="Where to extract calibration data from.",
+    )
+    parser.add_argument(
+        "--base-model",
+        type=str,
+        default="facebook/opt-1.3b",
+        help="base OPT model to load",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=0, help="Seed for sampling the calibration data."
+    )
+    parser.add_argument(
+        "--nsamples", type=int, default=128, help="Number of calibration data samples."
+    )
+    parser.add_argument(
+        "--percdamp",
+        type=float,
+        default=0.01,
+        help="Percent of the average Hessian diagonal to use for dampening.",
+    )
+    parser.add_argument(
+        "--wbits",
+        type=int,
+        default=2,
+        choices=[2, 3, 4, 16],
+        help="#bits to use for quantization; use 16 for evaluating base model.",
+    )
+    parser.add_argument(
+        "--trits", action="store_true", help="Whether to use trits for quantization."
+    )
+    parser.add_argument(
+        "--groupsize",
+        type=int,
+        default=-1,
+        help="Groupsize to use for quantization; default uses full row.",
+    )
+    parser.add_argument(
+        "--sym", action="store_true", help="Whether to perform symmetric quantization."
+    )
+    parser.add_argument(
+        "--save",
+        type=str,
+        default="",
+        help="Save quantized checkpoint under this name.",
+    )
+    parser.add_argument("--load", type=str, default="", help="Load quantized model.")
+    parser.add_argument(
+        "--benchmark",
+        type=int,
+        default=0,
+        help="Number of tokens to use for benchmarking.",
+    )
+    parser.add_argument(
+        "--check",
+        action="store_true",
+        help="Whether to compute perplexity during benchmarking for verification.",
+    )
+    parser.add_argument(
+        "--new-eval",
+        action="store_true",
+        help="Whether to use the new PTB and C4 eval.",
+    )
+    parser.add_argument(
+        "--faster-kernel",
+        action="store_true",
+        help="Whether to use the new faster kernel for benchmarking.",
+    )
+    parser.add_argument(
+        "--act-order",
+        action="store_true",
+        help="Whether to apply the activation order GPTQ heuristic",
+    )
+    parser.add_argument(
+        "--delta", action="store_true", help="Whether to use delta compression"
+    )
+    parser.add_argument(
+        "--sparsify_hard_threshold", action="store_true", help="Whether to add sparsity"
+    )
+    parser.add_argument(
+        "--save-hf",
+        action="store_true",
+        default=False,
+        help="Whether to save a huggingface model",
+    )
+    parser.add_argument(
+        "--save-delta",
+        action="store_true",
+        help="Whether to pack the quantized weights and save them (with scales) under delta_outputs/",
+    )
+    parser.add_argument(
+        "--fraction_of_zero", type=float, default=0.99, help="Sparsity ratio"
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=0,
+        help="The rank to use for decomposing each matrices",
+    )
+    args = parser.parse_args()
+
+    # Run the whole load / quantize / evaluate / save pipeline with the parsed arguments.
+
+    main(args)
+
+    print("finished.")
diff --git a/pack_utils_test.py b/pack_utils_test.py
index a4130e3..58f1ad0 100644
--- a/pack_utils_test.py
+++ b/pack_utils_test.py
@@ -5,7 +5,7 @@
 from safetensors.torch import save_file
 from opt_delta import hard_threshold
 
-QUANTIZED_BITS = 3
+QUANTIZED_BITS = 4
 
 if __name__=="__main__":
     """
diff --git a/playground.py b/playground.py
index ebeecac..60557c9 100644
--- a/playground.py
+++ b/playground.py
@@ -1,90 +1,20 @@
-import torch
-import numpy as np
-import torch.nn as nn
-
-def quantize(x, scale, zero, maxq):
-    if maxq < 0:
-        return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
-    q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
-    return scale * (q - zero)
-
-class Quantizer(nn.Module):
-
-    def __init__(self, shape=1):
-        super(Quantizer, self).__init__()
-        self.register_buffer('maxq', torch.tensor(0))
-        self.register_buffer('scale', torch.zeros(shape))
-        self.register_buffer('zero', torch.zeros(shape))
-
-    def configure(
-        self,
-        bits, perchannel=False, sym=True, 
-        mse=False, norm=2.4, grid=100, maxshrink=.8,
-    ):
-        self.maxq = torch.tensor(2 ** bits - 1)
-        self.perchannel = perchannel
-        self.sym = sym
-        self.mse = mse
-        self.norm = norm
-        self.grid = grid
-        self.maxshrink = maxshrink 
-
-    def find_params(self, x, weight=False):
-        dev = x.device
-        self.maxq = self.maxq.to(dev)
-        shape = x.shape
-        if self.perchannel:
-            if weight:
-                x = x.flatten(1)
-
-        tmp = torch.zeros(x.shape[0], device=dev)
-        xmin = torch.minimum(x.min(1)[0], tmp)
-        xmax = torch.maximum(x.max(1)[0], tmp)
-
-        if self.sym:
-            xmax = torch.maximum(torch.abs(xmin), xmax)
-            tmp = xmin < 0
-            if torch.any(tmp):
-                xmin[tmp] = -xmax[tmp]
-        tmp = (xmin == 0) & (xmax == 0)
-        xmin[tmp] = -1
-        xmax[tmp] = +1
-
-        if self.maxq < 0:
-          self.scale = xmax
-          self.zero = xmin
-        else:
-          self.scale = (xmax - xmin) / self.maxq
-          if self.sym:
-              self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
-          else:
-              self.zero = torch.round(-xmin / self.scale)
-        
-        if weight:
-            shape = [-1] + [1] * (len(shape) - 1)
-            self.scale = self.scale.reshape(shape)
-            self.zero = self.zero.reshape(shape)
-            return
-
-    def quantize(self, x):
-        if self.ready():
-            return quantize(x, self.scale, self.zero, self.maxq)
-        return x
-
-    def enabled(self):
-        return self.maxq > 0
-
-    def ready(self):
-        return torch.all(self.scale != 0)
-
-q = torch.tensor([[1,2,3,4,5,6,7,8]]).float()
-quantizer = Quantizer()
-quantizer.configure(
-    3, perchannel=True, sym=False, mse=False
-)
-quantizer.find_params(q, weight=True)
-b_q = quantizer.quantize(q)
-# now since b_q is 8 3-bit floats, we can pack them into 3 8-bit integers
-packed_b_q = torch.zeros(3, dtype=torch.uint8)
-for i in range(3):
-    packed_b_q[i] = b_q[0][i*8:(i+1)*8].byte().sum()
+base_floats = 16
+search_space = {
+    "wbits": [2,3,4],
+    "sparsities": [-1, 0.33, 0.5, 0.67, 0.9]
+}
+compression_rates = {}
+for wbit in search_space['wbits']:
+    for sparsity in search_space['sparsities']:
+        if sparsity == -1:
+            sparsity = 0
+        compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity)
+
+compression_rates = sorted(compression_rates.items(), key=lambda x: x[1])
+
+for cr in compression_rates:
+    config = cr[0]
+    print(config)
+    wbit = int(config.split('_')[0].split('.')[1])
+    sparsity = float(config.split('_')[1].replace('sparsity.',''))
+    print(f'wbit: {wbit}, sparsity: {sparsity}, compression rate: {cr[1]}')
\ No newline at end of file
diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh
index f397417..cb93a55 100644
--- a/scripts/gptq_delta.sh
+++ b/scripts/gptq_delta.sh
@@ -1,5 +1,5 @@
 ts -S 8
-CUDA_VISIBLE_DEVICES=0 python opt_delta.py \
+CUDA_VISIBLE_DEVICES=0 python opt_delta_autotune.py \
     --dataset wikitext2 \
     --wbits 2 \
     --delta \
diff --git a/tensorio.py b/tensorio.py
index 26b5fab..46cdd2f 100644
--- a/tensorio.py
+++ b/tensorio.py
@@ -4,6 +4,7 @@
 from safetensors.torch import save_model
 from modelutils import find_layers
 from pack_utils import pack_to_bits
+from compress_utils import compress_flexible_nbits, decompress_flexible_nbits
 
 class TensorIO():
     def __init__(self, format: str, tensors=None) -> None:
@@ -12,6 +13,7 @@ def __init__(self, format: str, tensors=None) -> None:
             self.tensors = {}
         else:
             self.tensors = tensors
+
     def add_tensor(self, idx, tensor):
         tensor = tensor.flatten()
         # assume that the tensor is sparse
@@ -20,7 +22,7 @@ def add_tensor(self, idx, tensor):
         self.tensors[f"{idx}_indices"] = indices
         self.tensors[f"{idx}_values"] = values
         self.tensors[f"{idx}_size"] = torch.tensor(tensor.size())
-    
+
     def to_disk(self, path):
         torch.save(self.tensors, path)
 
@@ -33,7 +35,7 @@ def from_disk(cls, path):
         # restore the tensors
         for key in tensors.keys():
             m = torch.zeros(math.prod(tensors[f"{key}_size"]), dtype=tensors[f'{key}_values'].dtype)
-    
+
             m[tensors[f"{key}_indices"]] = tensors[f"{key}_values"]
             tensors[f"{key}_size"] = tensors[f"{key}_size"].tolist()
             m = m.reshape(tensors[f"{key}_size"])
@@ -41,14 +43,14 @@ def from_disk(cls, path):
             tensors[key] = m
         return cls('sparse', tensors=tensors)
 
-def model_packing(model, quantizers, bits):
+def model_packing(model, quantizers, bits, reformat='none'):  # NOTE(review): `reformat` is unused below
     layers = find_layers(model)
     layers = {n: layers[n] for n in quantizers}
     qlayers = find_layers(model, )
     print('Packing ...')
-    new_weights = {}
     for name in qlayers:
-        quantizers[name] = quantizers[name].cpu()
-        new_weights[name] = pack_to_bits(layers[name].weight.data, quantizers[name], bits, groupsize=-1)
-    print('Done.')
-    return new_weights
\ No newline at end of file
+        if name in quantizers:
+            quantizers[name] = quantizers[name].cpu()
+            x, scale = compress_flexible_nbits(layers[name].weight.data.cuda(), bits)
+    return x, scale  # NOTE(review): x/scale are overwritten per layer; only the last one is returned
+

From dc05d074f0568f13afd92b16e90558cbce9761ab Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Fri, 12 May 2023 15:06:12 +0000
Subject: [PATCH 19/23] auto tuning

---
 gptq.py                  |  20 ++++--
 opt_delta_autotune.py    |  61 +++++++++--------
 replay.py                |   0
 scripts/gptq_delta.sh    |   4 +-
 scripts/playground.ipynb | 143 ++-------------------------------------
 5 files changed, 50 insertions(+), 178 deletions(-)
 create mode 100644 replay.py

diff --git a/gptq.py b/gptq.py
index f57edb0..f798eda 100644
--- a/gptq.py
+++ b/gptq.py
@@ -3,7 +3,7 @@
 import torch
 import transformers
 import torch.nn as nn
-
+from loguru import logger
 from quant import quantize
 
 DEBUG = False 
@@ -16,7 +16,7 @@ def hard_threshold(x, fraction_of_zero=0.1):
     num_params = torch.numel(x)
     thresh_index = int(num_params * fraction_of_zero)
     threshold = y[thresh_index]
-    mask = x.abs().clone().gt(threshold).type(torch.FloatTensor)
+    mask = x.abs().clone().gt(threshold).type(torch.cuda.HalfTensor)
     return mask * x
 
 class GPTQ:
@@ -53,7 +53,7 @@ def add_batch(self, inp, out):
         self.H += inp.matmul(inp.t())
 
     def fasterquant(
-        self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=-1
+        self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=None
     ):
         W = self.layer.weight.data.clone()
         if isinstance(self.layer, nn.Conv2d):
@@ -143,13 +143,19 @@ def fasterquant(
             Q = Q.t()
         # here report the loss of the quantized layer vs. the original layer
         new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype)
-        if sparsity >= 0:
-            sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=sparsity)
-        else:
+        losses = {}
+        if sparsity is None:
             sparsed_new_weight = new_weight
+            losses[0] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
+        else:
+            for s_sity in sparsity:
+                if write:
+                    logger.info(f"HT with: sparsity={s_sity}")
+                sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=s_sity)
+                losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
         if write:
             self.layer.weight.data = sparsed_new_weight
-        return torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
+        return losses
 
     def free(self):
         if DEBUG:
diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py
index db2b8a3..02f6334 100644
--- a/opt_delta_autotune.py
+++ b/opt_delta_autotune.py
@@ -39,16 +39,18 @@ def hard_threshold(x, fraction_of_zero=0.1):
 
 
 @torch.no_grad()
-def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.1):
+def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2):
     search_space = {
         "wbits": [2,3,4],
-        "sparsities": [-1, 0.33, 0.5, 0.67, 0.9]
+        "sparsities": [0.0, 0.33, 0.5, 0.67, 0.9, 0.95]
     }
     base_floats = 16
     compression_rates = {}
     for wbit in search_space['wbits']:
         for sparsity in search_space['sparsities']:
             compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity) if sparsity >=0 else base_floats / wbit
+    compression_rates = sorted(compression_rates.items(), key=lambda x: x[1], reverse=True)
+    
     use_cache = model.config.use_cache
     model.config.use_cache = False
     layers = model.model.decoder.layers
@@ -113,22 +115,20 @@ def forward(self, inp, **kwargs):
             tuned_params[f'{i}_{name}'] = {}
             tuned_configs[f'{i}_{name}'] = {}
             for wbit in search_space['wbits']:
-                for sparsity in search_space['sparsities']:
-                    tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}'] = {
-                        'gptq': GPTQ(subset[name])
-                    }
+                tuned_params[f'{i}_{name}'][f'wbit.{wbit}'] = {
+                    'gptq': GPTQ(subset[name])
+                }
 
-                    tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].quantizer = Quantizer()
+                tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].quantizer = Quantizer()
 
-                    tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].quantizer.configure(
-                        wbit, perchannel=True, sym=args.sym, mse=False, trits=args.trits
-                    )
+                tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].quantizer.configure(
+                    wbit, perchannel=True, sym=args.sym, mse=False, trits=args.trits
+                )
 
         def add_batch(name):
             def tmp(_, inp, out):
                 for wbit in search_space['wbits']:
-                    for sparsity in search_space['sparsities']:
-                        tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].add_batch(inp[0].data, out.data)
+                    tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].add_batch(inp[0].data, out.data)
             return tmp
 
         handles = []
@@ -147,23 +147,23 @@ def tmp(_, inp, out):
         for name in subset:
             logger.info(f"Quantizing {i}.{name} ...")
             for wbit in search_space['wbits']:
-                for sparsity in search_space['sparsities']:
-                    loss=tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].fasterquant(
-                        percdamp=args.percdamp,
-                        groupsize=args.groupsize,
-                        actorder=args.act_order,
-                        write=False
-                    )
-                    tuned_configs[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}'] = {
-                        'loss': loss.item()
+                losses=tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].fasterquant(
+                    percdamp=args.percdamp,
+                    groupsize=args.groupsize,
+                    actorder=args.act_order,
+                    sparsity = search_space['sparsities'],
+                    write=False,
+                )
+                for s_sity in losses.keys():
+                    tuned_configs[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{s_sity}'] = {
+                        'loss': losses[s_sity].item()
                     }
-                    logger.info(f"wbit: {wbit}; sparsity: {sparsity}; loss: {loss}")
+                    logger.info(f"wbit: {wbit}; sparsity: {s_sity}; loss: {losses[s_sity].item()}")
             # within the tol, pick the minimal wbit and maximal sparsity
             best_wbit = None
             best_sparsity = None
             best_loss = None
             # starting from the minimal compression rate
-            compression_rates = sorted(compression_rates.items(), key=lambda x: x[1], reverse=True)
             # loop through all compression rates:
             for cr in compression_rates:
                 config = cr[0]
@@ -180,18 +180,20 @@ def tmp(_, inp, out):
             if best_wbit is None:
                 best_wbit = int(compression_rates[-1][0].split('_')[0].split('.')[1])
                 best_sparsity = float(compression_rates[-1][0].split('_')[1].replace('sparsity.',''))
-            
+            if best_sparsity == -1:
+                best_sparsity = -1
             best_loss = tuned_configs[f'{i}_{name}'][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['loss']
             # redo the actual work
             logger.info(f"Applying wbit={best_wbit}, sparsity={best_sparsity} ...")
-            tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['gptq'].fasterquant(
+            tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}']['gptq'].fasterquant(
                 percdamp=args.percdamp,
                 groupsize=args.groupsize,
                 actorder=args.act_order,
-                write=True
+                write=True,
+                sparsity = [best_sparsity],
             )
-            quantizers["model.decoder.layers.%d.%s" % (i, name)] = tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}_sparsity.{sparsity}']['gptq'].quantizer
-            tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['gptq'].free()
+            quantizers["model.decoder.layers.%d.%s" % (i, name)] = tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}']['gptq'].quantizer
+            tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}']['gptq'].free()
             tuned_configs[f'{i}_{name}']['choice'] = {
                 'best_wbit': best_wbit,
                 'best_sparsity': best_sparsity,
@@ -208,8 +210,7 @@ def tmp(_, inp, out):
         for key in tuned_params.keys():
             if key.startswith(f'{i}_'):
                 for wbit in search_space['wbits']:
-                    for sparsity in search_space['sparsities']:
-                        del tuned_params[key][f'wbit.{wbit}_sparsity.{sparsity}']['gptq']
+                    del tuned_params[key][f'wbit.{wbit}']['gptq']
         torch.cuda.empty_cache()
 
         inps, outs = original_outs, inps
diff --git a/replay.py b/replay.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh
index cb93a55..0795956 100644
--- a/scripts/gptq_delta.sh
+++ b/scripts/gptq_delta.sh
@@ -2,9 +2,9 @@ ts -S 8
 CUDA_VISIBLE_DEVICES=0 python opt_delta_autotune.py \
     --dataset wikitext2 \
     --wbits 2 \
+    --base-model facebook/opt-2.7b \
+    --model lnair/opt-2.7b-wikitext2 \
     --delta \
-    --sparsify_hard_threshold \
-    --fraction_of_zero 0.95 \
     --save-delta \
     --save-hf \
     --groupsize 1024
diff --git a/scripts/playground.ipynb b/scripts/playground.ipynb
index 9939233..1751996 100644
--- a/scripts/playground.ipynb
+++ b/scripts/playground.ipynb
@@ -1,150 +1,15 @@
 {
  "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sys\n",
-    "seed=42\n",
-    "target_model_name = \"lnair/opt-1.3b-wikitext2\"\n",
-    "base_model_name = \"facebook/opt-1.3b\"\n",
-    "n_samples = 128\n",
-    "dataset = 'wikitext2'\n",
-    "sys.path.append('..')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/xiayao/miniconda3/envs/fmzip/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "OPTForCausalLM(\n",
-       "  (model): OPTModel(\n",
-       "    (decoder): OPTDecoder(\n",
-       "      (embed_tokens): Embedding(50272, 2048, padding_idx=1)\n",
-       "      (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)\n",
-       "      (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-       "      (layers): ModuleList(\n",
-       "        (0-23): 24 x OPTDecoderLayer(\n",
-       "          (self_attn): OPTAttention(\n",
-       "            (k_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "            (v_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "            (q_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "            (out_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-       "          )\n",
-       "          (activation_fn): ReLU()\n",
-       "          (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-       "          (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n",
-       "          (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n",
-       "          (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "  )\n",
-       "  (lm_head): Linear(in_features=2048, out_features=50272, bias=False)\n",
-       ")"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from modelutils import get_opt\n",
-    "base_model = get_opt(base_model_name)\n",
-    "target_model = get_opt(target_model_name)\n",
-    "base_model.to('cuda')\n",
-    "target_model.to('cuda')\n",
-    "base_model.eval()\n",
-    "target_model.eval()"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Found cached dataset wikitext (/home/xiayao/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n",
-      "Found cached dataset wikitext (/home/xiayao/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n"
-     ]
-    }
-   ],
-   "source": [
-    "from datautils import get_loaders\n",
-    "trainloader, loader_enc = get_loaders(\n",
-    "    dataset,\n",
-    "    nsamples = n_samples,\n",
-    "    seed=seed,\n",
-    "    model=target_model_name,\n",
-    "    seqlen=base_model.seqlen,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from cli import quantize_with_lowrank\n",
-    "r_quantizer, l_quantizer, lr_tensors = quantize_with_lowrank(\n",
-    "    base_model,\n",
-    "    target_model,\n",
-    "    trainloader,\n",
-    "    32\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
-     ]
-    }
-   ],
    "source": [
-    "import torch\n",
-    "from safetensors import safe_open\n",
-    "from safetensors.torch import save_file\n",
-    "\n",
-    "# iterate over all keys in lr_tensors\n",
-    "for k in lr_tensors.keys():\n",
-    "    lr_tensors[k] = lr_tensors[k].contiguous() # make sure they are contiguous\n",
-    "# save them to a file\n",
-    "\n",
-    "save_file(lr_tensors, \"model.safetensors\")"
+    "import json\n",
+    "with open(\"../.cache/tuned_params.json\", \"r\") as fp:\n",
+    "    data = json.load(fp)\n",
+    "print(data.keys())"
    ]
   }
  ],

From 7ce5f6cc625cbca0b8759e79124fdaea6e739528 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Sat, 13 May 2023 18:14:36 +0000
Subject: [PATCH 20/23] packing, bitmap, etc....

---
 .gitignore                              |  3 +-
 docs/number.md                          | 20 ++++++
 gptq.py                                 | 20 ++++--
 opt_delta_autotune.py                   | 21 +++---
 playground.py                           | 90 ++++++++++++++++++++-----
 scripts/gptq_delta.sh                   | 69 ++-----------------
 scripts/playground.ipynb                |  7 +-
 submit.py                               | 24 ++-----
 utilities/analyze.py                    | 16 +++++
 utilities/compression_rate_estimator.py | 53 +++++++++++++++
 utilities/tuning_analyser.py            |  6 ++
 11 files changed, 208 insertions(+), 121 deletions(-)
 create mode 100644 utilities/analyze.py
 create mode 100644 utilities/compression_rate_estimator.py
 create mode 100644 utilities/tuning_analyser.py

diff --git a/.gitignore b/.gitignore
index 87c07aa..f73b20e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ outputs/
 outputs_past/
 packed_delta
 .cache
-delta_outputs/
\ No newline at end of file
+delta_outputs/
+.io/
\ No newline at end of file
diff --git a/docs/number.md b/docs/number.md
index e69de29..ec5bac9 100644
--- a/docs/number.md
+++ b/docs/number.md
@@ -0,0 +1,20 @@
+In theory:
+    Take a 2048 * 2048 matrix in which 10% of the elements are non-zero. At 16 bits per element, the original size is 2048 * 2048 * 16 = 16 * 4M bits.
+
+    Storing the indices of the non-zero elements takes 2048 * 2048 * 10% * log2(2048 * 2048) ~= 2.2 * 4M bits.
+
+    Considering indices only, we achieve 16 / 2.2 ~= 7.3x compression ratio
+
+In practice:
+    Saving a 2048 * 2048 matrix in which 10% of the elements are non-zero takes 8M bytes on disk (with torch.save).
+
+    Saving packed indices takes 1.9M on disk, achieving 17 / 1.9 ~= 8.9x compression ratio.
+
+    With zip, the packed indices take 1.1M on disk, achieving a 17 / 1.1 ~= 15.5x compression ratio.
+
+
+256 x 256 -> 64k fp 16 -> 128k on disk ok.
+log2(2048 * 2048) = 22 bits per index, so each index is stored as 3 int8 values.
+
+0.4M indices * 3 bytes = 1.2M bytes
+"""
\ No newline at end of file
diff --git a/gptq.py b/gptq.py
index f798eda..34e987d 100644
--- a/gptq.py
+++ b/gptq.py
@@ -6,12 +6,14 @@
 from loguru import logger
 from quant import quantize
 
-DEBUG = False 
+DEBUG = False
 
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False
 
 def hard_threshold(x, fraction_of_zero=0.1):
+    if fraction_of_zero == 0:
+        return x
     y, _ = torch.sort(x.view(-1).abs().clone())
     num_params = torch.numel(x)
     thresh_index = int(num_params * fraction_of_zero)
@@ -124,15 +126,16 @@ def fasterquant(
             W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
 
             if DEBUG:
-                self.layer.weight.data[:, :i2] = Q[:, :i2]
-                self.layer.weight.data[:, i2:] = W[:, i2:]
-                print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
-                print(torch.sum(Losses))
+                pass
+                #self.layer.weight.data[:, :i2] = Q[:, :i2]
+                #self.layer.weight.data[:, i2:] = W[:, i2:]
+                #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
+                #print(torch.sum(Losses))
 
         torch.cuda.synchronize()
         total_time = time.time() - tick
         # print('time %.2f' % total_time)
-        error = torch.sum(Losses).item()
+        # error = torch.sum(Losses).item()
         # print('error', error)
 
         if actorder:
@@ -153,6 +156,11 @@ def fasterquant(
                     logger.info(f"HT with: sparsity={s_sity}")
                 sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=s_sity)
                 losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
+                if losses[s_sity] > 100:
+                    logger.info(f"{sparsed_new_weight}")
+                    logger.info(f"{new_weight}")
+                    logger.info(f"{sparsed_new_weight.shape}")
+                    logger.info(f"{torch.max(torch.abs(self.inp1 @ (sparsed_new_weight.T) - self.out1))}")
         if write:
             self.layer.weight.data = sparsed_new_weight
         return losses
diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py
index 02f6334..4e2911e 100644
--- a/opt_delta_autotune.py
+++ b/opt_delta_autotune.py
@@ -37,7 +37,6 @@ def hard_threshold(x, fraction_of_zero=0.1):
     mask = x.abs().clone().gt(threshold).type(torch.FloatTensor)
     return mask * x
 
-
 @torch.no_grad()
 def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2):
     search_space = {
@@ -48,7 +47,7 @@ def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2):
     compression_rates = {}
     for wbit in search_space['wbits']:
         for sparsity in search_space['sparsities']:
-            compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity) if sparsity >=0 else base_floats / wbit
+            compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity)
     compression_rates = sorted(compression_rates.items(), key=lambda x: x[1], reverse=True)
     
     use_cache = model.config.use_cache
@@ -163,7 +162,7 @@ def tmp(_, inp, out):
             best_wbit = None
             best_sparsity = None
             best_loss = None
-            # starting from the minimal compression rate
+            # starting from the maximal compression rate
             # loop through all compression rates:
             for cr in compression_rates:
                 config = cr[0]
@@ -183,7 +182,7 @@ def tmp(_, inp, out):
             if best_sparsity == -1:
                 best_sparsity = -1
             best_loss = tuned_configs[f'{i}_{name}'][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['loss']
-            # redo the actual work
+            # redo the actual work, and write to the layer
             logger.info(f"Applying wbit={best_wbit}, sparsity={best_sparsity} ...")
             tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}']['gptq'].fasterquant(
                 percdamp=args.percdamp,
@@ -456,9 +455,9 @@ def main(args):
         if args.delta:
             tick = time.time()
             quantizers, tuned_params = opt_sequential_delta(
-                original_finetuned_model, model, dataloader, DEV
+                original_finetuned_model, model, dataloader, DEV, args.tol
             )
-            with open(".cache/tuned_params.json", "w+") as f:
+            with open(f".cache/{args.model.replace('/', '.')}_delta_tol={args.tol}.json", "w+") as f:
                 json.dump(tuned_params, f)
             comp_time = time.time() - tick
         else:
@@ -498,9 +497,9 @@ def main(args):
 
     if args.save_hf:
         if args.delta:
-            hf_path = f"outputs/{args.model.replace('/', '.')}_delta_autotune"
+            hf_path = f"outputs/{args.model.replace('/', '.')}_delta_autotune_tol={args.tol}"
         else:
-            hf_path = f"outputs/{args.model.replace('/', '.')}_{args.wbits}bits"
+            hf_path = f"outputs/{args.model.replace('/', '.')}_autotuned_tol={args.tol}"
         model.save_pretrained(hf_path)
         tokenizer = AutoTokenizer.from_pretrained(args.model)
         tokenizer.save_pretrained(hf_path)
@@ -534,6 +533,12 @@ def main(args):
         default="facebook/opt-1.3b",
         help="base OPT model to load",
     )
+    parser.add_argument(
+        "--tol",
+        type=float,
+        default=0.2,
+        help="Tolerance of the loss per layer",
+    )
     parser.add_argument(
         "--seed", type=int, default=0, help="Seed for sampling the calibration data."
     )
diff --git a/playground.py b/playground.py
index 60557c9..3b15c6d 100644
--- a/playground.py
+++ b/playground.py
@@ -1,20 +1,74 @@
-base_floats = 16
-search_space = {
-    "wbits": [2,3,4],
-    "sparsities": [-1, 0.33, 0.5, 0.67, 0.9]
-}
-compression_rates = {}
-for wbit in search_space['wbits']:
-    for sparsity in search_space['sparsities']:
-        if sparsity == -1:
-            sparsity = 0
-        compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity)
+import math
+import torch
+import numpy as np
+import torchvision.transforms as T
 
-compression_rates = sorted(compression_rates.items(), key=lambda x: x[1])
+def bin_array(num, m):
+    """Convert a positive integer num into an m-bit bit vector"""
+    return np.array(list(np.binary_repr(num).zfill(m))).astype(np.int8)
 
-for cr in compression_rates:
-    config = cr[0]
-    print(config)
-    wbit = int(config.split('_')[0].split('.')[1])
-    sparsity = float(config.split('_')[1].replace('sparsity.',''))
-    print(f'wbit: {wbit}, sparsity: {sparsity}, compression rate: {cr[1]}')
\ No newline at end of file
+def hard_threshold(x, fraction_of_zero=0.1):
+    if fraction_of_zero == 0:
+        return x
+    y, _ = torch.sort(x.view(-1).abs().clone())
+    num_params = torch.numel(x)
+    thresh_index = int(num_params * fraction_of_zero)
+    threshold = y[thresh_index]
+    mask = x.abs().clone().gt(threshold).type(torch.FloatTensor)
+    transform = T.ToPILImage()
+
+    # convert the tensor to PIL image using above transform
+    binmask = transform(mask)
+    binmask = binmask.convert('1')
+    binmask.save('.io/binmask.bmp')
+    return mask * x
+
+def packing_indices(x):
+    matrix_size = x.shape[0] * x.shape[1]
+    y = torch.zeros(x.shape)
+    y = y.flatten()
+    # find indices of non-zero elements
+    x = x.clone().flatten()
+    indices = torch.nonzero(x)
+    # assume the number of matrix elements is a power of 2
+    bit_width = int(math.log2(matrix_size))
+    # build a torch tensor of 0/1 bits (int8), one row per index
+    indices_binary = torch.tensor(np.array([bin_array(i, bit_width) for i in indices]))
+    packed_indices = torch.tensor(np.packbits(indices_binary, axis=1), dtype=torch.uint8)
+    return packed_indices
+
+def unpacking_indices(packed_indices):
+    # unpack with numpy
+    unpacked_indices = np.unpackbits(packed_indices, axis=1)
+    # convert bits back to indices
+    unpacked_indices = torch.tensor(np.array([int("".join(map(str, i)), 2) for i in unpacked_indices]))
+    return unpacked_indices
+
+def compression_rate_calc(msize, wbit, sparsity):
+    original_bit_used = msize * 16
+    nonzeros = msize * sparsity
+    to_store_value = nonzeros * wbit
+    to_store_index = nonzeros * math.log2(msize)
+    print("original_bit_used: ", original_bit_used)
+    print("to_store_value: ", to_store_value)
+    print("to_store_index: ", to_store_index)
+    print("compression rate: ", original_bit_used / (to_store_value + to_store_index))
+    return original_bit_used, to_store_value, to_store_index
+
+if __name__=="__main__":
+    base_floats = 16
+    wbits = 3
+    m_size = 2048
+    nonsparsity = 0.9
+    x = torch.randn((m_size, m_size), dtype=torch.float16)
+    torch.save(x, ".io/x.pt")
+    x = hard_threshold(x, nonsparsity)
+    # 10% x 4M indices -> 800k on disk
+    packed_indices = packing_indices(x)
+    print(packed_indices.shape)
+    print(packed_indices.shape)
+    print(packed_indices.dtype)
+    torch.save(packed_indices, ".io/packed_indices.pt")
+    unpacked_indices = unpacking_indices(packed_indices)
+    
+    compression_rate_calc(2048*2048, 3, 0.1)
\ No newline at end of file
diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh
index 0795956..07e84be 100644
--- a/scripts/gptq_delta.sh
+++ b/scripts/gptq_delta.sh
@@ -1,69 +1,10 @@
-ts -S 8
-CUDA_VISIBLE_DEVICES=0 python opt_delta_autotune.py \
+python opt_delta_autotune.py \
     --dataset wikitext2 \
-    --wbits 2 \
-    --base-model facebook/opt-2.7b \
-    --model lnair/opt-2.7b-wikitext2 \
+    --base-model facebook/opt-350m \
+    --model lnair/opt-350m-wikitext2 \
     --delta \
+    --wbits 2 \
+    --tol 2 \
     --save-delta \
     --save-hf \
     --groupsize 1024
-
-# CUDA_VISIBLE_DEVICES=1 python opt_delta.py \
-#     --dataset wikitext2 \
-#     --wbits 3 \
-#     --delta \
-#     --sparsify_hard_threshold \
-#     --fraction_of_zero 0.95 \
-#     --save-hf \
-#     --groupsize 1024 &
-
-# CUDA_VISIBLE_DEVICES=2 python opt_delta.py \
-#     --dataset wikitext2 \
-#     --wbits 4 \
-#     --delta \
-#     --sparsify_hard_threshold \
-#     --fraction_of_zero 0.95 \
-#     --save-hf \
-#     --groupsize 1024 &
-
-# CUDA_VISIBLE_DEVICES=3 python opt_delta.py \
-#     --dataset wikitext2 \
-#     --wbits 2 \
-#     --delta \
-#     --sparsify_hard_threshold \
-#     --fraction_of_zero 0.99 \
-#     --save-hf \
-#     --groupsize 1024 &
-
-# CUDA_VISIBLE_DEVICES=4 python opt_delta.py \
-#     --dataset wikitext2 \
-#     --wbits 3 \
-#     --delta \
-#     --sparsify_hard_threshold \
-#     --fraction_of_zero 0.99 \
-#     --save-hf \
-#     --groupsize 1024 &
-
-# CUDA_VISIBLE_DEVICES=5 python opt_delta.py \
-#     --dataset wikitext2 \
-#     --wbits 4 \
-#     --delta \
-#     --sparsify_hard_threshold \
-#     --fraction_of_zero 0.99 \
-#     --save-hf \
-#     --groupsize 1024 &
-
-# CUDA_VISIBLE_DEVICES=6 python opt_delta.py \
-#     --dataset wikitext2 \
-#     --wbits 3 \
-#     --delta \
-#     --save-hf \
-#     --groupsize 1024 &
-
-# CUDA_VISIBLE_DEVICES=7 python opt_delta.py \
-#     --dataset wikitext2 \
-#     --wbits 4 \
-#     --delta \
-#     --save-hf \
-#     --groupsize 1024 &
\ No newline at end of file
diff --git a/scripts/playground.ipynb b/scripts/playground.ipynb
index 1751996..86c66d6 100644
--- a/scripts/playground.ipynb
+++ b/scripts/playground.ipynb
@@ -5,12 +5,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "import json\n",
-    "with open(\"../.cache/tuned_params.json\", \"r\") as fp:\n",
-    "    data = json.load(fp)\n",
-    "print(data.keys())"
-   ]
+   "source": []
   }
  ],
  "metadata": {
diff --git a/submit.py b/submit.py
index ce84f1f..1830099 100644
--- a/submit.py
+++ b/submit.py
@@ -1,30 +1,18 @@
 import os
 model_relations = {
-    # 'facebook/opt-350m': ['lnair/opt-350m-wikitext2'],
+    #'facebook/opt-350m': ['lnair/opt-350m-wikitext2'],
     # 'facebook/opt-1.3b': ['lnair/opt-1.3b-wikitext2'],
     # 'facebook/opt-2.7b': ['lnair/opt-2.7b-wikitext2'],
     'facebook/opt-6.7b': ['mit-han-lab/opt-6.7b-smoothquant'],
-    # 'facebook/opt-13b': ['KoboldAI/OPT-13B-Erebus'],
-    # 'facebook/opt-30b': ['KoboldAI/OPT-30B-Erebus']
-    # 'facebook/opt-1.3b': ['facebook/opt-iml-1.3b', 'facebook/opt-iml-max-1.3b', 'mit-han-lab/opt-1.3b-smoothquant', 'pszemraj/opt-peter-1.3B', 'opentensor/bt-opt-1.3b']
+    'facebook/opt-1.3b': ['facebook/opt-iml-1.3b', 'facebook/opt-iml-max-1.3b', 'mit-han-lab/opt-1.3b-smoothquant', 'pszemraj/opt-peter-1.3B', 'opentensor/bt-opt-1.3b']
 }
 
-wbits_settings = [2,3,4]
+tols = [4.5, 5, 6.5, 8.0]
 
-sparsity_settings = [0, 0.95, 0.99]
-os.system("ts -S 8")
-for model in model_relations.keys():
-    for target_model in model_relations[model]:
-        for wbits in wbits_settings:
-            for sparsity in sparsity_settings:
-                if sparsity == 0:
-                    cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --delta --wbits {wbits} --model {target_model} --base-model {model} --save-hf --groupsize 1024"
-                else:
-                    cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --delta --wbits {wbits} --model {target_model} --base-model {model} --sparsify_hard_threshold --fraction_of_zero {sparsity} --save-hf --groupsize 1024"
-                os.system(cmd)
+os.system("ts -S 7")
 
 for model in model_relations.keys():
     for target_model in model_relations[model]:
-        for wbits in wbits_settings:
-            cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --model {target_model} --base-model {model} --save-hf --groupsize 1024"
+        for tol in tols:
+            cmd = f"TS_VISIBLE_DEVICES=0,2,3,4,5,6,7 ts --gpus 1 python opt_delta_autotune.py --dataset wikitext2 --delta --tol {tol} --model {target_model} --base-model {model} --save-hf --groupsize 1024"
             os.system(cmd)
\ No newline at end of file
diff --git a/utilities/analyze.py b/utilities/analyze.py
new file mode 100644
index 0000000..57c69d1
--- /dev/null
+++ b/utilities/analyze.py
@@ -0,0 +1,16 @@
+import json
+import matplotlib.pyplot as plt
+
+with open(".cache/lnair.opt-350m-wikitext2_delta_tol=2.0.json", "r") as fp:
+    data = json.load(fp)
+
+all_best_losses = []
+for layer_name in data.keys():
+    best_loss = data[layer_name]['choice']['best_loss']
+    all_best_losses.append(best_loss)
+    if (best_loss > 100):
+        print(f"{layer_name} large loss!")
+print(all_best_losses)
+# plot a histogram of the best losses
+plt.hist(all_best_losses, bins=100)
+plt.savefig('.cache/lnair.opt-350m-wikitext2_delta_tol=2.0.png')
\ No newline at end of file
diff --git a/utilities/compression_rate_estimator.py b/utilities/compression_rate_estimator.py
new file mode 100644
index 0000000..90adc48
--- /dev/null
+++ b/utilities/compression_rate_estimator.py
@@ -0,0 +1,53 @@
+import json
+import math
+from modelutils import get_opt, find_layers
+from compression_scripts.model_utils import get_opt, find_layers
+
+
+base_floats = 16
+
+
+base_floats = 16
+
+def calc_compression(path: str, base_model: str):
+    base_model = get_opt(base_model)
+    with open(path, "r") as f:
+        data = json.load(f)
+
+    base_layers = base_model.model.decoder.layers
+
+    total_original_bits = 0
+    total_used_bits = 0
+    sparsity_lists = []
+    total_stats = {}
+    for i in range(len(base_layers)):
+        layer = base_layers[i]
+        subset = find_layers(layer)
+        for name in subset:
+            original_weight = subset[name].weight.data
+            original_weight_count = original_weight.numel()
+            total_original_bits += original_weight_count * base_floats
+            if f"{i}_{name}" in data:
+                config = data[f"{i}_{name}"]["choice"]
+                # save them as indices + values pair
+                nonzeros = (1-config["best_sparsity"]) * original_weight_count
+                # to store values
+                used_bits = nonzeros * config["best_wbit"]
+                # to store indices
+                used_bits += nonzeros * 2 * math.log2(original_weight_count) * 8
+
+                sparsity_lists.append(config["best_sparsity"])
+
+                total_used_bits += used_bits
+            else:
+                raise ValueError(f"Layer {i}_{name} not found in {path}")
+
+    total_stats['compresion_rate'] = total_original_bits / total_used_bits
+    total_stats['sparsity'] = sum(sparsity_lists) / len(sparsity_lists)
+    return total_stats
+
+if __name__=="__main__":
+    path = ".cache/lnair.opt-1.3b-wikitext2_delta_tol=1.0.json"
+    base_model = "facebook/opt-1.3b"
+    stats = calc_compression(path, base_model)
+    print(stats)
\ No newline at end of file
diff --git a/utilities/tuning_analyser.py b/utilities/tuning_analyser.py
new file mode 100644
index 0000000..a7a658b
--- /dev/null
+++ b/utilities/tuning_analyser.py
@@ -0,0 +1,6 @@
+import json
+path = ".cache/lnair.opt-350m-wikitext2_delta_tol=0.2.json"
+
+with open(path, "r") as fp:
+    data = json.load(fp)
+

From 07739c4dbd73e8cff9926b2965d239256463b4aa Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Sun, 14 May 2023 02:13:08 +0000
Subject: [PATCH 21/23] updates on autotuning

---
 autotune_gptq.py        | 175 +++++++++++
 compress_utils.py       |   1 +
 gptj_delta_autotuned.py | 624 ++++++++++++++++++++++++++++++++++++++++
 gptq.py                 |  15 +-
 opt_delta.py            |   1 -
 opt_delta_autotune.py   | 138 +++------
 pack_utils_test.py      |  36 +--
 quant.py                |  10 +-
 tensorio.py             |   3 +-
 9 files changed, 865 insertions(+), 138 deletions(-)
 create mode 100644 autotune_gptq.py
 create mode 100644 gptj_delta_autotuned.py

diff --git a/autotune_gptq.py b/autotune_gptq.py
new file mode 100644
index 0000000..34e987d
--- /dev/null
+++ b/autotune_gptq.py
@@ -0,0 +1,175 @@
+import math
+import time
+import torch
+import transformers
+import torch.nn as nn
+from loguru import logger
+from quant import quantize
+
+DEBUG = False
+
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cudnn.allow_tf32 = False
+
+def hard_threshold(x, fraction_of_zero=0.1):
+    if fraction_of_zero == 0:
+        return x
+    y, _ = torch.sort(x.view(-1).abs().clone())
+    num_params = torch.numel(x)
+    thresh_index = int(num_params * fraction_of_zero)
+    threshold = y[thresh_index]
+    mask = x.abs().clone().gt(threshold).type(torch.cuda.HalfTensor)
+    return mask * x
+
+class GPTQ:
+    def __init__(self, layer):
+        self.layer = layer
+        self.original_weight = layer.weight.data.clone()
+        self.dev = self.layer.weight.device
+        W = layer.weight.data.clone()
+        if isinstance(self.layer, nn.Conv2d):
+            W = W.flatten(1)
+        if isinstance(self.layer, transformers.Conv1D):
+            W = W.t()
+        self.rows = W.shape[0]
+        self.columns = W.shape[1]
+        self.H = torch.zeros((self.columns, self.columns), device=self.dev)
+        self.nsamples = 0
+
+    def add_batch(self, inp, out):
+        self.inp1 = inp
+        self.out1 = out
+        if len(inp.shape) == 2:
+            inp = inp.unsqueeze(0)
+        tmp = inp.shape[0]
+        if isinstance(self.layer, nn.Linear):
+            if len(inp.shape) == 3:
+                inp = inp.reshape((-1, inp.shape[-1]))
+            inp = inp.t()
+        
+        self.H *= self.nsamples / (self.nsamples + tmp)
+        self.nsamples += tmp
+        # inp = inp.float()
+        inp = math.sqrt(2 / self.nsamples) * inp.float()
+        # self.H += 2 / self.nsamples * inp.matmul(inp.t())
+        self.H += inp.matmul(inp.t())
+
+    def fasterquant(
+        self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=None
+    ):
+        W = self.layer.weight.data.clone()
+        if isinstance(self.layer, nn.Conv2d):
+            W = W.flatten(1)
+        if isinstance(self.layer, transformers.Conv1D):
+            W = W.t()
+        W = W.float()
+
+        tick = time.time()
+
+        if not self.quantizer.ready():
+            self.quantizer.find_params(W, weight=True)
+
+        H = self.H
+        if write:
+            del self.H
+        dead = torch.diag(H) == 0
+        H[dead, dead] = 1
+        W[:, dead] = 0
+
+        if actorder:
+            perm = torch.argsort(torch.diag(H), descending=True)
+            W = W[:, perm]
+            H = H[perm][:, perm]
+
+        Losses = torch.zeros_like(W)
+        Q = torch.zeros_like(W)
+
+        damp = percdamp * torch.mean(torch.diag(H))
+        diag = torch.arange(self.columns, device=self.dev)
+        H[diag, diag] += damp
+        H = torch.linalg.cholesky(H)
+        H = torch.cholesky_inverse(H)
+        H = torch.linalg.cholesky(H, upper=True)
+        Hinv = H
+
+        for i1 in range(0, self.columns, blocksize):
+            i2 = min(i1 + blocksize, self.columns)
+            count = i2 - i1
+
+            W1 = W[:, i1:i2].clone()
+            Q1 = torch.zeros_like(W1)
+            Err1 = torch.zeros_like(W1)
+            Losses1 = torch.zeros_like(W1)
+            Hinv1 = Hinv[i1:i2, i1:i2]
+
+            for i in range(count):
+                w = W1[:, i]
+                d = Hinv1[i, i]
+
+                if groupsize != -1:
+                    if (i1 + i) % groupsize == 0:
+                        self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True)
+
+                q = quantize(
+                    w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq
+                ).flatten()
+                Q1[:, i] = q
+                Losses1[:, i] = (w - q) ** 2 / d ** 2
+
+                err1 = (w - q) / d
+                W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
+                Err1[:, i] = err1
+
+            Q[:, i1:i2] = Q1
+            Losses[:, i1:i2] = Losses1 / 2
+
+            W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
+
+            if DEBUG:
+                pass
+                #self.layer.weight.data[:, :i2] = Q[:, :i2]
+                #self.layer.weight.data[:, i2:] = W[:, i2:]
+                #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
+                #print(torch.sum(Losses))
+
+        torch.cuda.synchronize()
+        total_time = time.time() - tick
+        # print('time %.2f' % total_time)
+        # error = torch.sum(Losses).item()
+        # print('error', error)
+
+        if actorder:
+            invperm = torch.argsort(perm)
+            Q = Q[:, invperm]
+
+        if isinstance(self.layer, transformers.Conv1D):
+            Q = Q.t()
+        # here report the loss of the quantized layer vs. the original layer
+        new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype)
+        losses = {}
+        if sparsity is None:
+            sparsed_new_weight = new_weight
+            losses[0] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
+        else:
+            for s_sity in sparsity:
+                if write:
+                    logger.info(f"HT with: sparsity={s_sity}")
+                sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=s_sity)
+                losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
+                if losses[s_sity] > 100:
+                    logger.info(f"{sparsed_new_weight}")
+                    logger.info(f"{new_weight}")
+                    logger.info(f"{sparsed_new_weight.shape}")
+                    logger.info(f"{torch.max(torch.abs(self.inp1 @ (sparsed_new_weight.T) - self.out1))}")
+        if write:
+            self.layer.weight.data = sparsed_new_weight
+        return losses
+
+    def free(self):
+        if DEBUG:
+            self.inp1 = None
+            self.out1 = None
+        self.H = None
+        self.Losses = None
+        self.Trace = None
+        torch.cuda.empty_cache()
\ No newline at end of file
diff --git a/compress_utils.py b/compress_utils.py
index 20e20f1..341704f 100644
--- a/compress_utils.py
+++ b/compress_utils.py
@@ -4,6 +4,7 @@
 import numpy as np
 from torch.utils.dlpack import to_dlpack, from_dlpack
 from quant import Quantizer
+
 def cupy_to_tensor(x):
     return from_dlpack(x.toDlpack())
 
diff --git a/gptj_delta_autotuned.py b/gptj_delta_autotuned.py
new file mode 100644
index 0000000..bd883b9
--- /dev/null
+++ b/gptj_delta_autotuned.py
@@ -0,0 +1,624 @@
+
+import time
+import math
+
+import torch
+import torch.nn as nn
+import transformers
+
+from gptq import *
+from modelutils import *
+from quant import *
+import os
+import copy
+
+def get_gptj(model):
+    import torch
+    def skip(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    from transformers import GPTJForCausalLM
+    model = GPTJForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
+    model.seqlen = model.config.max_position_embeddings
+    print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad))
+    return model
+
+@torch.no_grad()
+def gptj_sequential(model, dataloader, dev, means=None, stds=None):
+    print('Starting ...')
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    #print(model.transformer.h)
+    layers = model.transformer.h
+    print(layers)
+    
+    model.transformer.wte = model.transformer.wte.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers = model.transformer.h
+    layers[0] = layers[0].cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
+    model.transformer.ln_f = model.transformer.ln_f.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+
+    print('Ready.')
+
+    quantizers = {}
+    for i in range(len(layers)):
+        layer = layers[i].to(dev)
+
+        subset = find_layers(layer)
+        gptq = {}
+        for name in subset:
+            gptq[name] = GPTQ(subset[name])
+            gptq[name].quantizer = Quantizer()
+            gptq[name].quantizer.configure(
+                args.wbits, perchannel=True, sym=False, mse=False
+            )
+        
+        def add_batch(name):
+            def tmp(_, inp, out):
+                gptq[name].add_batch(inp[0].data, out.data)
+            return tmp
+        handles = []
+        for name in subset:
+            handles.append(subset[name].register_forward_hook(add_batch(name)))
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        for h in handles:
+            h.remove()
+
+        for name in subset:
+            print(i, name)
+            print('Quantizing ...')
+            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+
+        layers[i] = layer.cpu()
+        del layer
+        del gptq
+        torch.cuda.empty_cache()
+
+        inps, outs = outs, inps
+
+    model.config.use_cache = use_cache
+
+    return quantizers
+
+@torch.no_grad()
+def gptj_sequential_delta(model, delta_model, dataloader, dev):
+    print('Starting ...')
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.transformer.h
+    delta_layers = delta_model.transformer.h
+
+    model.transformer.wte = model.transformer.wte.to(dev) 
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    original_outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+
+    print('Ready.')
+
+    quantizers = {}
+    for i in range(len(delta_layers)):
+        layer = delta_layers[i].to(dev)
+        original_layer = layers[i].to(dev)
+
+        subset = find_layers(layer)
+        gptq = {}
+        for name in subset:
+            gptq[name] = GPTQ(subset[name])
+            gptq[name].quantizer = Quantizer()
+            gptq[name].quantizer.configure(
+                args.wbits, perchannel=True, sym=args.sym, mse=False
+            )
+
+        def add_batch(name):
+            def tmp(_, inp, out):
+                gptq[name].add_batch(inp[0].data, out.data)
+            return tmp
+        handles = []
+        for name in subset:
+            handles.append(subset[name].register_forward_hook(add_batch(name)))
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        for h in handles:
+            h.remove()
+
+        for name in subset:
+            print(i, name)
+            print('Quantizing ...')
+            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
+            quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer
+            gptq[name].free()
+        
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+
+        layers[i] = layer.cpu()
+        del layer
+        del gptq 
+        torch.cuda.empty_cache()
+
+        inps, outs = original_outs, inps
+
+    model.config.use_cache = use_cache
+
+    return quantizers
+
+@torch.no_grad()
+def gptj_eval(model, testenc, dev):
+    print('Evaluating ...')
+
+    testenc = testenc.input_ids
+    nsamples = testenc.numel() // model.seqlen
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    # print(model.transformer.h)
+    layers = model.transformer.h
+    print(layers)
+    
+    model.transformer.wte = model.transformer.wte.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache ['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for i in range(nsamples):
+        batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev)
+        try:
+            # print(batch.shape)
+            model(batch)
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers = model.transformer.h
+    layers[0] = layers[0].cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
+    model.transformer.ln_f = model.transformer.ln_f.cpu()
+    torch.cuda.empty_cache()
+    
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+
+    for i in range(len(layers)):
+        print(i)
+        layer = layers[i].to(dev)
+
+        if args.nearest:
+            subset  = find_layers(layer)
+            for name in subset:
+                quantizer = Quantizer()
+                quantizer.configure(
+                    args.wbits, perchannel=True, sym=False, mse=False
+                )
+                W = subset[name].weight.data
+                quantizer.find_params(W, weight=True)
+                subset[name].weight.data = quantize(
+                    W, quantizer.scale, quantizer.zero, quantizer.maxq
+                ).to(next(iter(layer.parameters())).dtype)
+
+        for j in range(nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        layers[i] = layer.cpu()
+        del layer
+        torch.cuda.empty_cache()
+        inps, outs = outs, inps
+
+    model.transformer.ln_f = model.transformer.ln_f.to(dev)
+    model.lm_head = model.lm_head.to(dev)
+    
+    testenc = testenc.to(dev)
+    nlls = []
+    for i in range(nsamples):
+        hidden_states = inps[i].unsqueeze(0)
+        hidden_states = model.transformer.ln_f(hidden_states)
+        lm_logits = model.lm_head(hidden_states)
+        shift_logits = lm_logits[:, :-1, :].contiguous()
+        shift_labels = testenc[
+            :, (i * model.seqlen):((i + 1) * model.seqlen)
+        ][:, 1:]
+        loss_fct = nn.CrossEntropyLoss()
+        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
+    print(ppl.item())
+    
+
+    model.config.use_cache = use_cache
+
+def gptj_pack(model, quantizers, wbits, groupsize):  # Calls make_quant on `model`, then packs every QuantLinear found in it; returns `model`.
+    layers = find_layers(model)
+    layers = {n: layers[n] for n in quantizers}  # keep only the layers that have a quantizer entry
+    make_quant(model, quantizers, wbits, groupsize)
+    qlayers = find_layers(model, [QuantLinear])
+    print('Packing ...')
+    for name in qlayers:
+        print(name)
+        quantizers[name],scale,zero = quantizers[name]  # each entry is unpacked as a 3-tuple; the dict slot is overwritten with its first element
+        quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu()  # move all three to CPU before packing
+        qlayers[name].pack(layers[name], scale, zero)  # pack from the layer object captured before make_quant ran
+    print('Done!')
+    return model
+
+def load_quant(model, checkpoint, wbits, groupsize):  # Build a GPT-J from its config, apply make_quant to it, load `checkpoint` into it and return it.
+    from transformers import GPTJConfig, GPTJForCausalLM
+    config = GPTJConfig.from_pretrained(model)  # `model` is passed straight to from_pretrained here
+    def noop(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = noop  # NOTE(review): these three assignments replace the torch initialisers globally and are never restored
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
+
+    torch.set_default_dtype(torch.half)
+    transformers.modeling_utils._init_weights = False  # NOTE(review): needs a module-level `import transformers` (not visible in this block)
+    torch.set_default_dtype(torch.half)  # NOTE(review): repeats the call two lines above
+    model = GPTJForCausalLM(config)  # from here on `model` is the module, no longer the name passed in
+    torch.set_default_dtype(torch.float)  # switch the default dtype back to float32 after construction
+    model = model.eval()
+    layers = find_layers(model)
+    for name in ['lm_head']:  # lm_head is excluded from the layers handed to make_quant
+        if name in layers:
+            del layers[name]
+    make_quant(model, layers, wbits, groupsize)
+
+    print('Loading model ...')
+    if checkpoint.endswith('.safetensors'):
+        from safetensors.torch import load_file as safe_load
+        model.load_state_dict(safe_load(checkpoint))
+    else:
+        model.load_state_dict(torch.load(checkpoint))
+    model.seqlen = 2048  # sequence length is hard-coded rather than read from the config
+    print('Done!')
+
+    return model
+
+def gptj_multigpu(model, gpus):  # Spread a GPT-J causal LM over `gpus` in place: wte on the first device, ln_f/lm_head on the last, blocks in contiguous groups.
+    model.transformer.wte = model.transformer.wte.to(gpus[0])  # GPT-J keeps its modules under `model.transformer`, as gptj_eval in this file does
+    if hasattr(model.transformer, 'ln_f') and model.transformer.ln_f:
+        model.transformer.ln_f = model.transformer.ln_f.to(gpus[-1])
+    import copy
+    model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
+
+    cache = {'mask': None}  # attention mask most recently moved to a block's device, shared by all wrappers
+
+    class MoveModule(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module  # must be an attribute: the next line and forward() both read self.module
+            self.dev = next(iter(self.module.parameters())).device
+        def forward(self, *inp, **kwargs):
+            inp = list(inp)
+            if inp[0].device != self.dev:  # move the first positional input onto this block's device
+                inp[0] = inp[0].to(self.dev)
+            if cache['mask'] is None or cache ['mask'].device != self.dev:
+                cache['mask'] = kwargs['attention_mask'].to(self.dev)  # copy the mask only when the device changes
+            kwargs['attention_mask'] = cache['mask']
+            tmp = self.module(*inp, **kwargs)
+            return tmp
+
+    layers = model.transformer.h
+    pergpu = math.ceil(len(layers) / len(gpus))  # number of consecutive blocks assigned to each device
+    for i in range(len(layers)):
+        layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
+
+    model.gpus = gpus  # benchmark() checks for this attribute to pick the input device
+
+def benchmark(model, input_ids, check=False):  # Feed `input_ids` one token at a time, print per-token latency and the median; with `check`, also print perplexity and peak memory.
+    input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
+    torch.cuda.synchronize()
+
+    cache = {'past': None}  # key/value cache carried from one token step to the next
+    def clear_past(i):
+        def tmp(layer, inp, out):
+            if cache['past']:
+                cache['past'][i] = None  # drop block i's previous entry once that block has run
+        return tmp
+    for i, layer in enumerate(model.transformer.h):  # GPT-J blocks live under model.transformer.h, as in gptj_eval
+        layer.register_forward_hook(clear_past(i))
+
+    print('Benchmarking ...')
+
+    if check:
+        loss = nn.CrossEntropyLoss()
+        tot = 0.
+
+    def sync():
+        if hasattr(model, 'gpus'):
+            for gpu in model.gpus:
+                torch.cuda.synchronize(gpu)
+        else:
+            torch.cuda.synchronize()
+    max_memory = 0
+    with torch.no_grad():
+        attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
+        times = []
+        for i in range(input_ids.numel()):
+            tick = time.time()
+
+            out = model(
+                input_ids[:, i:i+1],
+                past_key_values=cache['past'],
+                attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1))
+            )
+            sync()  # wait for the device(s) so the wall-clock time covers the whole step
+            times.append(time.time() - tick)
+            print(i, times[-1])
+            max_memory = max(max_memory, torch.cuda.memory_allocated() / 1024 /1024)  # running peak in MiB
+            if check and i != input_ids.numel() - 1:
+                tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()  # loss of the next token given the prefix
+            cache['past'] = list(out.past_key_values)  # a list, so the forward hooks can clear individual entries
+            del out
+        sync()
+        import numpy as np
+        print('Median:', np.median(times))
+        if check:
+            print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
+            print('max memory(MiB):',max_memory)
+
+
+        
+def main(args):  # Load (or build) the model, optionally quantize it or its delta to the base model, optionally benchmark, evaluate, optionally save; returns what gptj_eval returns.
+    print(args)
+    num_params_saved_lr = 0
+    num_params = 0
+    if args.load:
+        model = load_quant(args.model, args.load, args.wbits, args.groupsize)  # the loader defined in this file; `load_quant3` does not exist here
+    else:
+        if args.delta and args.wbits<16:
+            model = get_gptj(args.model)
+            model.eval()
+            base_model = get_gptj(args.base_model)
+            base_model.eval()
+            dataloader, testloader = get_loaders(
+                args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+            )
+            original_finetuned_model = copy.deepcopy(model)  # untouched copy of the finetuned weights
+            for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+                finetuned_p.data = (finetuned_p.data-base_p.data).clone()  # `model` now holds finetuned - base
+        else:
+            model = get_gptj(args.model)
+            model.eval()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    if args.wbits < 16 and not args.nearest:
+        if args.delta:
+            tick = time.time()
+            quantizers = gptj_sequential_delta(original_finetuned_model, model, dataloader, DEV)
+
+            comp_time = time.time()-tick  # measured but neither returned nor printed
+        else:
+            quantizers = gptj_sequential(model, dataloader, DEV)
+    
+    if args.delta and args.wbits<16:
+        for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+            if args.sparsify_hard_threshold:
+                print('Hard Thresholding...')
+                W = finetuned_p.data
+                finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero)
+            if getattr(args, 'rank', 0) > 0 and len(finetuned_p.shape) == 2:  # a namespace without `rank` means no low-rank step
+                print('Finding Low Rank Approximation...')
+                A = finetuned_p.data.float()
+                U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5)
+                A  = U @ torch.diag_embed(S) @ Vh.T
+                finetuned_p.data =  A.half()
+                num_params_saved_lr += torch.numel(A) -  (torch.numel(U) + torch.numel(S) + torch.numel(Vh))
+            num_params += torch.numel(finetuned_p.data)
+            finetuned_p.data = (base_p.data + finetuned_p.data).clone()  # add the base weights back onto the processed delta
+
+    if args.benchmark:
+        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
+        if len(gpus) > 1:
+            gptj_multigpu(model, gpus)
+        else:
+            model = model.to(DEV)
+        if args.benchmark:
+            input_ids = next(iter(dataloader))[0][:, :args.benchmark]  # first calibration sample, truncated to `benchmark` tokens
+            benchmark(model, input_ids, check=args.check)
+    if args.load:
+        exit()
+
+    dataset = args.dataset 
+    dataloader, testloader = get_loaders(
+        dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+    
+    ppl = gptj_eval(model, testloader, DEV)
+    print(ppl)
+
+    if getattr(args, 'rank', 0) > 0:  # same guard as above: tolerate a namespace that has no `rank`
+        n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        print("Number of params without low rank ", n_params)
+        print("Number of params with low rank", n_params - num_params_saved_lr)
+    if args.save:
+        gptj_pack(model, quantizers, args.wbits, args.groupsize)
+        torch.save(model.state_dict(), args.save) 
+    return ppl
+
+if __name__ == '__main__':
+    import argparse
+    from datautils import *
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--model', type=str, default='togethercomputer/GPT-JT-6B-v1',
+        help='GPT-J finetuned model to load; pass `togethercomputer/GPT-JT-6B-v1`.'
+    )
+    parser.add_argument(
+        '--base_model', type=str, default='EleutherAI/gpt-j-6b',
+        help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.'
+    )
+    parser.add_argument(
+        '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'],
+        help='Where to extract calibration data from.'
+    )
+    parser.add_argument(
+        '--seed',
+        type=int, default=0, help='Seed for sampling the calibration data.'
+    )
+    parser.add_argument(
+        '--nsamples', type=int, default=128,
+        help='Number of calibration data samples.'
+    )
+    parser.add_argument(
+        '--percdamp', type=float, default=.01,
+        help='Percent of the average Hessian diagonal to use for dampening.'
+    )
+    parser.add_argument(
+        '--nearest', action='store_true',
+        help='Whether to run the RTN baseline.'
+    )
+    parser.add_argument(
+        '--wbits', type=int, default=16, choices=[2, 3, 4, 16],
+        help='#bits to use for quantization; use 16 for evaluating base model.'
+    )
+    parser.add_argument(
+        '--groupsize', type=int, default=-1,
+        help='Groupsize to use for quantization; default uses full row.'
+    )
+    parser.add_argument(
+        '--save', type=str, default='',
+        help='Save the quantized GPT-J model under this name.'
+    )
+    parser.add_argument(
+        '--save_safetensors', type=str, default='',
+        help='Save the quantized GPT-J model as a  `.safetensors` ckpt'
+    )
+    parser.add_argument(
+        '--load', type=str, default='',
+        help='Load the quantized GPT-J model'
+    )
+    parser.add_argument(
+        '--benchmark', type=int, default=0,
+        help='Number of tokens to use for benchmarking.'
+    )
+    parser.add_argument(
+        '--check', action='store_true',
+        help='Whether to compute perpexity during benchmarking for verification.'
+    )
+    parser.add_argument(
+        '--delta', action='store_true',
+        help='Whether to use delta compression'
+    )
+    parser.add_argument(
+        '--sparsify_hard_threshold', action='store_true',
+        help='Whether to add sparsity'
+    )
+    parser.add_argument(
+        '--fraction_of_zero', type=float, default=0.99,
+        help='Sparsity ratio'
+    )
+    parser.add_argument(
+        '--benchmark_results', type=str, default='',
+        help='store benchmark results'
+    )
+    parser.add_argument(
+        '--sym', action='store_true', default=True,
+        help='Whether to use symmetric quantization'
+    )
+    parser.add_argument(
+        '--trits', action='store_true', default=False, 
+        help='Whether to use trits'
+    )
+    parser.add_argument('--act_order', type=str, default=False)
+    parser.add_argument('--rank', type=int, default=0, help='Rank of the low-rank approximation of the delta; 0 disables it.')  # main() reads args.rank
+
+    args = parser.parse_args()
+    results = PrettyTable()
+    results.field_names = ['Bits', 'wikitext2', 'ptb', 'c4']  # one column per value in each row added below
+    for n_bits in [4, 3, 2]:
+        ppls = []
+        for dataset in ['wikitext2', 'ptb', 'c4']:  # --dataset, --wbits and --save are overridden for every run
+            args.dataset = dataset
+            args.wbits = n_bits
+            args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits)
+            ppl = main(args)
+            ppls.append(ppl)
+        results.add_row([n_bits, ppls[0], ppls[1], ppls[2]])  # n_params / comp_time are locals of main() and are not available here
+        print(results)
+        if args.benchmark_results:  # the default '' would make open() target the working directory itself
+            with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f:
+                f.write(str(results))
+    print('finished.')
\ No newline at end of file
diff --git a/gptq.py b/gptq.py
index 34e987d..80ea2d1 100644
--- a/gptq.py
+++ b/gptq.py
@@ -13,13 +13,13 @@
 
 def hard_threshold(x, fraction_of_zero=0.1):
     if fraction_of_zero == 0:
-        return x
+        return x, None
     y, _ = torch.sort(x.view(-1).abs().clone())
     num_params = torch.numel(x)
     thresh_index = int(num_params * fraction_of_zero)
     threshold = y[thresh_index]
     mask = x.abs().clone().gt(threshold).type(torch.cuda.HalfTensor)
-    return mask * x
+    return mask * x, mask
 
 class GPTQ:
     def __init__(self, layer):
@@ -147,23 +147,20 @@ def fasterquant(
         # here report the loss of the quantized layer vs. the original layer
         new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype)
         losses = {}
+        mask = None
         if sparsity is None:
             sparsed_new_weight = new_weight
             losses[0] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
         else:
             for s_sity in sparsity:
+                sparsed_new_weight, mask = hard_threshold(new_weight, fraction_of_zero=s_sity)
                 if write:
                     logger.info(f"HT with: sparsity={s_sity}")
-                sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=s_sity)
                 losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2)
-                if losses[s_sity] > 100:
-                    logger.info(f"{sparsed_new_weight}")
-                    logger.info(f"{new_weight}")
-                    logger.info(f"{sparsed_new_weight.shape}")
-                    logger.info(f"{torch.max(torch.abs(self.inp1 @ (sparsed_new_weight.T) - self.out1))}")
+                
         if write:
             self.layer.weight.data = sparsed_new_weight
-        return losses
+        return losses, mask
 
     def free(self):
         if DEBUG:
diff --git a/opt_delta.py b/opt_delta.py
index 38e3547..10a34d3 100644
--- a/opt_delta.py
+++ b/opt_delta.py
@@ -473,7 +473,6 @@ def sync():
 
 def main(args):
     print(args)
-    tensor_io = TensorIO('sparse')
     num_params_saved_lr = 0
     num_params = 0
     if args.load:
diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py
index 4e2911e..ad49111 100644
--- a/opt_delta_autotune.py
+++ b/opt_delta_autotune.py
@@ -1,3 +1,4 @@
+import os
 import copy
 import time
 import json
@@ -10,6 +11,7 @@
 from loguru import logger
 from tensorio import TensorIO, model_packing
 from transformers import AutoTokenizer, AutoModel
+import torchvision.transforms as T
 # from prettytable import PrettyTable
 
 def get_opt(model):
@@ -28,15 +30,6 @@ def skip(*args, **kwargs):
     model.seqlen = model.config.max_position_embeddings
     return model
 
-
-def hard_threshold(x, fraction_of_zero=0.1):
-    y, _ = torch.sort(x.view(-1).abs().clone())
-    num_params = torch.numel(x)
-    thresh_index = int(num_params * fraction_of_zero)
-    threshold = y[thresh_index]
-    mask = x.abs().clone().gt(threshold).type(torch.FloatTensor)
-    return mask * x
-
 @torch.no_grad()
 def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2):
     search_space = {
@@ -45,10 +38,15 @@ def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2):
     }
     base_floats = 16
     compression_rates = {}
+    masks = {}
     for wbit in search_space['wbits']:
         for sparsity in search_space['sparsities']:
             compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity)
-    compression_rates = sorted(compression_rates.items(), key=lambda x: x[1], reverse=True)
+    compression_rates = sorted(
+        compression_rates.items(),
+        key=lambda x: x[1],
+        reverse=True
+    )
     
     use_cache = model.config.use_cache
     model.config.use_cache = False
@@ -146,7 +144,7 @@ def tmp(_, inp, out):
         for name in subset:
             logger.info(f"Quantizing {i}.{name} ...")
             for wbit in search_space['wbits']:
-                losses=tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].fasterquant(
+                losses, _ =tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].fasterquant(
                     percdamp=args.percdamp,
                     groupsize=args.groupsize,
                     actorder=args.act_order,
@@ -184,13 +182,16 @@ def tmp(_, inp, out):
             best_loss = tuned_configs[f'{i}_{name}'][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['loss']
             # redo the actual work, and write to the layer
             logger.info(f"Applying wbit={best_wbit}, sparsity={best_sparsity} ...")
-            tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}']['gptq'].fasterquant(
+            loss, mask = tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}']['gptq'].fasterquant(
                 percdamp=args.percdamp,
                 groupsize=args.groupsize,
                 actorder=args.act_order,
                 write=True,
                 sparsity = [best_sparsity],
             )
+            if mask is not None:
+                masks[f'{i}_{name}'] = mask
+
             quantizers["model.decoder.layers.%d.%s" % (i, name)] = tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}']['gptq'].quantizer
             tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}']['gptq'].free()
             tuned_configs[f'{i}_{name}']['choice'] = {
@@ -216,7 +217,7 @@ def tmp(_, inp, out):
 
     model.config.use_cache = use_cache
 
-    return quantizers, tuned_configs
+    return quantizers, tuned_configs, masks
 
 @torch.no_grad()
 def opt_eval(model, testenc, dev):
@@ -316,53 +317,6 @@ def forward(self, inp, **kwargs):
     model.config.use_cache = use_cache
     return ppl.item()
 
-
-# TODO: perform packing on GPU
-def opt_pack3(model, quantizers):
-    layers = find_layers(model)
-    layers = {n: layers[n] for n in quantizers}
-    make_quant3(model, quantizers, faster=args.faster_kernel)
-    qlayers = find_layers(model, [Quant3Linear])
-    print("Packing ...")
-    for name in qlayers:
-        print(name)
-        quantizers[name] = quantizers[name].cpu()
-        qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero)
-    print("Done.")
-    return model
-
-
-def load_quant3(model, checkpoint):
-    from transformers import OPTConfig, OPTForCausalLM
-
-    config = OPTConfig.from_pretrained(model)
-
-    def noop(*args, **kwargs):
-        pass
-
-    torch.nn.init.kaiming_uniform_ = noop
-    torch.nn.init.uniform_ = noop
-    torch.nn.init.normal_ = noop
-
-    torch.set_default_dtype(torch.half)
-    transformers.modeling_utils._init_weights = False
-    torch.set_default_dtype(torch.half)
-    model = OPTForCausalLM(config)
-    torch.set_default_dtype(torch.float)
-    model = model.eval()
-    layers = find_layers(model)
-    for name in ["model.decoder.project_out", "model.decoder.project_in", "lm_head"]:
-        if name in layers:
-            del layers[name]
-    make_quant3(model, layers, faster=args.faster_kernel)
-
-    print("Loading model ...")
-    model.load_state_dict(torch.load(checkpoint))
-    model.seqlen = model.config.max_position_embeddings
-    print("Done.")
-
-    return model
-
 def benchmark(model, input_ids, check=False):
     input_ids = input_ids.to(model.gpus[0] if hasattr(model, "gpus") else DEV)
     torch.cuda.synchronize()
@@ -421,47 +375,45 @@ def sync():
 def main(args):
     print(args)
     num_params = 0
-    if args.load:
-        model = load_quant3(args.model, args.load)
+    if args.delta and args.wbits < 16:
+        model = get_opt(args.model)
+        model.eval()
+        base_model = get_opt(args.base_model)
+        base_model.eval()
+        dataloader, testloader = get_loaders(
+            args.dataset,
+            nsamples=args.nsamples,
+            seed=args.seed,
+            model=args.model,
+            seqlen=model.seqlen,
+        )
+        original_finetuned_model = copy.deepcopy(model)
+        for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+            finetuned_p.data = (finetuned_p.data - base_p.data).clone()
     else:
-        if args.delta and args.wbits < 16:
-            model = get_opt(args.model)
-            model.eval()
-            base_model = get_opt(args.base_model)
-            base_model.eval()
-            dataloader, testloader = get_loaders(
-                args.dataset,
-                nsamples=args.nsamples,
-                seed=args.seed,
-                model=args.model,
-                seqlen=model.seqlen,
-            )
-            original_finetuned_model = copy.deepcopy(model)
-            for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
-                finetuned_p.data = (finetuned_p.data - base_p.data).clone()
-        else:
-            model = get_opt(args.model)
-            model.eval()
-
-    dataloader, testloader = get_loaders(
-        args.dataset,
-        nsamples=args.nsamples,
-        seed=args.seed,
-        model=args.model,
-        seqlen=model.seqlen,
-    )
+        model = get_opt(args.model)
+        model.eval()
 
     if args.wbits < 16:
         if args.delta:
             tick = time.time()
-            quantizers, tuned_params = opt_sequential_delta(
+            quantizers, tuned_params, masks = opt_sequential_delta(
                 original_finetuned_model, model, dataloader, DEV, args.tol
             )
-            with open(f".cache/{args.model.replace('/', '.')}_delta_tol={args.tol}.json", "w+") as f:
+            data_dir = os.path.join(".cache", args.model.replace('/', '.')) 
+            os.makedirs(data_dir, exist_ok=True)
+            with open(f".cache/{args.model.replace('/', '.')}/delta_tol={args.tol}_tuned_params.json", "w+") as f:
                 json.dump(tuned_params, f)
-            comp_time = time.time() - tick
+            # iterate over all the dict keys in masks
+            transforms = T.ToPILImage()
+            for key in masks.keys():
+                logger.info(f"Saving mask for {key}")
+                binmask = transforms(masks[key])
+                binmask = binmask.convert("1")
+                binmask.save(os.path.join(data_dir, f"delta_tol={args.tol}_mask_{key}.bmp"))
         else:
             raise NotImplementedError
+    
     if args.delta and args.wbits < 16:
         for idx, (base_p, finetuned_p) in enumerate(
             zip(base_model.parameters(), model.parameters())
@@ -503,10 +455,6 @@ def main(args):
         model.save_pretrained(hf_path)
         tokenizer = AutoTokenizer.from_pretrained(args.model)
         tokenizer.save_pretrained(hf_path)
-    else:
-        opt_pack3(model, quantizers)
-        torch.save(model.state_dict(), args.save)
-
 
 if __name__ == "__main__":
     import argparse
diff --git a/pack_utils_test.py b/pack_utils_test.py
index 58f1ad0..efc7963 100644
--- a/pack_utils_test.py
+++ b/pack_utils_test.py
@@ -1,25 +1,14 @@
 import torch
-from quant import quantize, Quantizer
-from safetensors import safe_open
-from pack_utils import SparseTensor, pack_to_bits, unpack_from_bits
-from safetensors.torch import save_file
+from quant import Quantizer
 from opt_delta import hard_threshold
+from safetensors.torch import save_file
 
 QUANTIZED_BITS = 4
 
 if __name__=="__main__":
-    """
-    The process:
-    1. Given a weight, quantize it first
-    2. Then do sparsification
-
-    To test our pack/unpack, we need to do the following:
-    1. After the sparsification, we pack the weight and store on disk
-    2. Compare the original weight with the unpacked weight
-    """
-
     torch.set_printoptions(precision=4)
-    b = torch.rand((2048, 2048), dtype=torch.float32)
+    b = torch.rand((1, 1), dtype=torch.float32)
+    print(b)
     # save b
     save_file({'wb1': b}, '.cache/original_b.safetensor')
     quantizer = Quantizer()
@@ -28,18 +17,5 @@
     )
     quantizer.find_params(b, weight=True)
     b_q = quantizer.quantize(b)
-    sparsed_b_q = hard_threshold(b_q, 0.01)
-
-    q_weight = pack_to_bits(sparsed_b_q, quantizer, QUANTIZED_BITS, groupsize=sparsed_b_q.shape[0])
-    sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1)
-    sparse_t.to_disk('.cache/sparse_b.safetensor')
-    # now load it back
-    restored_sparse_t = SparseTensor.from_disk('.cache/sparse_b.safetensor')
-    restored_weight = restored_sparse_t.tensor
-    # this is what we restored from disk
-    restored_weight = unpack_from_bits(restored_weight, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0])
-    print(f"Original weight: {sparsed_b_q}")
-    print(f"Restored weight: {restored_weight}")
-    # count the number of non-zero elements
-    print(f"Original weight: {sparsed_b_q.nonzero().shape[0]}")
-    print(f"Restored weight: {restored_weight.nonzero().shape[0]}")
\ No newline at end of file
+    print(b_q)
+    
\ No newline at end of file
diff --git a/quant.py b/quant.py
index f23099a..f4a4983 100644
--- a/quant.py
+++ b/quant.py
@@ -4,6 +4,9 @@
 import torch.nn as nn
 
 def quantize(x, scale, zero, maxq):
+    print(scale)
+    print(zero)
+    print(maxq)
     if maxq < 0:
         return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
     q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
@@ -83,7 +86,12 @@ def find_params(self, x, weight=False):
                 xmax1 = p * xmax
                 scale1 = (xmax1 - xmin1) / self.maxq
                 zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
-                q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq)
+                q = quantize(
+                    x,
+                    scale1.unsqueeze(1),
+                    zero1.unsqueeze(1),
+                    self.maxq
+                )
                 q -= x
                 q.abs_()
                 q.pow_(self.norm)
diff --git a/tensorio.py b/tensorio.py
index 46cdd2f..c5526f8 100644
--- a/tensorio.py
+++ b/tensorio.py
@@ -52,5 +52,4 @@ def model_packing(model, quantizers, bits, reformat='none'):
         if name in quantizers:
             quantizers[name] = quantizers[name].cpu()
             x, scale = compress_flexible_nbits(layers[name].weight.data.cuda(), bits)
-    return x, scale
-
+    return x, scale
\ No newline at end of file

From db0dc5193dc03bc58a89f906dae88cb3bc29f030 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Mon, 15 May 2023 16:44:32 +0000
Subject: [PATCH 22/23] add datautils for generic jsonl

---
 .gitignore                              |   3 +-
 compress_utils.py                       |   1 -
 datautils.py                            |  65 +++++++++++-
 gptq.py                                 |  11 +-
 opt_delta_autotune.py                   |   8 +-
 opt_eval_ppl.py                         | 128 ++++++++++++++++++++++++
 quant.py                                |   5 +-
 scripts/opt_delta_exp.sh                |  10 ++
 utilities/compression_rate_estimator.py |   4 -
 utilities/convert_to_hf.py              |   6 +-
 utilities/cr_cal.py                     |   0
 utilities/to_csv.py                     |  19 ++++
 12 files changed, 237 insertions(+), 23 deletions(-)
 create mode 100644 opt_eval_ppl.py
 create mode 100644 scripts/opt_delta_exp.sh
 create mode 100644 utilities/cr_cal.py
 create mode 100644 utilities/to_csv.py

diff --git a/.gitignore b/.gitignore
index f73b20e..73ab46b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,5 @@ outputs_past/
 packed_delta
 .cache
 delta_outputs/
-.io/
\ No newline at end of file
+.io/
+outputs_exp/
\ No newline at end of file
diff --git a/compress_utils.py b/compress_utils.py
index 341704f..143f017 100644
--- a/compress_utils.py
+++ b/compress_utils.py
@@ -283,7 +283,6 @@ def _compress_nbits_by_bucket(x, bits, scale_method='max', bucket_size=512,
     
     return x, scale
 
-
 def compress_flexible_nbits_by_bucket(x, bits, scale_method='max', bucket_size=512,
                                       stochastic=False, minimum_stochastic_distance=0.2):
     # support any bits
diff --git a/datautils.py b/datautils.py
index 045121a..51003f6 100644
--- a/datautils.py
+++ b/datautils.py
@@ -1,5 +1,8 @@
-import numpy as np
+import json
 import torch
+import random
+import numpy as np
+from transformers import AutoTokenizer
 
 def set_seed(seed):
     np.random.seed(seed)
@@ -157,6 +160,46 @@ def __init__(self, input_ids):
 
     return trainloader, valenc
 
+def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_size=None, val_seq_len=256):
+    """
+    Draw `n_samples` random `seq_len`-token windows from the JSONL file `train_path` and tokenize the
+    'text' fields of `val_path` (first `val_size` records if given). Returns (trainloader, valenc).
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
+    with open(train_path, 'r') as f:
+        traindata = [json.loads(line) for line in f.readlines()]
+    with open(val_path, 'r') as f:
+        valdata = [json.loads(line) for line in f.readlines()]
+    traindata = [d['text'] for d in traindata]  # plain lists of strings, so integer indexing, len() and slicing below work
+    valdata = [d['text'] for d in valdata]
+    set_seed(seed)
+    random.seed(seed)  # the draws below use Python's `random`; seed it here so sampling is reproducible
+    trainloader = []
+    for _ in range(n_samples):
+        # for all datasets, we take the samples that are at least seq_len tokens long (loops forever if none is)
+        while True:
+            i = random.randint(0, len(traindata) - 1)
+            trainenc = tokenizer(traindata[i], return_tensors='pt')
+            if trainenc.input_ids.shape[1] >= seq_len:
+                break
+        # then clip the samples to seq_len; randint is inclusive, so the last valid start is len - seq_len
+        i = random.randint(0, trainenc.input_ids.shape[1] - seq_len)
+        j = i + seq_len
+        inp = trainenc.input_ids[:, i:j]
+        tar = inp.clone()
+        tar[:, :-1] = -100  # every target position except the last is set to -100
+        trainloader.append((inp, tar))
+    if val_size is not None:
+        valenc = tokenizer(' '.join(valdata[:val_size]), return_tensors='pt')
+    else:
+        valenc = tokenizer(' '.join(valdata), return_tensors='pt')
+    valenc = valenc.input_ids[:, :(val_seq_len * seq_len)]  # cap the validation stream at val_seq_len * seq_len tokens
+
+    class TokenizerWrapper:
+        def __init__(self, input_ids):
+            self.input_ids = input_ids
+    valenc = TokenizerWrapper(valenc)  # exposes `.input_ids`, which gptj_eval reads from its test encoding
+    return trainloader, valenc
 
 def get_loaders(
     name, nsamples=128, seed=0, seqlen=2048, model=''
@@ -171,3 +214,23 @@ def get_loaders(
         if 'new' in name:
             return get_c4_new(nsamples, seed, seqlen, model)
         return get_c4(nsamples, seed, seqlen, model)
+    if name == "answer_verification":
+        return get_jsonl(".cache/ni_calib/train/answer_verification.jsonl", ".cache/ni_calib/val/answer_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+    if name == "coherence_classification":
+        return get_jsonl(".cache/ni_calib/test/coherence_classification.jsonl", ".cache/ni_calib/test/coherence_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+    if name == "commonsense_classification":
+        return get_jsonl(".cache/ni_calib/train/commonsense_classification.jsonl", ".cache/ni_calib/test/commonsense_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+    if name == "dialogue_state_tracking":
+        return get_jsonl(".cache/ni_calib/train/dialogue_state_tracking.jsonl", ".cache/ni_calib/test/dialogue_state_tracking.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+    if name == "fact_verification":
+        return get_jsonl(".cache/ni_calib/train/fact_verification.jsonl", ".cache/ni_calib/test/fact_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+    if name == "gender_classification":
+        return get_jsonl(".cache/ni_calib/train/gender_classification.jsonl", ".cache/ni_calib/test/gender_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+    if name == "irony_detection":
+        return get_jsonl(".cache/ni_calib/train/irony_detection.jsonl", ".cache/ni_calib/test/irony_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+    if name == "stance_detection":
+        return get_jsonl(".cache/ni_calib/train/stance_detection.jsonl", ".cache/ni_calib/test/stance_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+    if name == "toxic_language_detection":
+        return get_jsonl(".cache/ni_calib/train/toxic_language_detection.jsonl", ".cache/ni_calib/test/toxic_language_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+    if name == "word_semantics":
+        return get_jsonl(".cache/ni_calib/train/word_semantics.jsonl", ".cache/ni_calib/test/word_semantics.jsonl", nsamples, seed, seqlen, model, val_size=1000)
\ No newline at end of file
diff --git a/gptq.py b/gptq.py
index 80ea2d1..87dd8cf 100644
--- a/gptq.py
+++ b/gptq.py
@@ -11,12 +11,18 @@
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False
 
-def hard_threshold(x, fraction_of_zero=0.1):
+def hard_threshold(x, fraction_of_zero=0.1, random_sparsification=0.5):
     if fraction_of_zero == 0:
         return x, None
+    # randomly set random_sparsification of the weights to zero
+    if random_sparsification > 0:
+        logger.info(f"Randomly sparsifying the weights with {random_sparsification}")
+        mask = torch.rand(x.shape, device=x.device) > random_sparsification
+        x = x * mask
     y, _ = torch.sort(x.view(-1).abs().clone())
     num_params = torch.numel(x)
-    thresh_index = int(num_params * fraction_of_zero)
+    # rescale only when random sparsification ran (avoids 1/0); clamp so y[thresh_index] stays in range
+    thresh_index = min(int(num_params * fraction_of_zero * (1 / random_sparsification if random_sparsification > 0 else 1)), num_params - 1)
     threshold = y[thresh_index]
     mask = x.abs().clone().gt(threshold).type(torch.cuda.HalfTensor)
     return mask * x, mask
@@ -46,7 +52,6 @@ def add_batch(self, inp, out):
             if len(inp.shape) == 3:
                 inp = inp.reshape((-1, inp.shape[-1]))
             inp = inp.t()
-        
         self.H *= self.nsamples / (self.nsamples + tmp)
         self.nsamples += tmp
         # inp = inp.float()
diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py
index ad49111..ac88b37 100644
--- a/opt_delta_autotune.py
+++ b/opt_delta_autotune.py
@@ -47,7 +47,7 @@ def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2):
         key=lambda x: x[1],
         reverse=True
     )
-    
+
     use_cache = model.config.use_cache
     model.config.use_cache = False
     layers = model.model.decoder.layers
@@ -144,7 +144,7 @@ def tmp(_, inp, out):
         for name in subset:
             logger.info(f"Quantizing {i}.{name} ...")
             for wbit in search_space['wbits']:
-                losses, _ =tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].fasterquant(
+                losses, _ = tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].fasterquant(
                     percdamp=args.percdamp,
                     groupsize=args.groupsize,
                     actorder=args.act_order,
@@ -449,9 +449,9 @@ def main(args):
 
     if args.save_hf:
         if args.delta:
-            hf_path = f"outputs/{args.model.replace('/', '.')}_delta_autotune_tol={args.tol}"
+            hf_path = f"outputs_exp/{args.model.replace('/', '.')}_delta_autotune_tol={args.tol}"
         else:
-            hf_path = f"outputs/{args.model.replace('/', '.')}_autotuned_tol={args.tol}"
+            hf_path = f"outputs_exp/{args.model.replace('/', '.')}_autotuned_tol={args.tol}"
         model.save_pretrained(hf_path)
         tokenizer = AutoTokenizer.from_pretrained(args.model)
         tokenizer.save_pretrained(hf_path)
diff --git a/opt_eval_ppl.py b/opt_eval_ppl.py
new file mode 100644
index 0000000..bf41c5d
--- /dev/null
+++ b/opt_eval_ppl.py
@@ -0,0 +1,128 @@
+import os
+import json
+import torch
+import torch.nn as nn
+from modelutils import get_opt
+from datautils import get_loaders
+
+BENCHMARK = 2048
+
+dataset = 'wikitext2'
+
+nsamples = 128
+
+@torch.no_grad()
+def opt_eval(model, testenc, dev):
+    print('Evaluating ...')
+
+    testenc = testenc.input_ids
+    nsamples = testenc.numel() // model.seqlen
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
+    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
+    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for i in range(nsamples):
+        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
+        try:
+            model(batch)
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
+    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
+    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+
+    for i in range(len(layers)):
+        print(i)
+        layer = layers[i].to(dev)
+
+        for j in range(nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        layers[i] = layer.cpu()
+        del layer
+        torch.cuda.empty_cache()
+        inps, outs = outs, inps
+
+    if model.model.decoder.final_layer_norm is not None:
+        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
+    if model.model.decoder.project_out is not None:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
+    model.lm_head = model.lm_head.to(dev)
+
+    testenc = testenc.to(dev)
+    nlls = []
+    for i in range(nsamples):
+        hidden_states = inps[i].unsqueeze(0)
+        if model.model.decoder.final_layer_norm is not None:
+            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
+        if model.model.decoder.project_out is not None:
+            hidden_states = model.model.decoder.project_out(hidden_states)
+        lm_logits = model.lm_head(hidden_states)
+        shift_logits = lm_logits[:, :-1, :].contiguous()
+        shift_labels = testenc[
+            :, (i * model.seqlen):((i + 1) * model.seqlen)
+        ][:, 1:]
+        loss_fct = nn.CrossEntropyLoss()
+        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
+    model.config.use_cache = use_cache
+    return ppl.item()
+
+models = os.listdir("outputs")
+res = {}
+models = [
+    # 'facebook/opt-1.3b',
+    # 'facebook/opt-350m', 
+    'facebook/opt-2.7b', 
+    # 'lnair/opt-350m-wikitext2',
+    # 'lnair/opt-1.3b-wikitext2',
+    'lnair/opt-2.7b-wikitext2'
+]
+for model_name in models:
+    # model_path = os.path.join("outputs", model_name)
+    model = get_opt(model_name)
+    model.to("cuda")
+    _, testloader = get_loaders(
+        dataset, nsamples=128, seed=0, model=model_name, seqlen=model.seqlen
+    )
+    ppl = opt_eval(model, testloader, model.device)
+    res[model_name] = ppl
+    print(res)
+    with open("ppl_res.json", "w") as f:
+        json.dump(res, f)
\ No newline at end of file
diff --git a/quant.py b/quant.py
index f4a4983..386845c 100644
--- a/quant.py
+++ b/quant.py
@@ -1,12 +1,9 @@
 import math
-import numpy as np
 import torch
+import numpy as np
 import torch.nn as nn
 
 def quantize(x, scale, zero, maxq):
-    print(scale)
-    print(zero)
-    print(maxq)
     if maxq < 0:
         return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
     q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
diff --git a/scripts/opt_delta_exp.sh b/scripts/opt_delta_exp.sh
new file mode 100644
index 0000000..2b41218
--- /dev/null
+++ b/scripts/opt_delta_exp.sh
@@ -0,0 +1,10 @@
+python opt_delta_autotune.py \
+    --dataset wikitext2 \
+    --base-model facebook/opt-1.3b \
+    --model lnair/opt-1.3b-wikitext2 \
+    --delta \
+    --wbits 2 \
+    --tol 2 \
+    --save-delta \
+    --save-hf \
+    --groupsize 1024
\ No newline at end of file
diff --git a/utilities/compression_rate_estimator.py b/utilities/compression_rate_estimator.py
index 90adc48..2b8a564 100644
--- a/utilities/compression_rate_estimator.py
+++ b/utilities/compression_rate_estimator.py
@@ -3,10 +3,6 @@
 from modelutils import get_opt, find_layers
 from compression_scripts.model_utils import get_opt, find_layers
 
-
-base_floats = 16
-
-
 base_floats = 16
 
 def calc_compression(path: str, base_model: str):
diff --git a/utilities/convert_to_hf.py b/utilities/convert_to_hf.py
index 8111506..36f7262 100644
--- a/utilities/convert_to_hf.py
+++ b/utilities/convert_to_hf.py
@@ -1,16 +1,12 @@
+import os
 import torch
 import torch.nn as nn
 
 from transformers import GPTJForCausalLM
-
 from transformers import AutoConfig, AutoTokenizer
-
 from transformers.modeling_utils import no_init_weights
-import os
-
 
 def create_emtpy_gptj(config):
-
     import torch
     import torch.nn as nn
 
diff --git a/utilities/cr_cal.py b/utilities/cr_cal.py
new file mode 100644
index 0000000..e69de29
diff --git a/utilities/to_csv.py b/utilities/to_csv.py
new file mode 100644
index 0000000..db809a2
--- /dev/null
+++ b/utilities/to_csv.py
@@ -0,0 +1,19 @@
+import json
+import pandas as pd
+with open('ppl_res.json') as f:
+    res = json.load(f)
+# convert to csv
+sizes_group = ['350m', '1.3b', '2.7b']
+results = []
+for key in res.keys():
+    results.append({
+        'model': key,
+        'perplexity': res[key],
+    })
+df = pd.DataFrame(results)
+
+# one CSV per model size: columns are the models of that size, the single row holds their perplexity
+for size in sizes_group:
+    subdf = df[df['model'].str.contains(size, regex=False)]  # literal match: '.' in '1.3b' is not a wildcard
+    subdf = subdf.pivot_table(values='perplexity', columns='model')
+    subdf.to_csv(f'ppl_res_{size}.csv', index=False)
\ No newline at end of file

From 5d2c0861b1c87ce2f12cd428080427ce31fd5cb1 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Mon, 15 May 2023 21:59:06 +0000
Subject: [PATCH 23/23] generic data loader

---
 datautils.py          | 70 ++++++++++++++++++++++++++++++-------------
 opt_delta_autotune.py |  2 --
 opt_eval_ppl.py       | 29 +++++++++---------
 pack_utils_test.py    |  1 -
 ppl_res.json          |  1 +
 scripts/gptq_delta.sh |  6 ++--
 6 files changed, 69 insertions(+), 40 deletions(-)
 create mode 100644 ppl_res.json

diff --git a/datautils.py b/datautils.py
index 51003f6..71a8616 100644
--- a/datautils.py
+++ b/datautils.py
@@ -2,6 +2,7 @@
 import torch
 import random
 import numpy as np
+from datasets import Dataset
 from transformers import AutoTokenizer
 
 def set_seed(seed):
@@ -160,7 +161,7 @@ def __init__(self, input_ids):
 
     return trainloader, valenc
 
-def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_size=None, val_seq_len=256):
+def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_size=None, val_seq_len=256, padding=False):
     """
     train_path: path to train jsonl file
     test_path: path to test jsonl file
@@ -171,7 +172,9 @@ def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_si
     with open(val_path, 'r') as f:
         valdata = [json.loads(line) for line in f.readlines()]
     traindata = {"text": [d['text'] for d in traindata]}
-    testdata = {"text": [d['text'] for d in testdata]}
+    valdata = {"text": [d['text'] for d in valdata]}
+    traindata = Dataset.from_dict(traindata)
+    valdata = Dataset.from_dict(valdata)
     set_seed(seed)
 
     trainloader = []
@@ -179,16 +182,25 @@ def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_si
         # for all datasets, we take the samples that are longer than seq_len
         while True:
             i = random.randint(0, len(traindata) - 1)
-            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
+            if padding:
+                trainenc = tokenizer(traindata[i]['text'], padding='max_length', truncation=True, max_length=seq_len, return_tensors='pt')
+            else:
+                trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
             if trainenc.input_ids.shape[1] >= seq_len:
                 break
-        # then clip the samples to seq_len
-        i = random.randint(0, trainenc.input_ids.shape[1] - seq_len - 1)
-        j = i + seq_len
-        inp = trainenc.input_ids[:, i:j]
-        tar = inp.clone()
-        tar[:, :-1] = -100
-        trainloader.append((inp, tar))
+        if not padding:
+            # clip to a random seq_len window; randint is inclusive, so the last valid start is len - seq_len
+            i = random.randint(0, trainenc.input_ids.shape[1] - seq_len)
+            j = i + seq_len
+            inp = trainenc.input_ids[:, i:j]
+            tar = inp.clone()
+            tar[:, :-1] = -100
+            trainloader.append((inp, tar))
+        else:
+            inp = trainenc.input_ids
+            tar = inp.clone()
+            tar[:, :-1] = -100
+            trainloader.append((inp, tar))
     if val_size is not None:
         valenc = tokenizer(' '.join(valdata[:val_size]['text']), return_tensors='pt')
     else:
@@ -215,22 +227,40 @@ def get_loaders(
             return get_c4_new(nsamples, seed, seqlen, model)
         return get_c4(nsamples, seed, seqlen, model)
     if name == "answer_verification":
-        return get_jsonl(".cache/ni_calib/train/answer_verification.jsonl", ".cache/ni_calib/val/answer_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+        return get_jsonl(
+            ".cache/ni_calib/train/answer_verification.jsonl", 
+            ".cache/ni_calib/test/answer_verification.jsonl", 
+            nsamples, 
+            seed, 
+            seqlen, 
+            model, 
+            val_size=1000, 
+            padding=True
+        )
     if name == "coherence_classification":
-        return get_jsonl(".cache/ni_calib/test/coherence_classification.jsonl", ".cache/ni_calib/test/coherence_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+        return get_jsonl(".cache/ni_calib/test/coherence_classification.jsonl", ".cache/ni_calib/test/coherence_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
     if name == "commonsense_classification":
-        return get_jsonl(".cache/ni_calib/train/commonsense_classification.jsonl", ".cache/ni_calib/test/commonsense_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+        return get_jsonl(".cache/ni_calib/train/commonsense_classification.jsonl", ".cache/ni_calib/test/commonsense_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
     if name == "dialogue_state_tracking":
-        return get_jsonl(".cache/ni_calib/train/dialogue_state_tracking.jsonl", ".cache/ni_calib/test/dialogue_state_tracking.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+        return get_jsonl(".cache/ni_calib/train/dialogue_state_tracking.jsonl", ".cache/ni_calib/test/dialogue_state_tracking.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
     if name == "fact_verification":
-        return get_jsonl(".cache/ni_calib/train/fact_verification.jsonl", ".cache/ni_calib/test/fact_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+        return get_jsonl(".cache/ni_calib/train/fact_verification.jsonl", ".cache/ni_calib/test/fact_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
     if name == "gender_classification":
-        return get_jsonl(".cache/ni_calib/train/gender_classification.jsonl", ".cache/ni_calib/test/gender_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+        return get_jsonl(".cache/ni_calib/train/gender_classification.jsonl", ".cache/ni_calib/test/gender_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
     if name == "irony_detection":
-        return get_jsonl(".cache/ni_calib/train/irony_detection.jsonl", ".cache/ni_calib/test/irony_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+        return get_jsonl(".cache/ni_calib/train/irony_detection.jsonl", ".cache/ni_calib/test/irony_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
     if name == "stance_detection":
-        return get_jsonl(".cache/ni_calib/train/stance_detection.jsonl", ".cache/ni_calib/test/stance_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+        return get_jsonl(".cache/ni_calib/train/stance_detection.jsonl", ".cache/ni_calib/test/stance_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
     if name == "toxic_language_detection":
-        return get_jsonl(".cache/ni_calib/train/toxic_language_detection.jsonl", ".cache/ni_calib/test/toxic_language_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000)
+        return get_jsonl(".cache/ni_calib/train/toxic_language_detection.jsonl", ".cache/ni_calib/test/toxic_language_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True)
     if name == "word_semantics":
-        return get_jsonl(".cache/ni_calib/train/word_semantics.jsonl", ".cache/ni_calib/test/word_semantics.jsonl", nsamples, seed, seqlen, model, val_size=1000)
\ No newline at end of file
+        return get_jsonl(
+            ".cache/ni_calib/train/word_semantics.jsonl", 
+            ".cache/ni_calib/test/word_semantics.jsonl", 
+            nsamples, 
+            seed, 
+            seqlen, 
+            model, 
+            val_size=1000, 
+            padding=True
+        )
\ No newline at end of file
diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py
index ac88b37..937ab7f 100644
--- a/opt_delta_autotune.py
+++ b/opt_delta_autotune.py
@@ -371,7 +371,6 @@ def sync():
         if check:
             print("PPL:", torch.exp(tot / (input_ids.numel() - 1)).item())
 
-
 def main(args):
     print(args)
     num_params = 0
@@ -471,7 +470,6 @@ def main(args):
     parser.add_argument(
         "--dataset",
         type=str,
-        choices=["wikitext2", "ptb", "c4"],
         default="wikitext2",
         help="Where to extract calibration data from.",
     )
diff --git a/opt_eval_ppl.py b/opt_eval_ppl.py
index bf41c5d..ca290b5 100644
--- a/opt_eval_ppl.py
+++ b/opt_eval_ppl.py
@@ -7,8 +7,6 @@
 
 BENCHMARK = 2048
 
-dataset = 'wikitext2'
-
 nsamples = 128
 
 @torch.no_grad()
@@ -104,23 +102,26 @@ def forward(self, inp, **kwargs):
     model.config.use_cache = use_cache
     return ppl.item()
 
-models = os.listdir("outputs")
+models = os.listdir(".cache/models")
 res = {}
-models = [
-    # 'facebook/opt-1.3b',
-    # 'facebook/opt-350m', 
-    'facebook/opt-2.7b', 
-    # 'lnair/opt-350m-wikitext2',
-    # 'lnair/opt-1.3b-wikitext2',
-    'lnair/opt-2.7b-wikitext2'
-]
+# models = [
+#     # 'facebook/opt-1.3b',
+#     # 'facebook/opt-350m', 
+#     'facebook/opt-2.7b', 
+#     # 'lnair/opt-350m-wikitext2',
+#     # 'lnair/opt-1.3b-wikitext2',
+#     'lnair/opt-2.7b-wikitext2'
+# ]
 for model_name in models:
-    # model_path = os.path.join("outputs", model_name)
-    model = get_opt(model_name)
+    dataset = model_name
+    model_path = os.path.join(".cache", "models", model_name)
+    model = get_opt(model_path)
     model.to("cuda")
+    print("model loaded")
     _, testloader = get_loaders(
-        dataset, nsamples=128, seed=0, model=model_name, seqlen=model.seqlen
+        dataset, nsamples=128, seed=0, model=model_path, seqlen=model.seqlen
     )
+    print("data loaded")
     ppl = opt_eval(model, testloader, model.device)
     res[model_name] = ppl
     print(res)
diff --git a/pack_utils_test.py b/pack_utils_test.py
index efc7963..563f31d 100644
--- a/pack_utils_test.py
+++ b/pack_utils_test.py
@@ -8,7 +8,6 @@
 if __name__=="__main__":
     torch.set_printoptions(precision=4)
     b = torch.rand((1, 1), dtype=torch.float32)
-    print(b)
     # save b
     save_file({'wb1': b}, '.cache/original_b.safetensor')
     quantizer = Quantizer()
diff --git a/ppl_res.json b/ppl_res.json
new file mode 100644
index 0000000..f86c4f0
--- /dev/null
+++ b/ppl_res.json
@@ -0,0 +1 @@
+{"fact_verification": 7.487515449523926}
\ No newline at end of file
diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh
index 07e84be..ec3d1f2 100644
--- a/scripts/gptq_delta.sh
+++ b/scripts/gptq_delta.sh
@@ -1,7 +1,7 @@
 python opt_delta_autotune.py \
-    --dataset wikitext2 \
-    --base-model facebook/opt-350m \
-    --model lnair/opt-350m-wikitext2 \
+    --dataset answer_verification \
+    --base-model facebook/opt-1.3b \
+    --model .cache/models/answer_verification \
     --delta \
     --wbits 2 \
     --tol 2 \