From 3b2997968a24050f00d5102ac32dbebb66ba0c78 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Tue, 9 May 2023 15:52:43 +0000 Subject: [PATCH 01/23] better decomposition --- cli.py | 2 +- core_compression.py | 4 +-- decomposition.py | 60 +++++++++++++++++++++++++++++++++++++++++++++ matq.py | 13 +++++----- scripts/lr_quant.sh | 20 +++++++++++++++ 5 files changed, 90 insertions(+), 9 deletions(-) create mode 100644 decomposition.py diff --git a/cli.py b/cli.py index b1bf8df..96eae8b 100644 --- a/cli.py +++ b/cli.py @@ -81,6 +81,6 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s ) if args.save: save_lr_tensors(lr_tensors, f"outputs/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors") - + ppl = opt_eval(target_model, loader_enc, args, target_model.device) logger.info(f"Perplexity: {ppl}") \ No newline at end of file diff --git a/core_compression.py b/core_compression.py index 2b445f7..f69cbba 100644 --- a/core_compression.py +++ b/core_compression.py @@ -1,9 +1,9 @@ import torch import torch.nn as nn -from loguru import logger -from modelutils import find_layers from matq import TensorQ +from loguru import logger from quant import Quantizer +from modelutils import find_layers @torch.no_grad() def opt_delta_lr( diff --git a/decomposition.py b/decomposition.py new file mode 100644 index 0000000..ec531af --- /dev/null +++ b/decomposition.py @@ -0,0 +1,60 @@ +import torch +import time +from loguru import logger + +def pca_decomposition(matrix, rank): + U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=5) + return U, torch.diag_embed(S) @ Vh.T +def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5): + # Initialize random matrices U and V + m, n = matrix.shape + U = torch.rand(m, rank) + V = torch.rand(rank, n) + U.to(matrix.device) + V.to(matrix.device) + tick = time.time() + early_stop = False + for i in range(max_iterations): + # Calculate the difference between the original and reconstructed matrices + difference = matrix - U @ V + + # Calculate the gradients + gradient_U = -2 * (difference @ V.T) + gradient_V = -2 * (U.T @ difference) + U -= learning_rate * gradient_U + V -= learning_rate * gradient_V + if torch.norm(difference) < tolerance: + early_stop = True + break + if not early_stop: + logger.warning(f"Low rank decomposition did not converge. Elapsed time: {time.time() - tick}") + else: + logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}") + return U, V + +if __name__=="__main__": + matrix = torch.rand((2048,2048)) + print("Original matrix:") + print(matrix) + rank = 32 + U, V = low_rank_decomposition( + matrix, + rank, + learning_rate=1e-6, + max_iterations=100000, + ) + U_pca, V_pca = pca_decomposition(matrix, rank) + reconstructed_matrix_pca = U_pca @ V_pca + + # print("U:") + # print(U) + # print("V:") + # print(V) + + reconstructed_matrix = U @ V + print("Reconstructed matrix:") + print(reconstructed_matrix) + + print("difference:") + print(torch.norm(matrix - reconstructed_matrix)) + print(torch.norm(matrix - reconstructed_matrix_pca)) \ No newline at end of file diff --git a/matq.py b/matq.py index 21ab6bc..030d06f 100644 --- a/matq.py +++ b/matq.py @@ -5,7 +5,7 @@ import transformers from loguru import logger from quant import quantize - +from decomposition import low_rank_decomposition DEBUG = False @@ -84,11 +84,12 @@ def decompose(self): W = W.float() logger.info("starting decomposition") tick = time.time() - U, S, Vh = torch.pca_lowrank(W, q=self.rank, center=True, niter=5) - # let's say L = U - # and R = diag(S)*V.T - self.L = U - self.R = torch.diag_embed(S) @ Vh.T + # U, S, Vh = torch.pca_lowrank(W, q=self.rank, center=True, niter=5) + # # let's say L = U + # # and R = diag(S)*V.T + # self.L = U + # self.R = torch.diag_embed(S) @ Vh.T + self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-6, max_iterations=100000) logger.info(f"decomposition done. elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}") def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, rank=32): diff --git a/scripts/lr_quant.sh b/scripts/lr_quant.sh index ccfc5a5..9503a9b 100644 --- a/scripts/lr_quant.sh +++ b/scripts/lr_quant.sh @@ -6,4 +6,24 @@ python cli.py \ --rank 32 \ --save outputs/ \ --nsamples 128 \ + --wbits 8 + +python cli.py \ + --dataset wikitext2 \ + --target-model lnair/opt-1.3b-wikitext2 \ + --base-model facebook/opt-1.3b \ + --delta \ + --rank 128 \ + --save outputs/ \ + --nsamples 128 \ + --wbits 8 + +python cli.py \ + --dataset wikitext2 \ + --target-model lnair/opt-1.3b-wikitext2 \ + --base-model facebook/opt-1.3b \ + --delta \ + --rank 256 \ + --save outputs/ \ + --nsamples 128 \ --wbits 8 \ No newline at end of file From f4dfd8292cc751e6c2447292aa9bf38efdda16b1 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Tue, 9 May 2023 18:36:42 +0000 Subject: [PATCH 02/23] gradient descent decomposition --- cli.py | 2 +- core_compression.py | 5 +++-- decomposition.py | 14 ++++++-------- matq.py | 10 +++++----- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/cli.py b/cli.py index 96eae8b..dece358 100644 --- a/cli.py +++ b/cli.py @@ -80,7 +80,7 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s args.nsamples ) if args.save: - save_lr_tensors(lr_tensors, f"outputs/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors") + save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors") ppl = opt_eval(target_model, loader_enc, args, target_model.device) logger.info(f"Perplexity: {ppl}") \ No newline at end of file diff --git a/core_compression.py b/core_compression.py index f69cbba..77098ea 100644 --- a/core_compression.py +++ b/core_compression.py @@ -55,6 +55,7 @@ def forward(self, inp, **kwargs): except ValueError: pass layers[0] = layers[0].module + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: @@ -108,10 +109,10 @@ def temp(_, inp, out): outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - + for h in handles: h.remove() - + for name in subset: logger.info(f"Quantizing {name}...") lr_gptq[name].lr_quant( diff --git a/decomposition.py b/decomposition.py index ec531af..d5068b8 100644 --- a/decomposition.py +++ b/decomposition.py @@ -3,15 +3,14 @@ from loguru import logger def pca_decomposition(matrix, rank): - U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=5) + U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=500) return U, torch.diag_embed(S) @ Vh.T + def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5): # Initialize random matrices U and V m, n = matrix.shape - U = torch.rand(m, rank) - V = torch.rand(rank, n) - U.to(matrix.device) - V.to(matrix.device) + U = torch.rand(m, rank, device=matrix.device) + V = torch.rand(rank, n, device=matrix.device) tick = time.time() early_stop = False for i in range(max_iterations): @@ -26,6 +25,7 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, if torch.norm(difference) < tolerance: early_stop = True break + if not early_stop: logger.warning(f"Low rank decomposition did not converge. Elapsed time: {time.time() - tick}") else: @@ -33,9 +33,7 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, return U, V if __name__=="__main__": - matrix = torch.rand((2048,2048)) - print("Original matrix:") - print(matrix) + matrix = torch.rand((128,128)) rank = 32 U, V = low_rank_decomposition( matrix, diff --git a/matq.py b/matq.py index 030d06f..13dbbf5 100644 --- a/matq.py +++ b/matq.py @@ -92,14 +92,14 @@ def decompose(self): self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-6, max_iterations=100000) logger.info(f"decomposition done. elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}") - def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, rank=32): - self.lr_quant_R(blocksize, percdamp, groupsize, actorder, rank) - self.lr_quant_L(blocksize, percdamp, groupsize, actorder, rank) + def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False): + self.lr_quant_R(blocksize, percdamp, groupsize, actorder) + self.lr_quant_L(blocksize, percdamp, groupsize, actorder) # restored weight is L@R # but on disk we only save L, R self.layer.weight.data = (self.L @ self.R).reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) - def lr_quant_R(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, rank=32): + def lr_quant_R(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False): R = self.R.data.clone() if isinstance(self.layer, nn.Conv2d): R = R.flatten(1) @@ -186,7 +186,7 @@ def lr_quant_R(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, Q_R = Q_R.t() self.R = Q_R.reshape(self.R.shape).to(self.R.dtype) - def lr_quant_L(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, rank=32): + def lr_quant_L(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False): L = self.L.data.clone() if isinstance(self.layer, nn.Conv2d): L = L.flatten(1) From 5f00d37f4949a9fb3f99d4ec5e022dc44d72a29d Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Tue, 9 May 2023 18:43:31 +0000 Subject: [PATCH 03/23] updating decomposition --- decomposition.py | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/decomposition.py b/decomposition.py index d5068b8..5c90042 100644 --- a/decomposition.py +++ b/decomposition.py @@ -2,26 +2,29 @@ import time from loguru import logger -def pca_decomposition(matrix, rank): - U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=500) +def pca_decomposition(matrix, rank, niter=500): + U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=niter) return U, torch.diag_embed(S) @ Vh.T def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5): # Initialize random matrices U and V - m, n = matrix.shape - U = torch.rand(m, rank, device=matrix.device) - V = torch.rand(rank, n, device=matrix.device) + # m, n = matrix.shape + # let's choose a good start point? + # L, R = pca_decomposition(matrix, rank) + # random seems to work better generally + L = torch.rand((matrix.shape[0], rank)) + R = torch.rand((rank, matrix.shape[1])) tick = time.time() early_stop = False for i in range(max_iterations): # Calculate the difference between the original and reconstructed matrices - difference = matrix - U @ V + difference = matrix - L @ R # Calculate the gradients - gradient_U = -2 * (difference @ V.T) - gradient_V = -2 * (U.T @ difference) - U -= learning_rate * gradient_U - V -= learning_rate * gradient_V + gradient_L = -2 * (difference @ R.T) + gradient_R = -2 * (L.T @ difference) + L -= learning_rate * gradient_L + R -= learning_rate * gradient_R if torch.norm(difference) < tolerance: early_stop = True break @@ -30,26 +33,22 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, logger.warning(f"Low rank decomposition did not converge. Elapsed time: {time.time() - tick}") else: logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}") - return U, V + return L, R if __name__=="__main__": - matrix = torch.rand((128,128)) - rank = 32 - U, V = low_rank_decomposition( + matrix = torch.rand((16,16)) + rank = 4 + L, R = low_rank_decomposition( matrix, rank, learning_rate=1e-6, max_iterations=100000, ) - U_pca, V_pca = pca_decomposition(matrix, rank) - reconstructed_matrix_pca = U_pca @ V_pca + L_pca, R_pca = pca_decomposition(matrix, rank) + reconstructed_matrix_pca = L_pca @ R_pca - # print("U:") - # print(U) - # print("V:") - # print(V) + reconstructed_matrix = L @ R - reconstructed_matrix = U @ V print("Reconstructed matrix:") print(reconstructed_matrix) From b080b3c6934e67d349a2d509ebd6b9a0287c699b Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Tue, 9 May 2023 20:16:25 +0000 Subject: [PATCH 04/23] testing on different decomposition --- core_compression.py | 4 +- core_compression_parallel.py | 143 +++++++++++++++++++++++++++++++++++ decomposition.py | 17 +++-- scripts/lr_quant.sh | 8 +- scripts/playground.ipynb | 53 ++++++++++++- to_hf.py | 2 +- 6 files changed, 211 insertions(+), 16 deletions(-) create mode 100644 core_compression_parallel.py diff --git a/core_compression.py b/core_compression.py index 77098ea..7d6b1f8 100644 --- a/core_compression.py +++ b/core_compression.py @@ -1,5 +1,6 @@ import torch import torch.nn as nn +from tqdm import tqdm from matq import TensorQ from loguru import logger from quant import Quantizer @@ -72,7 +73,8 @@ def forward(self, inp, **kwargs): quantizers = {} l_quantizers = {} lr_tensors = {} - for i in range(len(delta_layers)): + # parallelize this to allocate to multiple GPUs + for i in tqdm(range(len(delta_layers))): layer = delta_layers[i].to(device) original_layer = layers[i].to(device) diff --git a/core_compression_parallel.py b/core_compression_parallel.py new file mode 100644 index 0000000..fca3c7d --- /dev/null +++ b/core_compression_parallel.py @@ -0,0 +1,143 @@ +import torch +import torch.nn as nn +from tqdm import tqdm +from matq import TensorQ +from loguru import logger +from quant import Quantizer +from modelutils import find_layers +import multiprocessing as mp +@torch.no_grad() +def opt_delta_lr( + model, + delta_model, + dataloader, + nsamples, + wbits, + sym, + trits, + rank, + args + ): + device = model.device + print("Starting LR quantizer initialization...") + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.decoder.layers + delta_layers = delta_model.model.decoder.layers + + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(device) + model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(device) + + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.to(device) + if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.to(device) + layers[0] = layers[0].to(device) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=device + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(device)) + except ValueError: + pass + layers[0] = layers[0].module + + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() + model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.cpu() + if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + logger.info("Ready, creating lr quantizers...") + quantizers = {} + l_quantizers = {} + lr_tensors = {} + # parallelize this to allocate to multiple GPUs + def process_layer(i, device): + layer = delta_layers[i].to(device) + original_layer = layers[i].to(device) + subset = find_layers(layer) + lr_gptq = {} + for name in subset: + lr_gptq[name] = TensorQ(subset[name], rank) + lr_gptq[name].quantizer = Quantizer() + lr_gptq[name].quantizer.configure( + wbits, + perchannel=True, + sym=sym, + mse=False, + trits = trits, + ) + lr_gptq[name].l_quantizer = Quantizer() + lr_gptq[name].l_quantizer.configure( + wbits, + perchannel=True, + sym=sym, + mse=False, + trits = trits, + ) + def add_batch(name): + def temp(_, inp, out): + lr_gptq[name].add_batch_lr(inp[0].data, out.data) + return temp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + for h in handles: + h.remove() + + for name in subset: + logger.info(f"Quantizing {name}...") + lr_gptq[name].lr_quant( + percdamp = args['percdamp'], + groupsize = args['groupsize'], + actorder = args['actorder'], + ) + lr_tensors[f'.model.decoder.layers.{i}.{name}'] = lr_gptq[name].R + lr_tensors[f'.model.decoder.layers.{i}.{name}'] = lr_gptq[name].L + + quantizers[f'model.decoder.layers.{i}.{name}'] = lr_gptq[name].quantizer + l_quantizers[f'model.decoder.layers.{i}.{name}'] = lr_gptq[name].l_quantizer + lr_gptq[name].free() + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + del lr_gptq + torch.cuda.empty_cache() + inps, outs = original_outs, inps + num_workers = torch.cuda.device_count() + logger.info(f"Using {num_workers} workers...") + with mp.Pool(num_workers) as p: + p.starmap(process_layer, [(i, f'cuda:{i}') for i in range(num_workers)]) + + model.config.use_cache = use_cache + return quantizers, l_quantizers, lr_tensors \ No newline at end of file diff --git a/decomposition.py b/decomposition.py index 5c90042..b2df2ef 100644 --- a/decomposition.py +++ b/decomposition.py @@ -2,9 +2,9 @@ import time from loguru import logger -def pca_decomposition(matrix, rank, niter=500): - U, S, Vh = torch.pca_lowrank(matrix, q=rank, center=True, niter=niter) - return U, torch.diag_embed(S) @ Vh.T +def svd_decomposition(matrix, rank, niter=500): + U, S, Vh = torch.svd_lowrank(matrix, q=rank) + return U @ torch.diag_embed(S), Vh.T def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5): # Initialize random matrices U and V @@ -36,21 +36,26 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, return L, R if __name__=="__main__": - matrix = torch.rand((16,16)) - rank = 4 + matrix = torch.rand((1024,1024)) + # matrix = torch.tensor([[1., 2., 3., 4.],[5., 6., 7., 8.],[9., 10., 11., 12.],[13., 14., 15., 16.]]) + rank = 32 + print("Original matrix:") + print(matrix) L, R = low_rank_decomposition( matrix, rank, learning_rate=1e-6, max_iterations=100000, ) - L_pca, R_pca = pca_decomposition(matrix, rank) + L_pca, R_pca = svd_decomposition(matrix, rank) reconstructed_matrix_pca = L_pca @ R_pca reconstructed_matrix = L @ R print("Reconstructed matrix:") print(reconstructed_matrix) + print("Reconstructed matrix (pca):") + print(reconstructed_matrix_pca) print("difference:") print(torch.norm(matrix - reconstructed_matrix)) diff --git a/scripts/lr_quant.sh b/scripts/lr_quant.sh index 9503a9b..6df8ed9 100644 --- a/scripts/lr_quant.sh +++ b/scripts/lr_quant.sh @@ -3,7 +3,7 @@ python cli.py \ --target-model lnair/opt-1.3b-wikitext2 \ --base-model facebook/opt-1.3b \ --delta \ - --rank 32 \ + --rank 128 \ --save outputs/ \ --nsamples 128 \ --wbits 8 @@ -13,7 +13,7 @@ python cli.py \ --target-model lnair/opt-1.3b-wikitext2 \ --base-model facebook/opt-1.3b \ --delta \ - --rank 128 \ + --rank 256 \ --save outputs/ \ --nsamples 128 \ --wbits 8 @@ -23,7 +23,7 @@ python cli.py \ --target-model lnair/opt-1.3b-wikitext2 \ --base-model facebook/opt-1.3b \ --delta \ - --rank 256 \ + --rank 128 \ --save outputs/ \ --nsamples 128 \ - --wbits 8 \ No newline at end of file + --wbits 4 \ No newline at end of file diff --git a/scripts/playground.ipynb b/scripts/playground.ipynb index 748f8cb..a6648c0 100644 --- a/scripts/playground.ipynb +++ b/scripts/playground.ipynb @@ -2,22 +2,67 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ + "import sys\n", "seed=42\n", "target_model_name = \"lnair/opt-1.3b-wikitext2\"\n", "base_model_name = \"facebook/opt-1.3b\"\n", "n_samples = 128\n", - "dataset = 'wikitext2'" + "dataset = 'wikitext2'\n", + "sys.path.append('..')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/xiayao/miniconda3/envs/fmzip/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "data": { + "text/plain": [ + "OPTForCausalLM(\n", + " (model): OPTModel(\n", + " (decoder): OPTDecoder(\n", + " (embed_tokens): Embedding(50272, 2048, padding_idx=1)\n", + " (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)\n", + " (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", + " (layers): ModuleList(\n", + " (0-23): 24 x OPTDecoderLayer(\n", + " (self_attn): OPTAttention(\n", + " (k_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", + " (v_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", + " (q_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", + " (out_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", + " )\n", + " (activation_fn): ReLU()\n", + " (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", + " (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n", + " (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n", + " (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (lm_head): Linear(in_features=2048, out_features=50272, bias=False)\n", + ")" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from modelutils import get_opt\n", "base_model = get_opt(base_model_name)\n", diff --git a/to_hf.py b/to_hf.py index 10e0be6..e47318e 100644 --- a/to_hf.py +++ b/to_hf.py @@ -12,7 +12,7 @@ from copy import deepcopy target_model = deepcopy(base_model) -tensors = load_lr_tensors("outputs/model.safetensors") +tensors = load_lr_tensors("outputs/lnair.opt-1.3b-wikitext2-r32-w8-lr.safetensors") target_layers = target_model.model.decoder.layers From 02eee606d694b0f65798f9c89146577272d5fcc5 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Wed, 10 May 2023 06:58:44 +0000 Subject: [PATCH 05/23] now decomposition takes input into account --- decomposition.py | 75 +++++++++++++++++++++++++++---------------- lr_only.py | 0 scripts/lr_quant_2.sh | 29 +++++++++++++++++ to_hf.py | 8 ++--- 4 files changed, 81 insertions(+), 31 deletions(-) create mode 100644 lr_only.py create mode 100644 scripts/lr_quant_2.sh diff --git a/decomposition.py b/decomposition.py index b2df2ef..535f684 100644 --- a/decomposition.py +++ b/decomposition.py @@ -3,32 +3,47 @@ from loguru import logger def svd_decomposition(matrix, rank, niter=500): - U, S, Vh = torch.svd_lowrank(matrix, q=rank) + U, S, Vh = torch.pca_lowrank(matrix, q=rank) return U @ torch.diag_embed(S), Vh.T -def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5): +def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5, input_matrix = None): # Initialize random matrices U and V # m, n = matrix.shape # let's choose a good start point? # L, R = pca_decomposition(matrix, rank) # random seems to work better generally - L = torch.rand((matrix.shape[0], rank)) - R = torch.rand((rank, matrix.shape[1])) + L = torch.rand((matrix.shape[0], rank), device=matrix.device) + R = torch.rand((rank, matrix.shape[1]), device=matrix.device) + tick = time.time() early_stop = False - for i in range(max_iterations): - # Calculate the difference between the original and reconstructed matrices - difference = matrix - L @ R - - # Calculate the gradients - gradient_L = -2 * (difference @ R.T) - gradient_R = -2 * (L.T @ difference) - L -= learning_rate * gradient_L - R -= learning_rate * gradient_R - if torch.norm(difference) < tolerance: - early_stop = True - break - + if input_matrix is None: + for i in range(max_iterations): + # Calculate the difference between the original and reconstructed matrices + difference = matrix - L @ R + + # Calculate the gradients + gradient_L = -2 * (difference @ R.T) + gradient_R = -2 * (L.T @ difference) + L -= learning_rate * gradient_L + R -= learning_rate * gradient_R + if torch.norm(difference) < tolerance: + early_stop = True + break + else: + W = matrix + X = input_matrix + for i in range(max_iterations): + + gradient_L = -2 * (matrix@X - L @ (R @ X)) @ ((R@X).T) + # gradient_L = -2 * np.dot((WX - np.dot(U, np.dot(V, X))), np.dot(V, X).T) + gradient_R = -2 * (L.T @ (W@X - L @ (R @ X))) + # gradient_V = -2 * np.dot(U.T, (WX - np.dot(U, np.dot(V, X)))) + L -= learning_rate * gradient_L + R -= learning_rate * gradient_R + if torch.norm(W@X - L @ (R @ X)) < tolerance: + early_stop = True + break if not early_stop: logger.warning(f"Low rank decomposition did not converge. Elapsed time: {time.time() - tick}") else: @@ -38,25 +53,31 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, if __name__=="__main__": matrix = torch.rand((1024,1024)) # matrix = torch.tensor([[1., 2., 3., 4.],[5., 6., 7., 8.],[9., 10., 11., 12.],[13., 14., 15., 16.]]) - rank = 32 - print("Original matrix:") + input_matrix = torch.rand((1024,16)) + print(matrix) + rank = 32 + print("Original output:") + original_output = matrix @ input_matrix + print(original_output) L, R = low_rank_decomposition( matrix, rank, learning_rate=1e-6, max_iterations=100000, ) - L_pca, R_pca = svd_decomposition(matrix, rank) - reconstructed_matrix_pca = L_pca @ R_pca + # L_pca, R_pca = svd_decomposition(matrix, rank) + # reconstructed_matrix_pca = L_pca @ R_pca + print(L.shape) + print(R.shape) - reconstructed_matrix = L @ R + reconstructed_matrix = L @ R @ input_matrix - print("Reconstructed matrix:") + print("Reconstructed output:") print(reconstructed_matrix) - print("Reconstructed matrix (pca):") - print(reconstructed_matrix_pca) + # print("Reconstructed matrix (pca):") + # print(reconstructed_matrix_pca) print("difference:") - print(torch.norm(matrix - reconstructed_matrix)) - print(torch.norm(matrix - reconstructed_matrix_pca)) \ No newline at end of file + print(torch.norm(original_output - reconstructed_matrix)) + #print(torch.norm(matrix - reconstructed_matrix_pca)) \ No newline at end of file diff --git a/lr_only.py b/lr_only.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/lr_quant_2.sh b/scripts/lr_quant_2.sh new file mode 100644 index 0000000..19865ec --- /dev/null +++ b/scripts/lr_quant_2.sh @@ -0,0 +1,29 @@ +python cli.py \ + --dataset wikitext2 \ + --target-model lnair/opt-1.3b-wikitext2 \ + --base-model facebook/opt-1.3b \ + --delta \ + --rank 512 \ + --save outputs/ \ + --nsamples 128 \ + --wbits 8 + +python cli.py \ + --dataset wikitext2 \ + --target-model lnair/opt-1.3b-wikitext2 \ + --base-model facebook/opt-1.3b \ + --delta \ + --rank 512 \ + --save outputs/ \ + --nsamples 128 \ + --wbits 4 + +python cli.py \ + --dataset wikitext2 \ + --target-model lnair/opt-1.3b-wikitext2 \ + --base-model facebook/opt-1.3b \ + --delta \ + --rank 1024 \ + --save outputs/ \ + --nsamples 128 \ + --wbits 8 \ No newline at end of file diff --git a/to_hf.py b/to_hf.py index e47318e..155b349 100644 --- a/to_hf.py +++ b/to_hf.py @@ -11,8 +11,8 @@ from modelutils import find_layers from copy import deepcopy target_model = deepcopy(base_model) - -tensors = load_lr_tensors("outputs/lnair.opt-1.3b-wikitext2-r32-w8-lr.safetensors") +MODEL_ID = "lnair.opt-1.3b-wikitext2-r128-w8-lr" +tensors = load_lr_tensors(f"outputs/{MODEL_ID}.safetensors") target_layers = target_model.model.decoder.layers @@ -27,6 +27,6 @@ layer[layer_id].weight.data = new_weight # save target model as HF -target_model.save_pretrained("outputs/lnair-opt-1.3b-wikitext2-r32-w8") +target_model.save_pretrained(f"outputs/{MODEL_ID}") tokenizer = AutoTokenizer.from_pretrained(base_model_name) -tokenizer.save_pretrained("outputs/lnair-opt-1.3b-wikitext2-r32-w8") \ No newline at end of file +tokenizer.save_pretrained(f"outputs/{MODEL_ID}") \ No newline at end of file From 1eb5a11bc7f1174f6640cbbafc8fb03e4e41fd20 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Wed, 10 May 2023 08:09:18 +0000 Subject: [PATCH 06/23] better decomposition --- core_compression.py | 2 +- decomposition.py | 59 ++++++++++++++++++++++++++++++++------------- gptq.py | 2 +- lr_only.py | 42 ++++++++++++++++++++++++++++++++ matq.py | 15 +++++++----- scripts/lr_quant.sh | 24 ++---------------- 6 files changed, 97 insertions(+), 47 deletions(-) diff --git a/core_compression.py b/core_compression.py index 7d6b1f8..93fb548 100644 --- a/core_compression.py +++ b/core_compression.py @@ -73,7 +73,7 @@ def forward(self, inp, **kwargs): quantizers = {} l_quantizers = {} lr_tensors = {} - # parallelize this to allocate to multiple GPUs + # parallelize this to allocate to multiple GPUs? for i in tqdm(range(len(delta_layers))): layer = delta_layers[i].to(device) original_layer = layers[i].to(device) diff --git a/decomposition.py b/decomposition.py index 535f684..a183566 100644 --- a/decomposition.py +++ b/decomposition.py @@ -1,5 +1,6 @@ -import torch import time +from tqdm import tqdm +import torch from loguru import logger def svd_decomposition(matrix, rank, niter=500): @@ -34,10 +35,10 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, W = matrix X = input_matrix for i in range(max_iterations): - - gradient_L = -2 * (matrix@X - L @ (R @ X)) @ ((R@X).T) + diff = W@X - L @ R @ X + gradient_L = -2 * (diff @ ((R@X).T)) # gradient_L = -2 * np.dot((WX - np.dot(U, np.dot(V, X))), np.dot(V, X).T) - gradient_R = -2 * (L.T @ (W@X - L @ (R @ X))) + gradient_R = -2 * (L.T @ diff @ X.T) # gradient_V = -2 * np.dot(U.T, (WX - np.dot(U, np.dot(V, X)))) L -= learning_rate * gradient_L R -= learning_rate * gradient_R @@ -50,13 +51,26 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}") return L, R +def torch_autograd(W, X, rank, lr, steps): + L = torch.rand((W.shape[0], rank), device=W.device, requires_grad=True) + R = torch.rand((rank, W.shape[1]), device=W.device, requires_grad=True) + optimizer = torch.optim.SGD([L, R], lr=lr) + for _ in tqdm(range(steps)): + optimizer.zero_grad() + output = L @ R @ X + target = W @ X + loss = torch.nn.functional.mse_loss(output, target) + loss.backward() + optimizer.step() + return L, R + if __name__=="__main__": - matrix = torch.rand((1024,1024)) - # matrix = torch.tensor([[1., 2., 3., 4.],[5., 6., 7., 8.],[9., 10., 11., 12.],[13., 14., 15., 16.]]) - input_matrix = torch.rand((1024,16)) + #matrix = torch.rand((1024,1024)) + matrix = torch.rand((16, 16)) + input_matrix = torch.rand((16,2)) print(matrix) - rank = 32 + rank = 4 print("Original output:") original_output = matrix @ input_matrix print(original_output) @@ -64,20 +78,31 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, matrix, rank, learning_rate=1e-6, - max_iterations=100000, + max_iterations=1000, + input_matrix=input_matrix ) - # L_pca, R_pca = svd_decomposition(matrix, rank) - # reconstructed_matrix_pca = L_pca @ R_pca - print(L.shape) - print(R.shape) + L_noinput, R_noinput = low_rank_decomposition( + matrix, + rank, + learning_rate=1e-6, + max_iterations=1000, + ) + L_autograd, R_autograd = torch_autograd(matrix, input_matrix, rank, 1e-6, 1000) + # # L_pca, R_pca = svd_decomposition(matrix, rank) + # # reconstructed_matrix_pca = L_pca @ R_pca - reconstructed_matrix = L @ R @ input_matrix + reconstructed_matrix = L @ R @ input_matrix + reconstructed_matrix_pca = L_autograd @ R_autograd @ input_matrix + reconstructed_matrix_noinput = L_noinput @ R_noinput @ input_matrix print("Reconstructed output:") print(reconstructed_matrix) - # print("Reconstructed matrix (pca):") - # print(reconstructed_matrix_pca) + print("Reconstructed matrix (autograd):") + print(reconstructed_matrix_pca) + print("Reconstructed matrix (noinput):") + print(reconstructed_matrix_noinput) print("difference:") print(torch.norm(original_output - reconstructed_matrix)) - #print(torch.norm(matrix - reconstructed_matrix_pca)) \ No newline at end of file + print(torch.norm(original_output - reconstructed_matrix_pca)) + print(torch.norm(original_output - reconstructed_matrix_noinput)) \ No newline at end of file diff --git a/gptq.py b/gptq.py index 2477cac..8f719e1 100644 --- a/gptq.py +++ b/gptq.py @@ -152,4 +152,4 @@ def free(self): self.H = None self.Losses = None self.Trace = None - torch.cuda.empty_cache() + torch.cuda.empty_cache() \ No newline at end of file diff --git a/lr_only.py b/lr_only.py index e69de29..dc392c2 100644 --- a/lr_only.py +++ b/lr_only.py @@ -0,0 +1,42 @@ +import copy +import torch +import argparse +import torch.nn as nn +from loguru import logger +from evaluation import opt_eval +from datautils import get_loaders +from core_compression import opt_delta_lr +from modelutils import get_opt, find_layers +from save_and_load import save_lr_tensors, load_lr_tensors + +@torch.no_grad() +def lowrank_decomposition(model, rank, n_samples, data_loader=None): + lr_iopairs = {} + + def add_batch(name): + def temp(_, inp, out): + lr_iopairs[name] = (inp, out) + return temp + layers = model.model.decoder.layers + inps = torch.zeros( + (n_samples, model.seqlen, model.config.hidden_size), dtype=torch.fp16, device=model.device + ) + handles = [] + for i in range(len(layers)): + subset = find_layers(layers[i]) + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(f"decoder.layers.{i}.{name}"))) + layer_id = f"decoder.layers.{i}.{name}" + decomposing_layer = subset[name].weight + # decompose this into low rank matrices + +if __name__=="__main__": + base_model = get_opt('facebook/opt-1.3b') + trainloader, loader_enc = get_loaders( + 'wikitext2', + nsamples = 128, + seed=42, + model='facebook/opt-1.3b', + seqlen=base_model.seqlen, + ) + lowrank_decomposition(base_model, 32, 128, trainloader) \ No newline at end of file diff --git a/matq.py b/matq.py index 13dbbf5..f78aa35 100644 --- a/matq.py +++ b/matq.py @@ -22,19 +22,19 @@ def __init__(self, layer, rank=32): if isinstance(self.layer, transformers.Conv1D): W = W.t() self.rank = rank - self.decompose() + # self.decompose() self.rows = W.shape[0] self.columns = W.shape[1] - self.L_columns = self.L.shape[1] + self.L_columns = rank self.H = torch.zeros((self.columns, self.columns), device=self.dev) self.H_R = torch.zeros((self.columns, self.columns), device=self.dev) self.H_L = torch.zeros((self.L_columns, self.L_columns), device=self.dev) self.nsamples = 0 def add_batch_lr(self, inp, out): - if DEBUG: - self.inp1 = inp - self.out1 = out + #if DEBUG: + # self.inp1 = inp + # self.out1 = out if len(inp.shape) == 2: inp = inp.unsqueeze(0) tmp = inp.shape[0] @@ -54,8 +54,9 @@ def add_batch_lr(self, inp, out): inp = inp.flatten(1) self.H_R *= self.nsamples / (self.nsamples + tmp) self.nsamples += tmp - inp = math.sqrt(2 / self.nsamples) * inp.float() + inp = math.sqrt(2 / self.nsamples) * inp.float() + self.inp = inp self.H_R += inp.matmul(inp.t()) # logger.info(f"self.H_R: {self.H_R.shape}") # for L, consider the input to be R@X @@ -76,6 +77,7 @@ def free(self): torch.cuda.empty_cache() def decompose(self): + print(self.inp.shape) W = self.layer.weight.data.clone() if isinstance(self.layer, nn.Conv2d): W = W.flatten(1) @@ -93,6 +95,7 @@ def decompose(self): logger.info(f"decomposition done. elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}") def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False): + self.decompose() self.lr_quant_R(blocksize, percdamp, groupsize, actorder) self.lr_quant_L(blocksize, percdamp, groupsize, actorder) # restored weight is L@R diff --git a/scripts/lr_quant.sh b/scripts/lr_quant.sh index 6df8ed9..b716288 100644 --- a/scripts/lr_quant.sh +++ b/scripts/lr_quant.sh @@ -3,27 +3,7 @@ python cli.py \ --target-model lnair/opt-1.3b-wikitext2 \ --base-model facebook/opt-1.3b \ --delta \ - --rank 128 \ + --rank 16 \ --save outputs/ \ --nsamples 128 \ - --wbits 8 - -python cli.py \ - --dataset wikitext2 \ - --target-model lnair/opt-1.3b-wikitext2 \ - --base-model facebook/opt-1.3b \ - --delta \ - --rank 256 \ - --save outputs/ \ - --nsamples 128 \ - --wbits 8 - -python cli.py \ - --dataset wikitext2 \ - --target-model lnair/opt-1.3b-wikitext2 \ - --base-model facebook/opt-1.3b \ - --delta \ - --rank 128 \ - --save outputs/ \ - --nsamples 128 \ - --wbits 4 \ No newline at end of file + --wbits 8 \ No newline at end of file From 31e72198bb4a87160c7d0208ba67a42536f94baa Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Wed, 10 May 2023 10:16:27 +0000 Subject: [PATCH 07/23] better decomposition --- cli.py | 12 ++++++--- core_compression.py | 8 +++--- decomposition.py | 10 ++------ matq.py | 58 ++++++++++++++++--------------------------- scripts/lr_quant.sh | 11 ++++++++ scripts/lr_quant_2.sh | 15 +++-------- to_hf.py | 2 +- 7 files changed, 51 insertions(+), 65 deletions(-) diff --git a/cli.py b/cli.py index dece358..2159fb5 100644 --- a/cli.py +++ b/cli.py @@ -10,7 +10,7 @@ from core_compression import opt_delta_lr @torch.no_grad() -def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_samples): +def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_samples, decompose_only=False): # first do low rank approximation # then quantize original_finetuned_model = copy.deepcopy(target_model) @@ -29,7 +29,8 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s 'percdamp': 0.01, 'groupsize': -1, 'actorder': False, - } + }, + decompose_only=decompose_only ) target_model.to(base_model.device) @@ -53,9 +54,11 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s argparser.add_argument('--save', type=str, default='', help='Path to save the quantized model') argparser.add_argument('--wbits', type=int, default=8, help='Number of bits to use for quantization') argparser.add_argument('--sym', action='store_true', default=True, help='Whether to use symmetric quantization') + argparser.add_argument('--decompose-only', action='store_true', default=False, help='Whether to use quantization') argparser.add_argument('--trits', action='store_true', default=False, help='Whether to use trits') args = argparser.parse_args() + print(args) seed = args.seed base_model = get_opt(args.base_model) @@ -77,10 +80,11 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s trainloader, args.rank, args.wbits, - args.nsamples + args.nsamples, + args.decompose_only, ) if args.save: - save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors") + save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-decompose.{args.decompose_only}-lr.safetensors") ppl = opt_eval(target_model, loader_enc, args, target_model.device) logger.info(f"Perplexity: {ppl}") \ No newline at end of file diff --git a/core_compression.py b/core_compression.py index 93fb548..6e79ba4 100644 --- a/core_compression.py +++ b/core_compression.py @@ -16,7 +16,8 @@ def opt_delta_lr( sym, trits, rank, - args + args, + decompose_only=False, ): device = model.device print("Starting LR quantizer initialization...") @@ -49,6 +50,7 @@ def forward(self, inp, **kwargs): cache['i'] += 1 cache['attention_mask'] = kwargs['attention_mask'] raise ValueError + layers[0] = Catcher(layers[0]) for batch in dataloader: try: @@ -64,7 +66,6 @@ def forward(self, inp, **kwargs): if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: model.model.decoder.project_in = model.model.decoder.project_in.cpu() torch.cuda.empty_cache() - outs = torch.zeros_like(inps) original_outs = torch.zeros_like(inps) attention_mask = cache['attention_mask'] @@ -81,7 +82,7 @@ def forward(self, inp, **kwargs): subset = find_layers(layer) lr_gptq = {} for name in subset: - lr_gptq[name] = TensorQ(subset[name], rank) + lr_gptq[name] = TensorQ(subset[name], rank, sensitive_decompose=True) lr_gptq[name].quantizer = Quantizer() lr_gptq[name].quantizer.configure( wbits, @@ -121,6 +122,7 @@ def temp(_, inp, out): percdamp=args['percdamp'], groupsize=args['groupsize'], actorder=args['actorder'], + decompose_only=decompose_only, ) lr_tensors[f'.model.decoder.layers.{i}.{name}'] = lr_gptq[name].R lr_tensors[f'.model.decoder.layers.{i}.{name}'] = lr_gptq[name].L diff --git a/decomposition.py b/decomposition.py index a183566..4099728 100644 --- a/decomposition.py +++ b/decomposition.py @@ -1,6 +1,6 @@ import time -from tqdm import tqdm import torch +from tqdm import tqdm from loguru import logger def svd_decomposition(matrix, rank, niter=500): @@ -22,7 +22,6 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, for i in range(max_iterations): # Calculate the difference between the original and reconstructed matrices difference = matrix - L @ R - # Calculate the gradients gradient_L = -2 * (difference @ R.T) gradient_R = -2 * (L.T @ difference) @@ -37,18 +36,13 @@ def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, for i in range(max_iterations): diff = W@X - L @ R @ X gradient_L = -2 * (diff @ ((R@X).T)) - # gradient_L = -2 * np.dot((WX - np.dot(U, np.dot(V, X))), np.dot(V, X).T) gradient_R = -2 * (L.T @ diff @ X.T) - # gradient_V = -2 * np.dot(U.T, (WX - np.dot(U, np.dot(V, X)))) L -= learning_rate * gradient_L R -= learning_rate * gradient_R if torch.norm(W@X - L @ (R @ X)) < tolerance: early_stop = True break - if not early_stop: - logger.warning(f"Low rank decomposition did not converge. Elapsed time: {time.time() - tick}") - else: - logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}") + logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {torch.norm(W@X - L @ (R @ X))}") return L, R def torch_autograd(W, X, rank, lr, steps): diff --git a/matq.py b/matq.py index f78aa35..9a40374 100644 --- a/matq.py +++ b/matq.py @@ -13,7 +13,7 @@ torch.backends.cudnn.allow_tf32 = False class TensorQ: - def __init__(self, layer, rank=32): + def __init__(self, layer, rank=32, sensitive_decompose=False): self.layer = layer self.dev = self.layer.weight.device W = layer.weight.data.clone() @@ -22,7 +22,8 @@ def __init__(self, layer, rank=32): if isinstance(self.layer, transformers.Conv1D): W = W.t() self.rank = rank - # self.decompose() + if not sensitive_decompose: + self.decompose() self.rows = W.shape[0] self.columns = W.shape[1] self.L_columns = rank @@ -37,33 +38,24 @@ def add_batch_lr(self, inp, out): # self.out1 = out if len(inp.shape) == 2: inp = inp.unsqueeze(0) - tmp = inp.shape[0] - if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): + self.tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear): if len(inp.shape) == 3: inp = inp.reshape((-1, inp.shape[-1])) inp = inp.t() - if isinstance(self.layer, nn.Conv2d): - unfold = nn.Unfold( - self.layer.kernel_size, - dilation=self.layer.dilation, - padding=self.layer.padding, - stride=self.layer.stride - ) - inp = unfold(inp) - inp = inp.permute([1, 0, 2]) - inp = inp.flatten(1) - self.H_R *= self.nsamples / (self.nsamples + tmp) - self.nsamples += tmp - + + self.H_R *= self.nsamples / (self.nsamples + self.tmp) + self.nsamples += self.tmp inp = math.sqrt(2 / self.nsamples) * inp.float() self.inp = inp - self.H_R += inp.matmul(inp.t()) + + def calculate_hessian(self): + self.H_R += self.inp.matmul(self.inp.t()) # logger.info(f"self.H_R: {self.H_R.shape}") # for L, consider the input to be R@X - inp = self.R @ inp - self.H_L *= self.nsamples / (self.nsamples + tmp) - self.H_L += inp.matmul(inp.t()) - # logger.info(f"self.H_L: {self.H_L.shape}") + l_inp = self.R @ self.inp + self.H_L *= self.nsamples / (self.nsamples + self.tmp) + self.H_L += l_inp.matmul(l_inp.t()) def free(self): if DEBUG: @@ -77,28 +69,20 @@ def free(self): torch.cuda.empty_cache() def decompose(self): - print(self.inp.shape) W = self.layer.weight.data.clone() - if isinstance(self.layer, nn.Conv2d): - W = W.flatten(1) - if isinstance(self.layer, transformers.Conv1D): - W = W.t() W = W.float() logger.info("starting decomposition") tick = time.time() - # U, S, Vh = torch.pca_lowrank(W, q=self.rank, center=True, niter=5) - # # let's say L = U - # # and R = diag(S)*V.T - # self.L = U - # self.R = torch.diag_embed(S) @ Vh.T - self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-6, max_iterations=100000) + self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-6, max_iterations=10000, input_matrix=self.inp) logger.info(f"decomposition done. elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}") - def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False): + def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, decompose_only=False): self.decompose() - self.lr_quant_R(blocksize, percdamp, groupsize, actorder) - self.lr_quant_L(blocksize, percdamp, groupsize, actorder) - # restored weight is L@R + if not decompose_only: + self.calculate_hessian() + self.lr_quant_R(blocksize, percdamp, groupsize, actorder) + self.lr_quant_L(blocksize, percdamp, groupsize, actorder) + # restored weight is L@R, we overwrite the weight for evaluation if needed # but on disk we only save L, R self.layer.weight.data = (self.L @ self.R).reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) diff --git a/scripts/lr_quant.sh b/scripts/lr_quant.sh index b716288..2665e7b 100644 --- a/scripts/lr_quant.sh +++ b/scripts/lr_quant.sh @@ -6,4 +6,15 @@ python cli.py \ --rank 16 \ --save outputs/ \ --nsamples 128 \ + --wbits 8 + +python cli.py \ + --dataset wikitext2 \ + --target-model lnair/opt-1.3b-wikitext2 \ + --base-model facebook/opt-1.3b \ + --delta \ + --rank 16 \ + --save outputs/ \ + --nsamples 128 \ + --decompose-only \ --wbits 8 \ No newline at end of file diff --git a/scripts/lr_quant_2.sh b/scripts/lr_quant_2.sh index 19865ec..2250e30 100644 --- a/scripts/lr_quant_2.sh +++ b/scripts/lr_quant_2.sh @@ -3,9 +3,10 @@ python cli.py \ --target-model lnair/opt-1.3b-wikitext2 \ --base-model facebook/opt-1.3b \ --delta \ - --rank 512 \ + --rank 32 \ --save outputs/ \ --nsamples 128 \ + --decompose-only \ --wbits 8 python cli.py \ @@ -13,17 +14,7 @@ python cli.py \ --target-model lnair/opt-1.3b-wikitext2 \ --base-model facebook/opt-1.3b \ --delta \ - --rank 512 \ - --save outputs/ \ - --nsamples 128 \ - --wbits 4 - -python cli.py \ - --dataset wikitext2 \ - --target-model lnair/opt-1.3b-wikitext2 \ - --base-model facebook/opt-1.3b \ - --delta \ - --rank 1024 \ + --rank 32 \ --save outputs/ \ --nsamples 128 \ --wbits 8 \ No newline at end of file diff --git a/to_hf.py b/to_hf.py index 155b349..2deb4a5 100644 --- a/to_hf.py +++ b/to_hf.py @@ -11,7 +11,7 @@ from modelutils import find_layers from copy import deepcopy target_model = deepcopy(base_model) -MODEL_ID = "lnair.opt-1.3b-wikitext2-r128-w8-lr" +MODEL_ID = "lnair.opt-1.3b-wikitext2-r256-w8-lr" tensors = load_lr_tensors(f"outputs/{MODEL_ID}.safetensors") target_layers = target_model.model.decoder.layers From 22507fba91a67b6170d27cfc576643b61f222b62 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Wed, 10 May 2023 12:03:40 +0000 Subject: [PATCH 08/23] mse metric --- decomposition.py | 81 +++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 46 deletions(-) diff --git a/decomposition.py b/decomposition.py index 4099728..8d925be 100644 --- a/decomposition.py +++ b/decomposition.py @@ -2,47 +2,45 @@ import torch from tqdm import tqdm from loguru import logger +import torch.nn.functional as F -def svd_decomposition(matrix, rank, niter=500): +def svd_decomposition(matrix, rank): U, S, Vh = torch.pca_lowrank(matrix, q=rank) return U @ torch.diag_embed(S), Vh.T -def low_rank_decomposition(matrix, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5, input_matrix = None): - # Initialize random matrices U and V - # m, n = matrix.shape - # let's choose a good start point? - # L, R = pca_decomposition(matrix, rank) - # random seems to work better generally - L = torch.rand((matrix.shape[0], rank), device=matrix.device) - R = torch.rand((rank, matrix.shape[1]), device=matrix.device) - +def low_rank_decomposition(W, rank, learning_rate=0.01, max_iterations=500, tolerance=1e-5, X = None): + L = torch.rand((W.shape[0], rank), device=W.device) + R = torch.rand((rank, W.shape[1]), device=W.device) tick = time.time() early_stop = False - if input_matrix is None: + if X is None: for i in range(max_iterations): # Calculate the difference between the original and reconstructed matrices - difference = matrix - L @ R + diff_part1 = W + diff_part2 = L @ R + difference = W - L @ R # Calculate the gradients gradient_L = -2 * (difference @ R.T) gradient_R = -2 * (L.T @ difference) L -= learning_rate * gradient_L R -= learning_rate * gradient_R - if torch.norm(difference) < tolerance: + if F.mse_loss(diff_part1, diff_part2) < tolerance: early_stop = True break + logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(diff_part1, diff_part2)}") else: - W = matrix - X = input_matrix for i in range(max_iterations): - diff = W@X - L @ R @ X + diff_part1 = W@X + diff_part2 = L @ R @ X + diff = diff_part1 - diff_part2 gradient_L = -2 * (diff @ ((R@X).T)) gradient_R = -2 * (L.T @ diff @ X.T) L -= learning_rate * gradient_L R -= learning_rate * gradient_R - if torch.norm(W@X - L @ (R @ X)) < tolerance: + if F.mse_loss(diff_part1, diff_part2) < tolerance: early_stop = True break - logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {torch.norm(W@X - L @ (R @ X))}") + logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(diff_part1, diff_part2)}") return L, R def torch_autograd(W, X, rank, lr, steps): @@ -59,44 +57,35 @@ def torch_autograd(W, X, rank, lr, steps): return L, R if __name__=="__main__": - #matrix = torch.rand((1024,1024)) - matrix = torch.rand((16, 16)) - input_matrix = torch.rand((16,2)) + FULL_RANK = 128 + LOW_RANK = 16 + TARGET_SIZE = 2 + + W = torch.rand((FULL_RANK, FULL_RANK)) + input_matrix = torch.rand((FULL_RANK, TARGET_SIZE)) + output_matrix = W @ input_matrix - print(matrix) - rank = 4 print("Original output:") - original_output = matrix @ input_matrix - print(original_output) - L, R = low_rank_decomposition( - matrix, - rank, + print(output_matrix) + L_sensitive, R_sensitive = low_rank_decomposition( + W, + LOW_RANK, learning_rate=1e-6, max_iterations=1000, - input_matrix=input_matrix + X=input_matrix ) L_noinput, R_noinput = low_rank_decomposition( - matrix, - rank, + W, + LOW_RANK, learning_rate=1e-6, max_iterations=1000, ) - L_autograd, R_autograd = torch_autograd(matrix, input_matrix, rank, 1e-6, 1000) - # # L_pca, R_pca = svd_decomposition(matrix, rank) - # # reconstructed_matrix_pca = L_pca @ R_pca + L_autograd, R_autograd = torch_autograd(W, input_matrix, LOW_RANK, 1e-6, 1000) - - reconstructed_matrix = L @ R @ input_matrix + reconstructed_matrix = L_sensitive @ R_sensitive @ input_matrix reconstructed_matrix_pca = L_autograd @ R_autograd @ input_matrix reconstructed_matrix_noinput = L_noinput @ R_noinput @ input_matrix - print("Reconstructed output:") - print(reconstructed_matrix) - print("Reconstructed matrix (autograd):") - print(reconstructed_matrix_pca) - print("Reconstructed matrix (noinput):") - print(reconstructed_matrix_noinput) - print("difference:") - print(torch.norm(original_output - reconstructed_matrix)) - print(torch.norm(original_output - reconstructed_matrix_pca)) - print(torch.norm(original_output - reconstructed_matrix_noinput)) \ No newline at end of file + print(f"gd: {F.mse_loss(output_matrix, reconstructed_matrix)}") + print(f"autograd: {F.mse_loss(output_matrix, reconstructed_matrix_pca)}") + print(f"noinput gd: {F.mse_loss(output_matrix, reconstructed_matrix_noinput)}") From 005cb659edf1b086cb9267110245bcca6e9c914c Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Wed, 10 May 2023 14:30:36 +0000 Subject: [PATCH 09/23] more complex scheduler... --- decomposition.py | 67 ++++++++++++++++++++-------------------- matq.py | 4 +-- scripts/lr_quant_350m.sh | 10 ++++++ to_hf.py | 2 +- 4 files changed, 47 insertions(+), 36 deletions(-) create mode 100644 scripts/lr_quant_350m.sh diff --git a/decomposition.py b/decomposition.py index 8d925be..5663594 100644 --- a/decomposition.py +++ b/decomposition.py @@ -2,6 +2,7 @@ import torch from tqdm import tqdm from loguru import logger +from torch.optim.lr_scheduler import ExponentialLR import torch.nn.functional as F def svd_decomposition(matrix, rank): @@ -14,78 +15,78 @@ def low_rank_decomposition(W, rank, learning_rate=0.01, max_iterations=500, tole tick = time.time() early_stop = False if X is None: - for i in range(max_iterations): - # Calculate the difference between the original and reconstructed matrices - diff_part1 = W - diff_part2 = L @ R + for i in tqdm(range(max_iterations)): difference = W - L @ R - # Calculate the gradients gradient_L = -2 * (difference @ R.T) gradient_R = -2 * (L.T @ difference) L -= learning_rate * gradient_L R -= learning_rate * gradient_R - if F.mse_loss(diff_part1, diff_part2) < tolerance: + if F.mse_loss(W, L@R) < tolerance: early_stop = True break - logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(diff_part1, diff_part2)}") + logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(W, L@R)}") else: - for i in range(max_iterations): - diff_part1 = W@X - diff_part2 = L @ R @ X - diff = diff_part1 - diff_part2 + for i in tqdm(range(max_iterations)): + diff = W @ X - L @ R @ X gradient_L = -2 * (diff @ ((R@X).T)) gradient_R = -2 * (L.T @ diff @ X.T) L -= learning_rate * gradient_L R -= learning_rate * gradient_R - if F.mse_loss(diff_part1, diff_part2) < tolerance: + if F.mse_loss(W @ X, L @ R @ X) < tolerance: early_stop = True break - logger.info(f"Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(diff_part1, diff_part2)}") + # print(F.mse_loss(W @ X, L @ R @ X)) + logger.info(f"[With Input] Low rank decomposition done. Elapsed time: {time.time() - tick}. Early stop: {early_stop}, loss: {F.mse_loss(W@X, L@R@X)}") return L, R def torch_autograd(W, X, rank, lr, steps): L = torch.rand((W.shape[0], rank), device=W.device, requires_grad=True) R = torch.rand((rank, W.shape[1]), device=W.device, requires_grad=True) - optimizer = torch.optim.SGD([L, R], lr=lr) - for _ in tqdm(range(steps)): + optimizer = torch.optim.SGD([L, R], lr=lr, momentum=0.9) + scheduler = ExponentialLR(optimizer, gamma=0.9) + for j in tqdm(range(steps)): optimizer.zero_grad() output = L @ R @ X target = W @ X loss = torch.nn.functional.mse_loss(output, target) loss.backward() optimizer.step() + if j % 200 == 0: + scheduler.step() return L, R if __name__=="__main__": - FULL_RANK = 128 - LOW_RANK = 16 + FULL_RANK = 2048 + FULL_RANK_H = 1024 + FULL_RANK_W = 4096 + LOW_RANK = 32 TARGET_SIZE = 2 - W = torch.rand((FULL_RANK, FULL_RANK)) - input_matrix = torch.rand((FULL_RANK, TARGET_SIZE)) + W = torch.rand((FULL_RANK_W, FULL_RANK_H)) + input_matrix = torch.rand((FULL_RANK_H, TARGET_SIZE)) output_matrix = W @ input_matrix - print("Original output:") - print(output_matrix) L_sensitive, R_sensitive = low_rank_decomposition( W, LOW_RANK, - learning_rate=1e-6, - max_iterations=1000, + learning_rate=1e-9, + max_iterations=2000, X=input_matrix ) + reconstructed_matrix = L_sensitive @ R_sensitive @ input_matrix + print(f"reconstructed mse: gd: {F.mse_loss(output_matrix, reconstructed_matrix)}") + L_noinput, R_noinput = low_rank_decomposition( W, LOW_RANK, - learning_rate=1e-6, - max_iterations=1000, + learning_rate=1e-9, + max_iterations=2000, ) - L_autograd, R_autograd = torch_autograd(W, input_matrix, LOW_RANK, 1e-6, 1000) - - reconstructed_matrix = L_sensitive @ R_sensitive @ input_matrix - reconstructed_matrix_pca = L_autograd @ R_autograd @ input_matrix reconstructed_matrix_noinput = L_noinput @ R_noinput @ input_matrix - print("difference:") - print(f"gd: {F.mse_loss(output_matrix, reconstructed_matrix)}") - print(f"autograd: {F.mse_loss(output_matrix, reconstructed_matrix_pca)}") - print(f"noinput gd: {F.mse_loss(output_matrix, reconstructed_matrix_noinput)}") + print(f"reconstructed mse: gd. noinput gd: {F.mse_loss(output_matrix, reconstructed_matrix_noinput)}") + + L_autograd, R_autograd = torch_autograd(W, input_matrix, LOW_RANK, 1e-9, 2000) + reconstructed_matrix_pca = L_autograd @ R_autograd @ input_matrix + print(f"reconstructed mse: autograd: {F.mse_loss(output_matrix, reconstructed_matrix_pca)}") + + diff --git a/matq.py b/matq.py index 9a40374..11d80ac 100644 --- a/matq.py +++ b/matq.py @@ -7,7 +7,7 @@ from quant import quantize from decomposition import low_rank_decomposition -DEBUG = False +DEBUG = False torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False @@ -73,7 +73,7 @@ def decompose(self): W = W.float() logger.info("starting decomposition") tick = time.time() - self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-6, max_iterations=10000, input_matrix=self.inp) + self.L, self.R = low_rank_decomposition(W, self.rank, learning_rate=1e-10, max_iterations=5000, X=self.inp) logger.info(f"decomposition done. elapsed time: {time.time() - tick}, L: {self.L.shape}, R: {self.R.shape}") def lr_quant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, decompose_only=False): diff --git a/scripts/lr_quant_350m.sh b/scripts/lr_quant_350m.sh new file mode 100644 index 0000000..4710196 --- /dev/null +++ b/scripts/lr_quant_350m.sh @@ -0,0 +1,10 @@ +python cli.py \ + --dataset wikitext2 \ + --target-model lnair/opt-350m-wikitext2 \ + --base-model facebook/opt-350m \ + --delta \ + --rank 32 \ + --save outputs/ \ + --nsamples 128 \ + --decompose-only \ + --wbits 8 diff --git a/to_hf.py b/to_hf.py index 2deb4a5..f450de4 100644 --- a/to_hf.py +++ b/to_hf.py @@ -11,7 +11,7 @@ from modelutils import find_layers from copy import deepcopy target_model = deepcopy(base_model) -MODEL_ID = "lnair.opt-1.3b-wikitext2-r256-w8-lr" +MODEL_ID = "lnair.opt-1.3b-wikitext2-r32-w8-decompose.True-lr" tensors = load_lr_tensors(f"outputs/{MODEL_ID}.safetensors") target_layers = target_model.model.decoder.layers From 6440eb03aa6445605d80b6664fc488e88f7da15b Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Thu, 11 May 2023 00:42:48 +0000 Subject: [PATCH 10/23] add queuing jobs helper --- cli.py | 2 +- opt_delta.py | 18 +- opt_delta_fork.py | 662 ------------------------------------------ scripts/gptq_delta.sh | 68 +++++ submit.py | 29 ++ 5 files changed, 112 insertions(+), 667 deletions(-) delete mode 100644 opt_delta_fork.py create mode 100644 scripts/gptq_delta.sh create mode 100644 submit.py diff --git a/cli.py b/cli.py index eb1850d..dece358 100644 --- a/cli.py +++ b/cli.py @@ -81,6 +81,6 @@ def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_s ) if args.save: save_lr_tensors(lr_tensors, f"{args.save}/{args.target_model.replace('/', '.')}-r{args.rank}-w{args.wbits}-lr.safetensors") - + ppl = opt_eval(target_model, loader_enc, args, target_model.device) logger.info(f"Perplexity: {ppl}") \ No newline at end of file diff --git a/opt_delta.py b/opt_delta.py index f9f6bbf..bb55f7f 100644 --- a/opt_delta.py +++ b/opt_delta.py @@ -7,7 +7,7 @@ from gptq import * from modelutils import * from quant import * - +from transformers import AutoTokenizer, AutoModel import copy #from prettytable import PrettyTable @@ -425,7 +425,6 @@ def forward(self, *inp, **kwargs): def benchmark(model, input_ids, check=False): input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) torch.cuda.synchronize() - cache = {'past': None} def clear_past(i): def tmp(layer, inp, out): @@ -545,11 +544,18 @@ def main(args): if args.rank > 0: print("Number of params without low rank ", num_params) print("Number of params with low rank", num_params - num_params_saved_lr) - if args.save: + if args.save_hf: + if args.delta: + hf_path = f"outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz" + else: + hf_path = f"outputs/{args.model.replace('/', '.')}_{args.wbits}bits" + model.save_pretrained(hf_path) + tokenizer = AutoTokenizer.from_pretrained(args.model) + tokenizer.save_pretrained(hf_path) + else: opt_pack3(model, quantizers) torch.save(model.state_dict(), args.save) - if __name__ == '__main__': import argparse from datautils import * @@ -636,6 +642,10 @@ def main(args): '--sparsify_hard_threshold', action='store_true', help='Whether to add sparsity' ) + parser.add_argument( + '--save-hf', action='store_true', default=False, + help='Whether to save a huggingface model' + ) parser.add_argument( '--fraction_of_zero', type=float, default=0.99, help='Sparsity ratio' diff --git a/opt_delta_fork.py b/opt_delta_fork.py deleted file mode 100644 index 9a7710c..0000000 --- a/opt_delta_fork.py +++ /dev/null @@ -1,662 +0,0 @@ -import time - -import torch -import torch.nn as nn - -from gptq import * -from modelutils import * -from quant import * -import json -import pickle -import copy -#from prettytable import PrettyTable - -def get_opt(model): - import torch - def skip(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - from transformers import OPTForCausalLM - # model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto') - model = OPTForCausalLM.from_pretrained(model, torch_dtype=torch.float16) - model.seqlen = model.config.max_position_embeddings - return model - -def hard_threshold(x, fraction_of_zero=0.1): - y, _ = torch.sort(x.view(-1).abs().clone()) - num_params = torch.numel(x) - thresh_index = int(num_params * fraction_of_zero) - threshold = y[thresh_index] - mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) - return mask * x - -@torch.no_grad() -def opt_sequential_delta(model, delta_model, dataloader, dev): - print('Starting ...') - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.decoder.layers - delta_layers = delta_model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - raise ValueError - layers[0] = Catcher(layers[0]) - for batch in dataloader: - try: - model(batch[0].to(dev)) - except ValueError: - pass - layers[0] = layers[0].module - - layers[0] = layers[0].cpu() - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() - model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.cpu() - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - original_outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - - print('Ready.') - - quantizers = {} - for i in range(len(delta_layers)): - layer = delta_layers[i].to(dev) - original_layer = layers[i].to(dev) - - subset = find_layers(layer) - gptq = {} - for name in subset: - gptq[name] = GPTQ(subset[name]) - gptq[name].quantizer = Quantizer() - gptq[name].quantizer.configure( - args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits - ) - - def add_batch(name): - def tmp(_, inp, out): - gptq[name].add_batch(inp[0].data, out.data) - return tmp - handles = [] - for name in subset: - handles.append(subset[name].register_forward_hook(add_batch(name))) - - for j in range(args.nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - - original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - for h in handles: - h.remove() - - for name in subset: - print(i, name) - print('Quantizing ...') - gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) - quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer - gptq[name].free() - for j in range(args.nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - - layers[i] = layer.cpu() - del layer - del gptq - torch.cuda.empty_cache() - - inps, outs = original_outs, inps - - model.config.use_cache = use_cache - - return quantizers - -@torch.no_grad() -def opt_sequential(model, dataloader, dev): - print('Starting ...') - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - raise ValueError - layers[0] = Catcher(layers[0]) - for batch in dataloader: - try: - model(batch[0].to(dev)) - except ValueError: - pass - layers[0] = layers[0].module - - layers[0] = layers[0].cpu() - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() - model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.cpu() - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - - print('Ready.') - - quantizers = {} - for i in range(len(layers)): - layer = layers[i].to(dev) - - subset = find_layers(layer) - gptq = {} - for name in subset: - gptq[name] = GPTQ(subset[name]) - gptq[name].quantizer = Quantizer() - gptq[name].quantizer.configure( - args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits - ) - - def add_batch(name): - def tmp(_, inp, out): - gptq[name].add_batch(inp[0].data, out.data) - return tmp - handles = [] - for name in subset: - handles.append(subset[name].register_forward_hook(add_batch(name))) - for j in range(args.nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - for h in handles: - h.remove() - - for name in subset: - print(i, name) - print('Quantizing ...') - gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) - quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer - gptq[name].free() - for j in range(args.nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - - layers[i] = layer.cpu() - del layer - del gptq - torch.cuda.empty_cache() - - inps, outs = outs, inps - - model.config.use_cache = use_cache - - return quantizers - - -@torch.no_grad() -def opt_eval(model, testenc, dev): - print('Evaluating ...') - - testenc = testenc.input_ids - nsamples = testenc.numel() // model.seqlen - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - raise ValueError - layers[0] = Catcher(layers[0]) - for i in range(nsamples): - batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) - try: - model(batch) - except ValueError: - pass - layers[0] = layers[0].module - - layers[0] = layers[0].cpu() - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() - model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.cpu() - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - - for i in range(len(layers)): - # print(i) - layer = layers[i].to(dev) - - if args.nearest: - subset = find_layers(layer) - for name in subset: - quantizer = Quantizer() - quantizer.configure( - args.wbits, perchannel=True, sym=args.sym, mse=False - ) - W = subset[name].weight.data - quantizer.find_params(W, weight=True) - subset[name].weight.data = quantize( - W, quantizer.scale, quantizer.zero, quantizer.maxq - ).to(next(iter(layer.parameters())).dtype) - - for j in range(nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - layers[i] = layer.cpu() - del layer - torch.cuda.empty_cache() - inps, outs = outs, inps - - if model.model.decoder.final_layer_norm is not None: - model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev) - if model.model.decoder.project_out is not None: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - model.lm_head = model.lm_head.to(dev) - - testenc = testenc.to(dev) - nlls = [] - for i in range(nsamples): - hidden_states = inps[i].unsqueeze(0) - if model.model.decoder.final_layer_norm is not None: - hidden_states = model.model.decoder.final_layer_norm(hidden_states) - if model.model.decoder.project_out is not None: - hidden_states = model.model.decoder.project_out(hidden_states) - lm_logits = model.lm_head(hidden_states) - shift_logits = lm_logits[:, :-1, :].contiguous() - shift_labels = testenc[ - :, (i * model.seqlen):((i + 1) * model.seqlen) - ][:, 1:] - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - neg_log_likelihood = loss.float() * model.seqlen - nlls.append(neg_log_likelihood) - ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) - print(ppl.item()) - - model.config.use_cache = use_cache - return ppl.item() - -# TODO: perform packing on GPU -def opt_pack3(model, quantizers): - layers = find_layers(model) - layers = {n: layers[n] for n in quantizers} - make_quant3(model, quantizers, faster=args.faster_kernel) - qlayers = find_layers(model, [Quant3Linear]) - print('Packing ...') - for name in qlayers: - print(name) - quantizers[name] = quantizers[name].cpu() - qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero) - print('Done.') - return model - -def load_quant3(model, checkpoint): - from transformers import OPTConfig, OPTForCausalLM - config = OPTConfig.from_pretrained(model) - def noop(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = noop - torch.nn.init.uniform_ = noop - torch.nn.init.normal_ = noop - - torch.set_default_dtype(torch.half) - transformers.modeling_utils._init_weights = False - torch.set_default_dtype(torch.half) - model = OPTForCausalLM(config) - torch.set_default_dtype(torch.float) - model = model.eval() - layers = find_layers(model) - for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']: - if name in layers: - del layers[name] - make_quant3(model, layers, faster=args.faster_kernel) - - print('Loading model ...') - model.load_state_dict(torch.load(checkpoint)) - model.seqlen = model.config.max_position_embeddings - print('Done.') - - return model - -def opt_multigpu(model, gpus): - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0]) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0]) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0]) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1]) - if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm: - model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1]) - import copy - model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) - - cache = {'mask': None} - - class MoveModule(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - self.dev = next(iter(self.module.parameters())).device - def forward(self, *inp, **kwargs): - inp = list(inp) - if inp[0].device != self.dev: - inp[0] = inp[0].to(self.dev) - if cache['mask'] is None or cache['mask'].device != self.dev: - cache['mask'] = kwargs['attention_mask'].to(self.dev) - kwargs['attention_mask'] = cache['mask'] - tmp = self.module(*inp, **kwargs) - return tmp - - layers = model.model.decoder.layers - pergpu = math.ceil(len(layers) / len(gpus)) - for i in range(len(layers)): - layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) - - model.gpus = gpus - -def benchmark(model, input_ids, check=False): - input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) - torch.cuda.synchronize() - - cache = {'past': None} - def clear_past(i): - def tmp(layer, inp, out): - if cache['past']: - cache['past'][i] = None - return tmp - for i, layer in enumerate(model.model.decoder.layers): - layer.register_forward_hook(clear_past(i)) - - print('Benchmarking ...') - - if check: - loss = nn.CrossEntropyLoss() - tot = 0. - - def sync(): - if hasattr(model, 'gpus'): - for gpu in model.gpus: - torch.cuda.synchronize(gpu) - else: - torch.cuda.synchronize() - with torch.no_grad(): - attention_mask = torch.ones((1, input_ids.numel()), device=DEV) - times = [] - for i in range(input_ids.numel()): - tick = time.time() - out = model( - input_ids[:, i].reshape(-1), - past_key_values=cache['past'], - attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)) - ) - sync() - times.append(time.time() - tick) - print(i, times[-1]) - if check and i != input_ids.numel() - 1: - tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float() - cache['past'] = list(out.past_key_values) - del out - sync() - import numpy as np - print('Median:', np.median(times)) - if check: - print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) - - -def main(args): - print(args) - num_params_saved_lr = 0 - num_params = 0 - if args.load: - model = load_quant3(args.model, args.load) - else: - if args.delta and args.wbits<16: - model = get_opt(args.model) - model.eval() - base_model = get_opt(args.base_model) - base_model.eval() - dataloader, testloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - original_finetuned_model = copy.deepcopy(model) - for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): - finetuned_p.data = (finetuned_p.data-base_p.data).clone() - else: - model = get_opt(args.model) - model.eval() - - dataloader, testloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - - if args.wbits < 16 and not args.nearest: - if args.delta: - tick = time.time() - quantizers = opt_sequential_delta(original_finetuned_model, model, dataloader, DEV) - - comp_time = time.time()-tick - else: - quantizers = opt_sequential(model, dataloader, DEV) - - if args.delta and args.wbits<16: - for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): - # don't hard threshold for now - # if args.sparsify_hard_threshold: - # print('Hard Thresholding...') - # W = finetuned_p.data - # finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) - if args.rank>0 and len(finetuned_p.shape) == 2: - print('Finding Low Rank Approximation...') - A = finetuned_p.data.float() - U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5) - # let's say L = U - # and R = diag(S)*V.T - L = U - R = torch.diag_embed(S) @ Vh.T - # now quantize R - - A = L @ R - - finetuned_p.data = A.half() - num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) - num_params += torch.numel(finetuned_p.data) - finetuned_p.data = (base_p.data + finetuned_p.data).clone() - - if args.benchmark: - gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] - if len(gpus) > 1: - opt_multigpu(model, gpus) - else: - model = model.to(DEV) - if args.benchmark: - input_ids = next(iter(dataloader))[0][:, :args.benchmark] - benchmark(model, input_ids, check=args.check) - if args.load: - exit() - - dataset = args.dataset - dataloader, testloader = get_loaders( - dataset, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - - ppl = opt_eval(model, testloader, DEV) - print(ppl) - - if args.rank > 0: - print("Number of params without low rank ", num_params) - print("Number of params with low rank", num_params - num_params_saved_lr) - if args.save: - opt_pack3(model, quantizers) - torch.save(model.state_dict(), args.save) - - -if __name__ == '__main__': - import argparse - from datautils import * - - parser = argparse.ArgumentParser() - - parser.add_argument( - '--model', type=str, default='lnair/opt-1.3b-wikitext2', - help='OPT model to load; pass `facebook/opt-X`.' - ) - parser.add_argument( - '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], default='wikitext2', - help='Where to extract calibration data from.' - ) - parser.add_argument( - '--base-model', type=str, default='facebook/opt-1.3b', - help='base OPT model to load' - ) - parser.add_argument( - '--seed', - type=int, default=0, help='Seed for sampling the calibration data.' - ) - parser.add_argument( - '--nsamples', type=int, default=128, - help='Number of calibration data samples.' - ) - parser.add_argument( - '--percdamp', type=float, default=.01, - help='Percent of the average Hessian diagonal to use for dampening.' - ) - parser.add_argument( - '--nearest', action='store_true', - help='Whether to run the RTN baseline.' - ) - parser.add_argument( - '--wbits', type=int, default=2, choices=[2, 3, 4, 16], - help='#bits to use for quantization; use 16 for evaluating base model.' - ) - parser.add_argument( - '--trits', action='store_true', - help='Whether to use trits for quantization.' - ) - parser.add_argument( - '--groupsize', type=int, default=-1, - help='Groupsize to use for quantization; default uses full row.' - ) - parser.add_argument( - '--sym', action='store_true', - help='Whether to perform symmetric quantization.' - ) - parser.add_argument( - '--save', type=str, default='', - help='Save quantized checkpoint under this name.' - ) - parser.add_argument( - '--load', type=str, default='', - help='Load quantized model.' - ) - parser.add_argument( - '--benchmark', type=int, default=0, - help='Number of tokens to use for benchmarking.' - ) - parser.add_argument( - '--check', action='store_true', - help='Whether to compute perplexity during benchmarking for verification.' - ) - parser.add_argument( - '--new-eval', action='store_true', - help='Whether to use the new PTB and C4 eval.' - ) - parser.add_argument( - '--faster-kernel', action='store_true', - help='Whether to use the new faster kernel for benchmarking.' - ) - parser.add_argument( - '--act-order', action='store_true', - help='Whether to apply the activation order GPTQ heuristic' - ) - parser.add_argument( - '--delta', action='store_true', - help='Whether to use delta compression' - ) - parser.add_argument( - '--sparsify_hard_threshold', action='store_true', - help='Whether to add sparsity' - ) - parser.add_argument( - '--fraction_of_zero', type=float, default=0.99, - help='Sparsity ratio' - ) - - parser.add_argument( - '--rank', type=int, default=0, - help='The rank to use for decomposing each matrices' - ) - args = parser.parse_args() - - #results = PrettyTable() - - main(args) - - print('finished.') diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh new file mode 100644 index 0000000..e73d717 --- /dev/null +++ b/scripts/gptq_delta.sh @@ -0,0 +1,68 @@ +ts -S 8 +CUDA_VISIBLE_DEVICES=0 python opt_delta.py \ + --dataset wikitext2 \ + --wbits 2 \ + --delta \ + --sparsify_hard_threshold \ + --fraction_of_zero 0.95 \ + --save-hf \ + --groupsize 1024 & + +CUDA_VISIBLE_DEVICES=1 python opt_delta.py \ + --dataset wikitext2 \ + --wbits 3 \ + --delta \ + --sparsify_hard_threshold \ + --fraction_of_zero 0.95 \ + --save-hf \ + --groupsize 1024 & + +CUDA_VISIBLE_DEVICES=2 python opt_delta.py \ + --dataset wikitext2 \ + --wbits 4 \ + --delta \ + --sparsify_hard_threshold \ + --fraction_of_zero 0.95 \ + --save-hf \ + --groupsize 1024 & + +CUDA_VISIBLE_DEVICES=3 python opt_delta.py \ + --dataset wikitext2 \ + --wbits 2 \ + --delta \ + --sparsify_hard_threshold \ + --fraction_of_zero 0.99 \ + --save-hf \ + --groupsize 1024 & + +CUDA_VISIBLE_DEVICES=4 python opt_delta.py \ + --dataset wikitext2 \ + --wbits 3 \ + --delta \ + --sparsify_hard_threshold \ + --fraction_of_zero 0.99 \ + --save-hf \ + --groupsize 1024 & + +CUDA_VISIBLE_DEVICES=5 python opt_delta.py \ + --dataset wikitext2 \ + --wbits 4 \ + --delta \ + --sparsify_hard_threshold \ + --fraction_of_zero 0.99 \ + --save-hf \ + --groupsize 1024 & + +CUDA_VISIBLE_DEVICES=6 python opt_delta.py \ + --dataset wikitext2 \ + --wbits 3 \ + --delta \ + --save-hf \ + --groupsize 1024 & + +CUDA_VISIBLE_DEVICES=7 python opt_delta.py \ + --dataset wikitext2 \ + --wbits 4 \ + --delta \ + --save-hf \ + --groupsize 1024 & \ No newline at end of file diff --git a/submit.py b/submit.py new file mode 100644 index 0000000..075e8cb --- /dev/null +++ b/submit.py @@ -0,0 +1,29 @@ +import os +model_relations = { + # 'facebook/opt-350m': ['lnair/opt-350m-wikitext2'], + # 'facebook/opt-1.3b': ['lnair/opt-1.3b-wikitext2'], + # 'facebook/opt-2.7b': ['lnair/opt-2.7b-wikitext2'], + 'facebook/opt-6.7b': ['KoboldAI/OPT-6.7B-Erebus'], + 'facebook/opt-13b': ['KoboldAI/OPT-13B-Erebus'], + 'facebook/opt-30b': ['KoboldAI/OPT-30B-Erebus'] +} + +wbits_settings = [2,3,4] + +sparsity_settings = [0, 0.95, 0.99] +os.system("ts -S 8") +for model in model_relations.keys(): + for target_model in model_relations[model]: + for wbits in wbits_settings: + for sparsity in sparsity_settings: + if sparsity == 0: + cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --delta --wbits {wbits} --model {target_model} --base-model {model} --save-hf --groupsize 1024" + else: + cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --delta --wbits {wbits} --model {target_model} --base-model {model} --sparsify_hard_threshold --fraction_of_zero {sparsity} --save-hf --groupsize 1024" + os.system(cmd) + +for model in model_relations.keys(): + for target_model in model_relations[model]: + for wbits in wbits_settings: + cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --model {target_model} --base-model {model} --save-hf --groupsize 1024" + os.system(cmd) \ No newline at end of file From ce294310c97cc6c6e464ccee673d12718f66629c Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Thu, 11 May 2023 12:54:30 +0000 Subject: [PATCH 11/23] minor --- .gitignore | 4 ++- modelutils.py | 2 +- opt_delta.py | 19 ++++++++------ scripts/playground.ipynb | 53 ++++++++++++++-------------------------- submit.py | 7 +++--- 5 files changed, 37 insertions(+), 48 deletions(-) diff --git a/.gitignore b/.gitignore index dbd6338..e7f3a29 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,6 @@ dist/ .idea *.egg-info/ *.safetensors -outputs/ \ No newline at end of file +outputs/ +outputs_past/ +packed_delta \ No newline at end of file diff --git a/modelutils.py b/modelutils.py index c93410d..f9436c7 100644 --- a/modelutils.py +++ b/modelutils.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn -from transformers import OPTForCausalLM +from transformers import OPTForCausalLM, AutoModel, AutoTokenizer DEV = torch.device('cuda:0') def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): diff --git a/opt_delta.py b/opt_delta.py index bb55f7f..401c88b 100644 --- a/opt_delta.py +++ b/opt_delta.py @@ -472,6 +472,7 @@ def sync(): def main(args): print(args) + packed_delta = None num_params_saved_lr = 0 num_params = 0 if args.load: @@ -506,19 +507,21 @@ def main(args): quantizers = opt_sequential(model, dataloader, DEV) if args.delta and args.wbits<16: - for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + for idx, (base_p, finetuned_p) in enumerate(zip(base_model.parameters(), model.parameters())): if args.sparsify_hard_threshold: print('Hard Thresholding...') W = finetuned_p.data finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) - if args.rank>0 and len(finetuned_p.shape) == 2: - print('Finding Low Rank Approximation...') - A = finetuned_p.data.float() - U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5) - A = U @ torch.diag_embed(S) @ Vh.T - finetuned_p.data = A.half() - num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) + # if args.rank>0 and len(finetuned_p.shape) == 2: + # print('Finding Low Rank Approximation...') + # A = finetuned_p.data.float() + # U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5) + # A = U @ torch.diag_embed(S) @ Vh.T + # finetuned_p.data = A.half() + # num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) num_params += torch.numel(finetuned_p.data) + # here we save a copy to pack, and save the delta only on disk + packed_delta = copy.deepcopy(finetuned_p.data) finetuned_p.data = (base_p.data + finetuned_p.data).clone() if args.benchmark: diff --git a/scripts/playground.ipynb b/scripts/playground.ipynb index 88e2175..114a004 100644 --- a/scripts/playground.ipynb +++ b/scripts/playground.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -25,42 +25,25 @@ "output_type": "stream", "text": [ "/home/xiayao/miniconda3/envs/fmzip/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" + " from .autonotebook import tqdm as notebook_tqdm\n", + "Downloading (…)lve/main/config.json: 100%|██████████| 930/930 [00:00<00:00, 143kB/s]\n", + "You are using a model of type gptj to instantiate a model of type opt. This is not supported for all configurations of models and can yield errors.\n", + "Downloading pytorch_model.bin: 100%|██████████| 24.2G/24.2G [02:56<00:00, 137MB/s] \n" ] }, { - "data": { - "text/plain": [ - "OPTForCausalLM(\n", - " (model): OPTModel(\n", - " (decoder): OPTDecoder(\n", - " (embed_tokens): Embedding(50272, 2048, padding_idx=1)\n", - " (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)\n", - " (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", - " (layers): ModuleList(\n", - " (0-23): 24 x OPTDecoderLayer(\n", - " (self_attn): OPTAttention(\n", - " (k_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " (v_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " (q_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " (out_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " )\n", - " (activation_fn): ReLU()\n", - " (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", - " (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n", - " (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n", - " (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (lm_head): Linear(in_features=2048, out_features=50272, bias=False)\n", - ")" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" + "ename": "ValueError", + "evalue": "The state dictionary of the model you are trying to load is corrupted. Are you sure it was properly saved?", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mmodelutils\u001b[39;00m \u001b[39mimport\u001b[39;00m get_opt\n\u001b[0;32m----> 2\u001b[0m base_model \u001b[39m=\u001b[39m get_opt(base_model_name)\n\u001b[1;32m 3\u001b[0m target_model \u001b[39m=\u001b[39m get_opt(target_model_name)\n\u001b[1;32m 4\u001b[0m base_model\u001b[39m.\u001b[39mto(\u001b[39m'\u001b[39m\u001b[39mcuda\u001b[39m\u001b[39m'\u001b[39m)\n", + "File \u001b[0;32m~/project/fmzip/scripts/../modelutils.py:24\u001b[0m, in \u001b[0;36mget_opt\u001b[0;34m(model)\u001b[0m\n\u001b[1;32m 21\u001b[0m torch\u001b[39m.\u001b[39mnn\u001b[39m.\u001b[39minit\u001b[39m.\u001b[39mnormal_ \u001b[39m=\u001b[39m skip\n\u001b[1;32m 23\u001b[0m \u001b[39m# model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto')\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m model \u001b[39m=\u001b[39m OPTForCausalLM\u001b[39m.\u001b[39;49mfrom_pretrained(model, torch_dtype\u001b[39m=\u001b[39;49mtorch\u001b[39m.\u001b[39;49mfloat16)\n\u001b[1;32m 25\u001b[0m model\u001b[39m.\u001b[39mseqlen \u001b[39m=\u001b[39m model\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39mmax_position_embeddings\n\u001b[1;32m 26\u001b[0m \u001b[39mreturn\u001b[39;00m model\n", + "File \u001b[0;32m~/miniconda3/envs/fmzip/lib/python3.9/site-packages/transformers/modeling_utils.py:2795\u001b[0m, in \u001b[0;36mPreTrainedModel.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 2785\u001b[0m \u001b[39mif\u001b[39;00m dtype_orig \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 2786\u001b[0m torch\u001b[39m.\u001b[39mset_default_dtype(dtype_orig)\n\u001b[1;32m 2788\u001b[0m (\n\u001b[1;32m 2789\u001b[0m model,\n\u001b[1;32m 2790\u001b[0m missing_keys,\n\u001b[1;32m 2791\u001b[0m unexpected_keys,\n\u001b[1;32m 2792\u001b[0m mismatched_keys,\n\u001b[1;32m 2793\u001b[0m offload_index,\n\u001b[1;32m 2794\u001b[0m error_msgs,\n\u001b[0;32m-> 2795\u001b[0m ) \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39;49m\u001b[39m.\u001b[39;49m_load_pretrained_model(\n\u001b[1;32m 2796\u001b[0m model,\n\u001b[1;32m 2797\u001b[0m state_dict,\n\u001b[1;32m 2798\u001b[0m loaded_state_dict_keys, \u001b[39m# XXX: rename?\u001b[39;49;00m\n\u001b[1;32m 2799\u001b[0m resolved_archive_file,\n\u001b[1;32m 2800\u001b[0m pretrained_model_name_or_path,\n\u001b[1;32m 2801\u001b[0m ignore_mismatched_sizes\u001b[39m=\u001b[39;49mignore_mismatched_sizes,\n\u001b[1;32m 2802\u001b[0m sharded_metadata\u001b[39m=\u001b[39;49msharded_metadata,\n\u001b[1;32m 2803\u001b[0m _fast_init\u001b[39m=\u001b[39;49m_fast_init,\n\u001b[1;32m 2804\u001b[0m low_cpu_mem_usage\u001b[39m=\u001b[39;49mlow_cpu_mem_usage,\n\u001b[1;32m 2805\u001b[0m device_map\u001b[39m=\u001b[39;49mdevice_map,\n\u001b[1;32m 2806\u001b[0m offload_folder\u001b[39m=\u001b[39;49moffload_folder,\n\u001b[1;32m 2807\u001b[0m offload_state_dict\u001b[39m=\u001b[39;49moffload_state_dict,\n\u001b[1;32m 2808\u001b[0m dtype\u001b[39m=\u001b[39;49mtorch_dtype,\n\u001b[1;32m 2809\u001b[0m load_in_8bit\u001b[39m=\u001b[39;49mload_in_8bit,\n\u001b[1;32m 2810\u001b[0m keep_in_fp32_modules\u001b[39m=\u001b[39;49mkeep_in_fp32_modules,\n\u001b[1;32m 2811\u001b[0m )\n\u001b[1;32m 2813\u001b[0m model\u001b[39m.\u001b[39mis_loaded_in_8bit \u001b[39m=\u001b[39m load_in_8bit\n\u001b[1;32m 2815\u001b[0m \u001b[39m# make sure token embedding weights are still tied if needed\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/fmzip/lib/python3.9/site-packages/transformers/modeling_utils.py:3008\u001b[0m, in \u001b[0;36mPreTrainedModel._load_pretrained_model\u001b[0;34m(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, load_in_8bit, keep_in_fp32_modules)\u001b[0m\n\u001b[1;32m 3006\u001b[0m base_model_expected_keys \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(model_to_load\u001b[39m.\u001b[39mstate_dict()\u001b[39m.\u001b[39mkeys())\n\u001b[1;32m 3007\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39many\u001b[39m(key \u001b[39min\u001b[39;00m expected_keys_not_prefixed \u001b[39mand\u001b[39;00m key \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m base_model_expected_keys \u001b[39mfor\u001b[39;00m key \u001b[39min\u001b[39;00m loaded_keys):\n\u001b[0;32m-> 3008\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 3009\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mThe state dictionary of the model you are trying to load is corrupted. Are you sure it was \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 3010\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mproperly saved?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 3011\u001b[0m )\n\u001b[1;32m 3012\u001b[0m \u001b[39mif\u001b[39;00m device_map \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 3013\u001b[0m device_map \u001b[39m=\u001b[39m {k\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mcls\u001b[39m\u001b[39m.\u001b[39mbase_model_prefix\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m): v \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m device_map\u001b[39m.\u001b[39mitems()}\n", + "\u001b[0;31mValueError\u001b[0m: The state dictionary of the model you are trying to load is corrupted. Are you sure it was properly saved?" + ] } ], "source": [ diff --git a/submit.py b/submit.py index 075e8cb..ce84f1f 100644 --- a/submit.py +++ b/submit.py @@ -3,9 +3,10 @@ # 'facebook/opt-350m': ['lnair/opt-350m-wikitext2'], # 'facebook/opt-1.3b': ['lnair/opt-1.3b-wikitext2'], # 'facebook/opt-2.7b': ['lnair/opt-2.7b-wikitext2'], - 'facebook/opt-6.7b': ['KoboldAI/OPT-6.7B-Erebus'], - 'facebook/opt-13b': ['KoboldAI/OPT-13B-Erebus'], - 'facebook/opt-30b': ['KoboldAI/OPT-30B-Erebus'] + 'facebook/opt-6.7b': ['mit-han-lab/opt-6.7b-smoothquant'], + # 'facebook/opt-13b': ['KoboldAI/OPT-13B-Erebus'], + # 'facebook/opt-30b': ['KoboldAI/OPT-30B-Erebus'] + # 'facebook/opt-1.3b': ['facebook/opt-iml-1.3b', 'facebook/opt-iml-max-1.3b', 'mit-han-lab/opt-1.3b-smoothquant', 'pszemraj/opt-peter-1.3B', 'opentensor/bt-opt-1.3b'] } wbits_settings = [2,3,4] From d75ff123a8b402961fb9d31cbebc6390704d4342 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Thu, 11 May 2023 17:33:47 +0000 Subject: [PATCH 12/23] pack utils --- pack_utils.py | 129 +++++++++++++++++++++++++++++++++++++++++++++ pack_utils_test.py | 35 ++++++++++++ quant.py | 2 +- requirements.txt | 3 +- 4 files changed, 167 insertions(+), 2 deletions(-) create mode 100644 pack_utils.py create mode 100644 pack_utils_test.py diff --git a/pack_utils.py b/pack_utils.py new file mode 100644 index 0000000..1305c05 --- /dev/null +++ b/pack_utils.py @@ -0,0 +1,129 @@ +import math +import torch +import numpy as np +from typing import Any +from quant import Quantizer +from safetensors import safe_open +from safetensors.torch import save_file + +def pack_to_bits( + weight: torch.Tensor, + quantizer:Quantizer, + bits: int, + groupsize = 1024 + ): + if bits not in [2,3,4,8]: + raise ValueError("bits must be one of [2,3,4,8]") + scales = quantizer.scale.t().contiguous() + zeros = quantizer.zero.t().contiguous() + scale_zeros = zeros * scales + intweight = [] + for idx in range(weight.shape[0]): + g_idx = idx // groupsize + intweight.append(torch.round((weight[:,idx] + scale_zeros[g_idx]) / scales[g_idx]).to(torch.int)[:,None]) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 256 * (bits * 8), intweight.shape[1]), dtype=np.uint32 + ) + i = 0 + row = 0 + while row < qweight.shape[0]: + if bits in [2,4,8]: + for j in range(i, i + (32//bits)): + qweight[row] |= intweight[j] << (bits * (j - i)) + i += 32//bits + row += 1 + elif bits == 3: + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i)) + i += 10 + qweight[row] |= intweight[i] << 30 + row += 1 + qweight[row] |= (intweight[i] >> 2) & 1 + i += 1 + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i) + 1) + i += 10 + qweight[row] |= intweight[i] << 31 + row += 1 + qweight[row] |= (intweight[i] >> 1) & 0x3 + i += 1 + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i) + 2) + i += 10 + row += 1 + + qweight = qweight.astype(np.int32) + qweight = torch.from_numpy(qweight) + zeros -= 1; + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 256 * (bits * 8)), dtype=np.uint32) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if bits in [2,4,8]: + for j in range(i, i + (32//bits)): + qzeros[:, col] |= zeros[:, j] << (bits * (j - i)) + i += 32//bits + col += 1 + elif bits == 3: + for j in range(i, i + 10): + qzeros[:, col] |= zeros[:, j] << (3 * (j - i)) + i += 10 + qzeros[:, col] |= zeros[:, i] << 30 + col += 1 + qzeros[:, col] |= (zeros[:, i] >> 2) & 1 + i += 1 + for j in range(i, i + 10): + qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1) + i += 10 + qzeros[:, col] |= zeros[:, i] << 31 + col += 1 + qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3 + i += 1 + for j in range(i, i + 10): + qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2) + i += 10 + col += 1 + qzeros = qzeros.astype(np.int32) + qzeros = torch.from_numpy(qzeros) + return qweight, qzeros + +class SparseTensor(): + def __init__(self, m: torch.Tensor, format: str, packing_bits: None) -> None: + self.m = m + self.size = m.size() + self.packing_bits = packing_bits + self.format = format + self._convert() + + def _convert(self): + # flatten the matrix + self.m = self.m.flatten() + # get the indices of the non-zero elements + indices = torch.nonzero(self.m) + # get the non-zero elements + values = self.m[indices] + self.payload = { + 'indices': indices, + 'values': values + } + + def restore(self): + # restore the matrix from the self.payload + self.m = torch.zeros(math.prod(self.size), dtype=self.payload['values'].dtype) + self.m[self.payload['indices']] = self.payload['values'] + self.m = self.m.reshape(self.size) + + def to_disk(self, path): + save_file(self.payload, path) + + def from_disk(self, path): + tensors = {} + with safe_open(path, framework='pt', device='cpu') as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key) + self.payload = tensors + self.restore() \ No newline at end of file diff --git a/pack_utils_test.py b/pack_utils_test.py new file mode 100644 index 0000000..483c91e --- /dev/null +++ b/pack_utils_test.py @@ -0,0 +1,35 @@ +import torch +from quant import quantize, Quantizer +from safetensors import safe_open +from pack_utils import SparseTensor, pack_to_bits, unpack_from_bits +from safetensors.torch import save_file + +QUANTIZED_BITS = 3 + + +if __name__=="__main__": + torch.set_printoptions(precision=12) + b = torch.rand((1024, 1024), dtype=torch.float16) + + quantizer = Quantizer() + quantizer.configure( + QUANTIZED_BITS, perchannel=True, sym=False, mse=False + ) + quantizer.find_params(b, weight=True) + b_q = quantizer.quantize(b) + + # count how many zeroes + print(b_q) + # sparsification + + # now pack it + q_weight, qzero = pack_to_bits(b_q, quantizer, QUANTIZED_BITS) + unpacked_weight = unpack_from_bits( + qweight=q_weight, + qzeros=qzero, + quantizer=quantizer, + bits=QUANTIZED_BITS, + groupsize=1024, + ) + print(unpacked_weight) + # count how many zeroes diff --git a/quant.py b/quant.py index f8cc1b7..f23099a 100644 --- a/quant.py +++ b/quant.py @@ -287,7 +287,7 @@ def pack(self, linear, scales, zeros): raise NotImplementedError("Only 2,3,4,8 bits are supported.") qweight = qweight.astype(np.int32) - self.qweight = torch.from_numpy(qweight) + self.qweight = torch.from_numpy(qweight) zeros -= 1; zeros = zeros.numpy().astype(np.uint32) diff --git a/requirements.txt b/requirements.txt index 7417000..79d456c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ transformers loguru -datasets \ No newline at end of file +datasets +safetensors \ No newline at end of file From 23e35487f92294f2aa0a5e88e746debc25bee737 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Thu, 11 May 2023 20:11:05 +0000 Subject: [PATCH 13/23] minor --- .gitignore | 3 +- .vscode/settings.json | 6 ++ pack_utils.py | 80 ++++++++++++++---------- pack_utils_test.py | 31 +++++---- playground.py | 142 ++++++++++++++++++++++++++++++++++++++++++ quant_cuda.cpp | 2 +- 6 files changed, 213 insertions(+), 51 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 playground.py diff --git a/.gitignore b/.gitignore index e7f3a29..6eb567c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ dist/ *.safetensors outputs/ outputs_past/ -packed_delta \ No newline at end of file +packed_delta +.cache \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..d99f2f3 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter" + }, + "python.formatting.provider": "none" +} \ No newline at end of file diff --git a/pack_utils.py b/pack_utils.py index 1305c05..307449f 100644 --- a/pack_utils.py +++ b/pack_utils.py @@ -21,6 +21,7 @@ def pack_to_bits( for idx in range(weight.shape[0]): g_idx = idx // groupsize intweight.append(torch.round((weight[:,idx] + scale_zeros[g_idx]) / scales[g_idx]).to(torch.int)[:,None]) + intweight = torch.cat(intweight, dim=1) intweight = intweight.t().contiguous() intweight = intweight.numpy().astype(np.uint32) @@ -35,6 +36,7 @@ def pack_to_bits( qweight[row] |= intweight[j] << (bits * (j - i)) i += 32//bits row += 1 + elif bits == 3: for j in range(i, i + 10): qweight[row] |= intweight[j] << (3 * (j - i)) @@ -57,49 +59,61 @@ def pack_to_bits( qweight = qweight.astype(np.int32) qweight = torch.from_numpy(qweight) - zeros -= 1; - zeros = zeros.numpy().astype(np.uint32) - qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 256 * (bits * 8)), dtype=np.uint32) + return qweight + +def unpack_from_bits( + qweight: torch.Tensor, + quantizer:Quantizer, + bits: int, + groupsize = 1024 + ): + if bits not in [2,3,4,8]: + raise ValueError("bits must be one of [2,3,4,8]") + + scales = quantizer.scale.t().contiguous() + zeros = quantizer.zero.t().contiguous() + scale_zeros = zeros * scales + qweight = qweight.numpy().astype(np.uint32) + + intweight = np.zeros( + (qweight.shape[0] // (bits * 8) * 256, qweight.shape[1]), dtype=np.uint32 + ) i = 0 - col = 0 - while col < qzeros.shape[1]: + row = 0 + while row < qweight.shape[0]: if bits in [2,4,8]: - for j in range(i, i + (32//bits)): - qzeros[:, col] |= zeros[:, j] << (bits * (j - i)) - i += 32//bits - col += 1 - elif bits == 3: - for j in range(i, i + 10): - qzeros[:, col] |= zeros[:, j] << (3 * (j - i)) - i += 10 - qzeros[:, col] |= zeros[:, i] << 30 - col += 1 - qzeros[:, col] |= (zeros[:, i] >> 2) & 1 - i += 1 - for j in range(i, i + 10): - qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1) - i += 10 - qzeros[:, col] |= zeros[:, i] << 31 - col += 1 - qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3 - i += 1 - for j in range(i, i + 10): - qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2) - i += 10 - col += 1 - qzeros = qzeros.astype(np.int32) - qzeros = torch.from_numpy(qzeros) - return qweight, qzeros + for j in range(i + 32//bits, i): + intweight[j] = (qweight[row] >> (bits * (j - i))) & ((1 << bits) - 1) + i -= 32//bits + row += 1 + + intweight = intweight.astype(np.int32) + intweight = torch.from_numpy(intweight).t().contiguous() + + weight = [] + for idx in range(intweight.shape[0]): + g_idx = idx // groupsize + weight.append((intweight[:,idx] * scales[g_idx] - scale_zeros[g_idx]).to(torch.float32)[:,None]) + + weight = torch.cat(weight, dim=1) + return weight class SparseTensor(): - def __init__(self, m: torch.Tensor, format: str, packing_bits: None) -> None: + def __init__(self, m: torch.Tensor, format: str, minifloats: int=-1) -> None: self.m = m self.size = m.size() - self.packing_bits = packing_bits + self.minifloats = minifloats self.format = format self._convert() def _convert(self): + if self.minifloats>=2: + quantizer = Quantizer() + quantizer.configure( + self.minifloats, perchannel=True, sym=False, mse=False + ) + quantizer.find_params(self.m, weight=True) + self.m = quantizer.quantize(self.m) # flatten the matrix self.m = self.m.flatten() # get the indices of the non-zero elements diff --git a/pack_utils_test.py b/pack_utils_test.py index 483c91e..1e1bce6 100644 --- a/pack_utils_test.py +++ b/pack_utils_test.py @@ -3,33 +3,32 @@ from safetensors import safe_open from pack_utils import SparseTensor, pack_to_bits, unpack_from_bits from safetensors.torch import save_file +from opt_delta import hard_threshold -QUANTIZED_BITS = 3 +QUANTIZED_BITS = 2 if __name__=="__main__": - torch.set_printoptions(precision=12) - b = torch.rand((1024, 1024), dtype=torch.float16) - + torch.set_printoptions(precision=4) + b = torch.rand((2048, 2048), dtype=torch.float32) + # save b + save_file({'wb1': b}, '.cache/b.safetensor') quantizer = Quantizer() quantizer.configure( QUANTIZED_BITS, perchannel=True, sym=False, mse=False ) quantizer.find_params(b, weight=True) b_q = quantizer.quantize(b) - - # count how many zeroes - print(b_q) + # sparsed_b_q = hard_threshold(b_q, 0.99) + # count how many zeroes # sparsification - # now pack it - q_weight, qzero = pack_to_bits(b_q, quantizer, QUANTIZED_BITS) - unpacked_weight = unpack_from_bits( - qweight=q_weight, - qzeros=qzero, - quantizer=quantizer, - bits=QUANTIZED_BITS, - groupsize=1024, - ) + q_weight = pack_to_bits(b_q, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0]) + unpacked_weight = unpack_from_bits(q_weight, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0]) + # check if it's the same + print(b) print(unpacked_weight) + print(b-unpacked_weight) # count how many zeroes + sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1) + sparse_t.to_disk('.cache/sparse_b.safetensor') \ No newline at end of file diff --git a/playground.py b/playground.py new file mode 100644 index 0000000..67818cb --- /dev/null +++ b/playground.py @@ -0,0 +1,142 @@ +import torch +import numpy as np +import torch.nn as nn + +def quantize(x, scale, zero, maxq): + if maxq < 0: + return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero + q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) + return scale * (q - zero) + +class Quantizer(nn.Module): + + def __init__(self, shape=1): + super(Quantizer, self).__init__() + self.register_buffer('maxq', torch.tensor(0)) + self.register_buffer('scale', torch.zeros(shape)) + self.register_buffer('zero', torch.zeros(shape)) + + def configure( + self, + bits, perchannel=False, sym=True, + mse=False, norm=2.4, grid=100, maxshrink=.8, + ): + self.maxq = torch.tensor(2 ** bits - 1) + self.perchannel = perchannel + self.sym = sym + self.mse = mse + self.norm = norm + self.grid = grid + self.maxshrink = maxshrink + + def find_params(self, x, weight=False): + dev = x.device + self.maxq = self.maxq.to(dev) + shape = x.shape + if self.perchannel: + if weight: + x = x.flatten(1) + + tmp = torch.zeros(x.shape[0], device=dev) + xmin = torch.minimum(x.min(1)[0], tmp) + xmax = torch.maximum(x.max(1)[0], tmp) + + if self.sym: + xmax = torch.maximum(torch.abs(xmin), xmax) + tmp = xmin < 0 + if torch.any(tmp): + xmin[tmp] = -xmax[tmp] + tmp = (xmin == 0) & (xmax == 0) + xmin[tmp] = -1 + xmax[tmp] = +1 + + if self.maxq < 0: + self.scale = xmax + self.zero = xmin + else: + self.scale = (xmax - xmin) / self.maxq + if self.sym: + self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) + else: + self.zero = torch.round(-xmin / self.scale) + + if weight: + shape = [-1] + [1] * (len(shape) - 1) + self.scale = self.scale.reshape(shape) + self.zero = self.zero.reshape(shape) + print(shape) + print(self.scale) + print(self.zero) + return + + def quantize(self, x): + if self.ready(): + return quantize(x, self.scale, self.zero, self.maxq) + return x + + def enabled(self): + return self.maxq > 0 + + def ready(self): + return torch.all(self.scale != 0) + +def _rounding(x, stochastic=False, minimum_stochastic_distance=0.2): + if stochastic: + x_floor = x.floor() + th = x - x_floor + if minimum_stochastic_distance > 0: + th[th1-minimum_stochastic_distance] = 1. + pr = torch.rand_like(x) + x_floor += (pr < th) + return x_floor + else: + return x.round() + +def _compress_nbits(x, bits, scale_method='max', scale_dims=(0,1), + stochastic=False, minimum_stochastic_distance=0.2): + + fbits = bits - 1 + + if scale_method == 'max': + # issue: sensitive to outlier points + scale = x.abs().amax(scale_dims, keepdims=True) + elif scale_method == 'l2': + # ~95% confidence interval for normal distribution + scale = x.pow(2).mean(scale_dims, keepdims=True).sqrt() * 2 + else: + raise Exception('unkonwn scale method.') + # fp16 should be enough + scale = scale.half() + x = x / (scale + 1e-6) + + x = x.ldexp(torch.tensor(fbits)) + clip_min = -(1< Date: Thu, 11 May 2023 21:47:05 +0000 Subject: [PATCH 14/23] packing utils --- pack_utils.py | 23 ++++++++++++-- pack_utils_test.py | 6 ++-- playground.py | 74 +++++++--------------------------------------- 3 files changed, 35 insertions(+), 68 deletions(-) diff --git a/pack_utils.py b/pack_utils.py index 307449f..9f24ab9 100644 --- a/pack_utils.py +++ b/pack_utils.py @@ -82,9 +82,28 @@ def unpack_from_bits( row = 0 while row < qweight.shape[0]: if bits in [2,4,8]: - for j in range(i + 32//bits, i): + for j in range(i, i+ 32 // bits): intweight[j] = (qweight[row] >> (bits * (j - i))) & ((1 << bits) - 1) - i -= 32//bits + i += 32//bits + row += 1 + elif bits == 3: + for j in range(i, i+10): + intweight[j] = (qweight[row] >> (3 * (j - i))) & 7 + i += 10 + intweight[i] = (qweight[row] >> 30) & 1 + row += 1 + intweight[i] |= (qweight[row] & 1) << 2 + i += 1 + for j in range(i, i+10): + intweight[j] = (qweight[row] >> (3 * (j - i) + 1)) & 7 + i += 10 + intweight[i] = (qweight[row] >> 31) & 1 + row += 1 + intweight[i] |= (qweight[row] & 3) << 1 + i += 1 + for j in range(i, i+10): + intweight[j] = (qweight[row] >> (3 * (j - i) + 2)) & 7 + i += 10 row += 1 intweight = intweight.astype(np.int32) diff --git a/pack_utils_test.py b/pack_utils_test.py index 1e1bce6..0af7e2d 100644 --- a/pack_utils_test.py +++ b/pack_utils_test.py @@ -5,7 +5,7 @@ from safetensors.torch import save_file from opt_delta import hard_threshold -QUANTIZED_BITS = 2 +QUANTIZED_BITS = 3 if __name__=="__main__": @@ -30,5 +30,5 @@ print(unpacked_weight) print(b-unpacked_weight) # count how many zeroes - sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1) - sparse_t.to_disk('.cache/sparse_b.safetensor') \ No newline at end of file + # sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1) + # sparse_t.to_disk('.cache/sparse_b.safetensor') \ No newline at end of file diff --git a/playground.py b/playground.py index 67818cb..ebeecac 100644 --- a/playground.py +++ b/playground.py @@ -64,9 +64,6 @@ def find_params(self, x, weight=False): shape = [-1] + [1] * (len(shape) - 1) self.scale = self.scale.reshape(shape) self.zero = self.zero.reshape(shape) - print(shape) - print(self.scale) - print(self.zero) return def quantize(self, x): @@ -80,63 +77,14 @@ def enabled(self): def ready(self): return torch.all(self.scale != 0) -def _rounding(x, stochastic=False, minimum_stochastic_distance=0.2): - if stochastic: - x_floor = x.floor() - th = x - x_floor - if minimum_stochastic_distance > 0: - th[th1-minimum_stochastic_distance] = 1. - pr = torch.rand_like(x) - x_floor += (pr < th) - return x_floor - else: - return x.round() - -def _compress_nbits(x, bits, scale_method='max', scale_dims=(0,1), - stochastic=False, minimum_stochastic_distance=0.2): - - fbits = bits - 1 - - if scale_method == 'max': - # issue: sensitive to outlier points - scale = x.abs().amax(scale_dims, keepdims=True) - elif scale_method == 'l2': - # ~95% confidence interval for normal distribution - scale = x.pow(2).mean(scale_dims, keepdims=True).sqrt() * 2 - else: - raise Exception('unkonwn scale method.') - # fp16 should be enough - scale = scale.half() - x = x / (scale + 1e-6) - - x = x.ldexp(torch.tensor(fbits)) - clip_min = -(1< Date: Thu, 11 May 2023 22:47:02 +0000 Subject: [PATCH 15/23] packing --- pack_utils.py | 2 +- pack_utils_test.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pack_utils.py b/pack_utils.py index 9f24ab9..cda817c 100644 --- a/pack_utils.py +++ b/pack_utils.py @@ -84,7 +84,7 @@ def unpack_from_bits( if bits in [2,4,8]: for j in range(i, i+ 32 // bits): intweight[j] = (qweight[row] >> (bits * (j - i))) & ((1 << bits) - 1) - i += 32//bits + i += 32 // bits row += 1 elif bits == 3: for j in range(i, i+10): diff --git a/pack_utils_test.py b/pack_utils_test.py index 0af7e2d..70dbb67 100644 --- a/pack_utils_test.py +++ b/pack_utils_test.py @@ -7,8 +7,11 @@ QUANTIZED_BITS = 3 - if __name__=="__main__": + """ + + """ + torch.set_printoptions(precision=4) b = torch.rand((2048, 2048), dtype=torch.float32) # save b From c06f73d177f716a954bfa6fbabedbf50f3424729 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Thu, 11 May 2023 23:03:17 +0000 Subject: [PATCH 16/23] packing utils --- pack_utils.py | 22 ++++++++++++++++------ pack_utils_test.py | 37 ++++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/pack_utils.py b/pack_utils.py index cda817c..a68407b 100644 --- a/pack_utils.py +++ b/pack_utils.py @@ -123,8 +123,7 @@ def __init__(self, m: torch.Tensor, format: str, minifloats: int=-1) -> None: self.size = m.size() self.minifloats = minifloats self.format = format - self._convert() - + def _convert(self): if self.minifloats>=2: quantizer = Quantizer() @@ -141,7 +140,8 @@ def _convert(self): values = self.m[indices] self.payload = { 'indices': indices, - 'values': values + 'values': values, + 'size': torch.tensor(self.size), } def restore(self): @@ -151,12 +151,22 @@ def restore(self): self.m = self.m.reshape(self.size) def to_disk(self, path): + self._convert() save_file(self.payload, path) - def from_disk(self, path): + @classmethod + def from_disk(cls, path): tensors = {} with safe_open(path, framework='pt', device='cpu') as f: for key in f.keys(): tensors[key] = f.get_tensor(key) - self.payload = tensors - self.restore() \ No newline at end of file + m = torch.zeros(math.prod(tensors['size']), dtype=tensors['values'].dtype) + m[tensors['indices']] = tensors['values'] + tensors['size'] = tensors['size'].tolist() + print(tensors['size']) + m = m.reshape(tensors['size']) + return cls(m, 'sparse', minifloats=-1) + + @property + def tensor(self): + return self.m \ No newline at end of file diff --git a/pack_utils_test.py b/pack_utils_test.py index 70dbb67..a6befb4 100644 --- a/pack_utils_test.py +++ b/pack_utils_test.py @@ -9,29 +9,36 @@ if __name__=="__main__": """ - + The process: + 1. Given a weight, quantize it first + 2. Then do sparsification + + To test our pack/unpack, we need to do the following: + 1. After the sparsification, we pack the weight and store on disk + 2. Compare the original weight with the unpacked weight """ torch.set_printoptions(precision=4) b = torch.rand((2048, 2048), dtype=torch.float32) # save b - save_file({'wb1': b}, '.cache/b.safetensor') + save_file({'wb1': b}, '.cache/original_b.safetensor') quantizer = Quantizer() quantizer.configure( QUANTIZED_BITS, perchannel=True, sym=False, mse=False ) quantizer.find_params(b, weight=True) b_q = quantizer.quantize(b) - # sparsed_b_q = hard_threshold(b_q, 0.99) - # count how many zeroes - # sparsification - # now pack it - q_weight = pack_to_bits(b_q, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0]) - unpacked_weight = unpack_from_bits(q_weight, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0]) - # check if it's the same - print(b) - print(unpacked_weight) - print(b-unpacked_weight) - # count how many zeroes - # sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1) - # sparse_t.to_disk('.cache/sparse_b.safetensor') \ No newline at end of file + sparsed_b_q = hard_threshold(b_q, 0.99) + + q_weight = pack_to_bits(sparsed_b_q, quantizer, QUANTIZED_BITS, groupsize=sparsed_b_q.shape[0]) + sparse_t = SparseTensor(sparsed_b_q, 'wb1', minifloats=-1) + sparse_t.to_disk('.cache/sparse_b.safetensor') + # now load it back + restored_sparse_t = SparseTensor.from_disk('.cache/sparse_b.safetensor') + restored_weight = restored_sparse_t.tensor + print(f"Original weight: {sparsed_b_q}") + print(f"Restored weight: {restored_weight}") + print(torch.allclose(sparsed_b_q, restored_weight)) + # count the number of non-zero elements + print(f"Original weight: {sparsed_b_q.nonzero().shape[0]}") + print(f"Restored weight: {restored_weight.nonzero().shape[0]}") \ No newline at end of file From afd99db7cf582559aa79bba59728d324a22900a5 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Fri, 12 May 2023 06:12:55 +0000 Subject: [PATCH 17/23] packing --- .gitignore | 3 +- cli.py | 2 +- core_compression_parallel.py | 143 ----------------------------------- opt_delta.py | 15 ++-- pack_utils.py | 9 +-- pack_utils_test.py | 7 +- scripts/gptq_delta.sh | 107 +++++++++++++------------- tensorio.py | 54 +++++++++++++ 8 files changed, 127 insertions(+), 213 deletions(-) delete mode 100644 core_compression_parallel.py create mode 100644 tensorio.py diff --git a/.gitignore b/.gitignore index 6eb567c..87c07aa 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ dist/ outputs/ outputs_past/ packed_delta -.cache \ No newline at end of file +.cache +delta_outputs/ \ No newline at end of file diff --git a/cli.py b/cli.py index 17fef43..b0757b0 100644 --- a/cli.py +++ b/cli.py @@ -6,8 +6,8 @@ from modelutils import get_opt from evaluation import opt_eval from datautils import get_loaders -from save_and_load import save_lr_tensors, load_lr_tensors from core_compression import opt_delta_lr +from save_and_load import save_lr_tensors, load_lr_tensors @torch.no_grad() def quantize_with_lowrank(base_model, target_model, dataloader, rank, wbits, n_samples, decompose_only=False): diff --git a/core_compression_parallel.py b/core_compression_parallel.py deleted file mode 100644 index fca3c7d..0000000 --- a/core_compression_parallel.py +++ /dev/null @@ -1,143 +0,0 @@ -import torch -import torch.nn as nn -from tqdm import tqdm -from matq import TensorQ -from loguru import logger -from quant import Quantizer -from modelutils import find_layers -import multiprocessing as mp -@torch.no_grad() -def opt_delta_lr( - model, - delta_model, - dataloader, - nsamples, - wbits, - sym, - trits, - rank, - args - ): - device = model.device - print("Starting LR quantizer initialization...") - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.decoder.layers - delta_layers = delta_model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(device) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(device) - - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(device) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(device) - layers[0] = layers[0].to(device) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=device - ) - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - raise ValueError - layers[0] = Catcher(layers[0]) - for batch in dataloader: - try: - model(batch[0].to(device)) - except ValueError: - pass - layers[0] = layers[0].module - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() - model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.cpu() - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - original_outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - - logger.info("Ready, creating lr quantizers...") - quantizers = {} - l_quantizers = {} - lr_tensors = {} - # parallelize this to allocate to multiple GPUs - def process_layer(i, device): - layer = delta_layers[i].to(device) - original_layer = layers[i].to(device) - subset = find_layers(layer) - lr_gptq = {} - for name in subset: - lr_gptq[name] = TensorQ(subset[name], rank) - lr_gptq[name].quantizer = Quantizer() - lr_gptq[name].quantizer.configure( - wbits, - perchannel=True, - sym=sym, - mse=False, - trits = trits, - ) - lr_gptq[name].l_quantizer = Quantizer() - lr_gptq[name].l_quantizer.configure( - wbits, - perchannel=True, - sym=sym, - mse=False, - trits = trits, - ) - def add_batch(name): - def temp(_, inp, out): - lr_gptq[name].add_batch_lr(inp[0].data, out.data) - return temp - handles = [] - for name in subset: - handles.append(subset[name].register_forward_hook(add_batch(name))) - for j in range(nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - - original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - - for h in handles: - h.remove() - - for name in subset: - logger.info(f"Quantizing {name}...") - lr_gptq[name].lr_quant( - percdamp = args['percdamp'], - groupsize = args['groupsize'], - actorder = args['actorder'], - ) - lr_tensors[f'.model.decoder.layers.{i}.{name}'] = lr_gptq[name].R - lr_tensors[f'.model.decoder.layers.{i}.{name}'] = lr_gptq[name].L - - quantizers[f'model.decoder.layers.{i}.{name}'] = lr_gptq[name].quantizer - l_quantizers[f'model.decoder.layers.{i}.{name}'] = lr_gptq[name].l_quantizer - lr_gptq[name].free() - - for j in range(nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - layers[i] = layer.cpu() - del layer - del lr_gptq - torch.cuda.empty_cache() - inps, outs = original_outs, inps - num_workers = torch.cuda.device_count() - logger.info(f"Using {num_workers} workers...") - with mp.Pool(num_workers) as p: - p.starmap(process_layer, [(i, f'cuda:{i}') for i in range(num_workers)]) - - model.config.use_cache = use_cache - return quantizers, l_quantizers, lr_tensors \ No newline at end of file diff --git a/opt_delta.py b/opt_delta.py index 401c88b..8195606 100644 --- a/opt_delta.py +++ b/opt_delta.py @@ -3,12 +3,13 @@ import torch import pickle import torch.nn as nn - +from pack_utils import pack_to_bits, unpack_from_bits from gptq import * from modelutils import * from quant import * from transformers import AutoTokenizer, AutoModel import copy +from tensorio import TensorIO, model_packing #from prettytable import PrettyTable def get_opt(model): @@ -472,7 +473,7 @@ def sync(): def main(args): print(args) - packed_delta = None + tensor_io = TensorIO('sparse') num_params_saved_lr = 0 num_params = 0 if args.load: @@ -505,7 +506,7 @@ def main(args): comp_time = time.time()-tick else: quantizers = opt_sequential(model, dataloader, DEV) - + print(quantizers) if args.delta and args.wbits<16: for idx, (base_p, finetuned_p) in enumerate(zip(base_model.parameters(), model.parameters())): if args.sparsify_hard_threshold: @@ -521,9 +522,10 @@ def main(args): # num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) num_params += torch.numel(finetuned_p.data) # here we save a copy to pack, and save the delta only on disk - packed_delta = copy.deepcopy(finetuned_p.data) finetuned_p.data = (base_p.data + finetuned_p.data).clone() - + if args.save_delta: + new_weights = model_packing(model, quantizers, bits=args.wbits) + torch.save(new_weights, f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz") if args.benchmark: gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] if len(gpus) > 1: @@ -649,6 +651,9 @@ def main(args): '--save-hf', action='store_true', default=False, help='Whether to save a huggingface model' ) + parser.add_argument( + '--save-delta', action='store_true', default=False, + ) parser.add_argument( '--fraction_of_zero', type=float, default=0.99, help='Sparsity ratio' diff --git a/pack_utils.py b/pack_utils.py index a68407b..5c6ace9 100644 --- a/pack_utils.py +++ b/pack_utils.py @@ -12,6 +12,8 @@ def pack_to_bits( bits: int, groupsize = 1024 ): + if groupsize == -1: + groupsize = weight.shape[0] if bits not in [2,3,4,8]: raise ValueError("bits must be one of [2,3,4,8]") scales = quantizer.scale.t().contiguous() @@ -125,13 +127,6 @@ def __init__(self, m: torch.Tensor, format: str, minifloats: int=-1) -> None: self.format = format def _convert(self): - if self.minifloats>=2: - quantizer = Quantizer() - quantizer.configure( - self.minifloats, perchannel=True, sym=False, mse=False - ) - quantizer.find_params(self.m, weight=True) - self.m = quantizer.quantize(self.m) # flatten the matrix self.m = self.m.flatten() # get the indices of the non-zero elements diff --git a/pack_utils_test.py b/pack_utils_test.py index a6befb4..a4130e3 100644 --- a/pack_utils_test.py +++ b/pack_utils_test.py @@ -28,17 +28,18 @@ ) quantizer.find_params(b, weight=True) b_q = quantizer.quantize(b) - sparsed_b_q = hard_threshold(b_q, 0.99) + sparsed_b_q = hard_threshold(b_q, 0.01) q_weight = pack_to_bits(sparsed_b_q, quantizer, QUANTIZED_BITS, groupsize=sparsed_b_q.shape[0]) - sparse_t = SparseTensor(sparsed_b_q, 'wb1', minifloats=-1) + sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1) sparse_t.to_disk('.cache/sparse_b.safetensor') # now load it back restored_sparse_t = SparseTensor.from_disk('.cache/sparse_b.safetensor') restored_weight = restored_sparse_t.tensor + # this is what we restored from disk + restored_weight = unpack_from_bits(restored_weight, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0]) print(f"Original weight: {sparsed_b_q}") print(f"Restored weight: {restored_weight}") - print(torch.allclose(sparsed_b_q, restored_weight)) # count the number of non-zero elements print(f"Original weight: {sparsed_b_q.nonzero().shape[0]}") print(f"Restored weight: {restored_weight.nonzero().shape[0]}") \ No newline at end of file diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh index e73d717..f397417 100644 --- a/scripts/gptq_delta.sh +++ b/scripts/gptq_delta.sh @@ -5,64 +5,65 @@ CUDA_VISIBLE_DEVICES=0 python opt_delta.py \ --delta \ --sparsify_hard_threshold \ --fraction_of_zero 0.95 \ + --save-delta \ --save-hf \ - --groupsize 1024 & + --groupsize 1024 -CUDA_VISIBLE_DEVICES=1 python opt_delta.py \ - --dataset wikitext2 \ - --wbits 3 \ - --delta \ - --sparsify_hard_threshold \ - --fraction_of_zero 0.95 \ - --save-hf \ - --groupsize 1024 & +# CUDA_VISIBLE_DEVICES=1 python opt_delta.py \ +# --dataset wikitext2 \ +# --wbits 3 \ +# --delta \ +# --sparsify_hard_threshold \ +# --fraction_of_zero 0.95 \ +# --save-hf \ +# --groupsize 1024 & -CUDA_VISIBLE_DEVICES=2 python opt_delta.py \ - --dataset wikitext2 \ - --wbits 4 \ - --delta \ - --sparsify_hard_threshold \ - --fraction_of_zero 0.95 \ - --save-hf \ - --groupsize 1024 & +# CUDA_VISIBLE_DEVICES=2 python opt_delta.py \ +# --dataset wikitext2 \ +# --wbits 4 \ +# --delta \ +# --sparsify_hard_threshold \ +# --fraction_of_zero 0.95 \ +# --save-hf \ +# --groupsize 1024 & -CUDA_VISIBLE_DEVICES=3 python opt_delta.py \ - --dataset wikitext2 \ - --wbits 2 \ - --delta \ - --sparsify_hard_threshold \ - --fraction_of_zero 0.99 \ - --save-hf \ - --groupsize 1024 & +# CUDA_VISIBLE_DEVICES=3 python opt_delta.py \ +# --dataset wikitext2 \ +# --wbits 2 \ +# --delta \ +# --sparsify_hard_threshold \ +# --fraction_of_zero 0.99 \ +# --save-hf \ +# --groupsize 1024 & -CUDA_VISIBLE_DEVICES=4 python opt_delta.py \ - --dataset wikitext2 \ - --wbits 3 \ - --delta \ - --sparsify_hard_threshold \ - --fraction_of_zero 0.99 \ - --save-hf \ - --groupsize 1024 & +# CUDA_VISIBLE_DEVICES=4 python opt_delta.py \ +# --dataset wikitext2 \ +# --wbits 3 \ +# --delta \ +# --sparsify_hard_threshold \ +# --fraction_of_zero 0.99 \ +# --save-hf \ +# --groupsize 1024 & -CUDA_VISIBLE_DEVICES=5 python opt_delta.py \ - --dataset wikitext2 \ - --wbits 4 \ - --delta \ - --sparsify_hard_threshold \ - --fraction_of_zero 0.99 \ - --save-hf \ - --groupsize 1024 & +# CUDA_VISIBLE_DEVICES=5 python opt_delta.py \ +# --dataset wikitext2 \ +# --wbits 4 \ +# --delta \ +# --sparsify_hard_threshold \ +# --fraction_of_zero 0.99 \ +# --save-hf \ +# --groupsize 1024 & -CUDA_VISIBLE_DEVICES=6 python opt_delta.py \ - --dataset wikitext2 \ - --wbits 3 \ - --delta \ - --save-hf \ - --groupsize 1024 & +# CUDA_VISIBLE_DEVICES=6 python opt_delta.py \ +# --dataset wikitext2 \ +# --wbits 3 \ +# --delta \ +# --save-hf \ +# --groupsize 1024 & -CUDA_VISIBLE_DEVICES=7 python opt_delta.py \ - --dataset wikitext2 \ - --wbits 4 \ - --delta \ - --save-hf \ - --groupsize 1024 & \ No newline at end of file +# CUDA_VISIBLE_DEVICES=7 python opt_delta.py \ +# --dataset wikitext2 \ +# --wbits 4 \ +# --delta \ +# --save-hf \ +# --groupsize 1024 & \ No newline at end of file diff --git a/tensorio.py b/tensorio.py new file mode 100644 index 0000000..26b5fab --- /dev/null +++ b/tensorio.py @@ -0,0 +1,54 @@ +import math +import torch +from safetensors import safe_open +from safetensors.torch import save_model +from modelutils import find_layers +from pack_utils import pack_to_bits + +class TensorIO(): + def __init__(self, format: str, tensors=None) -> None: + self.format = format + if tensors is None: + self.tensors = {} + else: + self.tensors = tensors + def add_tensor(self, idx, tensor): + tensor = tensor.flatten() + # assume that the tensor is sparse + indices = torch.nonzero(tensor) + values = tensor[indices] + self.tensors[f"{idx}_indices"] = indices + self.tensors[f"{idx}_values"] = values + self.tensors[f"{idx}_size"] = torch.tensor(tensor.size()) + + def to_disk(self, path): + torch.save(self.tensors, path) + + @classmethod + def from_disk(cls, path): + tensors = {} + with safe_open(path, framework='pt', device='cpu') as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key) + # restore the tensors + for key in tensors.keys(): + m = torch.zeros(math.prod(tensors[f"{key}_size"]), dtype=tensors[f'{key}_values'].dtype) + + m[tensors[f"{key}_indices"]] = tensors[f"{key}_values"] + tensors[f"{key}_size"] = tensors[f"{key}_size"].tolist() + m = m.reshape(tensors[f"{key}_size"]) + + tensors[key] = m + return cls('sparse', tensors=tensors) + +def model_packing(model, quantizers, bits): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + qlayers = find_layers(model, ) + print('Packing ...') + new_weights = {} + for name in qlayers: + quantizers[name] = quantizers[name].cpu() + new_weights[name] = pack_to_bits(layers[name].weight.data, quantizers[name], bits, groupsize=-1) + print('Done.') + return new_weights \ No newline at end of file From 750dcb018753b5c1c290a8a108f32714c7409c00 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Fri, 12 May 2023 13:07:03 +0000 Subject: [PATCH 18/23] autotuning --- compress_utils.py | 351 +++++++++++++++++++++++ gptq.py | 44 +-- opt_delta.py | 22 +- opt_delta_autotune.py | 632 ++++++++++++++++++++++++++++++++++++++++++ pack_utils_test.py | 2 +- playground.py | 110 ++------ scripts/gptq_delta.sh | 2 +- tensorio.py | 18 +- 8 files changed, 1053 insertions(+), 128 deletions(-) create mode 100644 compress_utils.py create mode 100644 opt_delta_autotune.py diff --git a/compress_utils.py b/compress_utils.py new file mode 100644 index 0000000..20e20f1 --- /dev/null +++ b/compress_utils.py @@ -0,0 +1,351 @@ +import cupy +import math +import torch +import numpy as np +from torch.utils.dlpack import to_dlpack, from_dlpack +from quant import Quantizer +def cupy_to_tensor(x): + return from_dlpack(x.toDlpack()) + +def tensor_to_cupy(x): + return cupy.fromDlpack(to_dlpack(x)) + +def pack_uint8_tensor(x): + if x.device != torch.device('cpu'): + return cupy_to_tensor( + cupy.packbits(tensor_to_cupy(x)) + ) + else: + return torch.from_numpy( + np.packbits(x.numpy()) + ) + +def unpack_uint8_tensor(x): + if x.device != torch.device('cpu'): + return cupy_to_tensor( + cupy.unpackbits(tensor_to_cupy(x)) + ) + else: + return torch.from_numpy( + np.unpackbits(x.numpy()) + ) + +def pack_low_bit_tensor(x, bits): + + if x.device != torch.device('cpu'): + assert x.dtype == torch.uint8 + y = cupy.packbits( + cupy.unpackbits(tensor_to_cupy(x)).reshape(*x.shape, 8)[..., -bits:] + ) + y = cupy_to_tensor(y) + else: + y = np.packbits( + np.unpackbits(x.numpy()).reshape(*x.shape, 8)[..., -bits:] + ) + y = torch.from_numpy(y) + + return y + +def unpack_low_bit_tensor(x, bits, original_shape): + if x.device != torch.device('cpu'): + y = cupy.packbits(cupy.pad( + cupy.unpackbits( + tensor_to_cupy(x) + )[:np.prod(original_shape)*bits].reshape(-1, bits), + ((0,0), (8-bits, 0)) + )) + y = cupy_to_tensor(y).view(original_shape) + else: + y = np.packbits(np.pad( + np.unpackbits( + x.numpy() + )[:np.prod(original_shape)*bits].reshape(-1, bits), + ((0,0), (8-bits, 0)) + )) + y = torch.from_numpy(y).view(original_shape) + return y + + +def pin_memory(array): + mem = cupy.cuda.alloc_pinned_memory(array.nbytes) + ret = np.frombuffer(mem, array.dtype, array.size).reshape(array.shape) + ret[...] = array + return ret + + +def _rounding(x, stochastic=False, minimum_stochastic_distance=0.2): + if stochastic: + x_floor = x.floor() + th = x - x_floor + if minimum_stochastic_distance > 0: + th[th1-minimum_stochastic_distance] = 1. + pr = torch.rand_like(x) + x_floor += (pr < th) + return x_floor + else: + return x.round() + + +def _compress_nbits(x, bits, scale_method='max', scale_dims=(0,1), + stochastic=False, minimum_stochastic_distance=0.2): + + fbits = bits - 1 + + if scale_method == 'max': + # issue: sensitive to outlier points + scale = x.abs().amax(scale_dims, keepdims=True) + elif scale_method == 'l2': + # ~95% confidence interval for normal distribution + scale = x.pow(2).mean(scale_dims, keepdims=True).sqrt() * 2 + else: + raise Exception('unkonwn scale method.') + # fp16 should be enough + scale = scale.half() + x = x / (scale + 1e-6) + + x = x.ldexp(torch.tensor(fbits)) + clip_min = -(1<> 4) + x1 = (x & bitmask) + + x = torch.cat([x0, x1], -1) + + x = _decompress_nbits(x, scale, bits=4) + + return x + + +def compress_2bit(x, scale_method='max', scale_dims=(0,1)): + + x, scale = _compress_nbits(x, bits=2, scale_method=scale_method, scale_dims=scale_dims) + + x0, x1, x2, x3 = x.chunk(4, -1) + x = (x0 << 6) + (x1 << 4) + (x2 << 2) + x3 + + return x, scale + + +def decompress_2bit(x, scale): + + bitmask = 3 + + x0 = (x >> 6) + x1 = (x >> 4) & bitmask + x2 = (x >> 2) & bitmask + x3 = x & bitmask + x = torch.cat([x0, x1, x2, x3], -1) + + x = _decompress_nbits(x, scale, bits=2) + + return x + + + +def compress_flexible_nbits(x, bits, scale_method='max', scale_dims=(0,1)): + # support any bits + # CUDA only + + x, scale = _compress_nbits(x, bits=bits, scale_method=scale_method, scale_dims=scale_dims) + + x = pack_low_bit_tensor(x, bits) + + return x, scale + + +def decompress_flexible_nbits(x, scale, bits, original_shape): + # support any bits, but need to know original_shape + # CUDA only + + x = unpack_low_bit_tensor(x, bits, original_shape) + + x = _decompress_nbits(x, scale, bits=bits) + + return x + + + +def compress_nbits(x, bits, scale_method='max', scale_dims=(0,1)): + if bits == 8: + x, scale = compress_8bit(x, scale_method=scale_method, scale_dims=scale_dims) + elif bits == 4: + x, scale = compress_4bit(x, scale_method=scale_method, scale_dims=scale_dims) + elif bits == 2: + x, scale = compress_2bit(x, scale_method=scale_method, scale_dims=scale_dims) + + return x, scale + + +def decompress_nbits(x, scale, bits): + if bits == 8: + y = decompress_8bit(x, scale) + elif bits == 4: + y = decompress_4bit(x, scale) + elif bits == 2: + y = decompress_2bit(x, scale) + + return y + +def _compress_nbits_by_bucket(x, bits, scale_method='max', bucket_size=512, + stochastic=False, minimum_stochastic_distance=0.2): + + if bits == 1: + + x = x.view(bucket_size, -1) + + scale = (x.norm(dim=0) / (bucket_size**0.5)).half() + + x = (x >= 0) + + x = x.type(torch.uint8) + + return x, scale + + + fbits = bits - 1 + + x = x.view(bucket_size, -1) + + if scale_method == 'max': + # issue: sensitive to outlier points + scale = x.abs().amax([0], keepdims=True) + elif scale_method == 'l2': + # ~95% confidence interval for normal distribution + scale = x.pow(2).mean([0], keepdims=True).sqrt() * 2 + else: + raise Exception('unkonwn scale method.') + # fp16 should be enough + scale = scale.half() + x = x / (scale + 1e-6) + + x = x.ldexp(torch.tensor(fbits)) + clip_min = -(1< x.numel(): + bucket_size = x.numel() + + x, scale = _compress_nbits_by_bucket( + x, bits=bits, scale_method=scale_method, bucket_size=bucket_size, + stochastic=stochastic, minimum_stochastic_distance=minimum_stochastic_distance) + + x = pack_low_bit_tensor(x, bits) + + return x, scale + + +def decompress_flexible_nbits_by_bucket(x, scale, bits, original_shape, bucket_size=512): + # support any bits, but need to know original_shape + # CUDA only + + + numel = math.prod(original_shape) + if bucket_size > numel: + bucket_size = numel + + + if bits == 1: + + x = unpack_low_bit_tensor(x, bits, original_shape) + x = x.view(bucket_size, -1) + x = (x.half() - 0.5)*2 + x = x * scale.unsqueeze(0) + x = x.view(original_shape) + + # print('done') + + return x + + x = unpack_low_bit_tensor(x, bits, original_shape) + + x = x.view(bucket_size, -1) + x = _decompress_nbits(x, scale, bits=bits) + x = x.view(original_shape) + + return x + +if __name__=="__main__": + + x = torch.randn((512, 512), dtype=torch.float32, device='cuda') + + print("original") + print(x) + quantizer = Quantizer() + quantizer.configure( + 4, perchannel=True, sym=False, mse=False + ) + quantizer.find_params(x, weight=True) + b_q = quantizer.quantize(x) + + packed_x, scale = compress_flexible_nbits(b_q, 4) + unpacked_x = decompress_flexible_nbits(packed_x,scale=scale, bits=4, original_shape=x.shape) + + print("unpacked") + print(unpacked_x) + print(f"are they equal? {torch.allclose(x, unpacked_x)}") \ No newline at end of file diff --git a/gptq.py b/gptq.py index 8f719e1..f57edb0 100644 --- a/gptq.py +++ b/gptq.py @@ -11,9 +11,18 @@ torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False +def hard_threshold(x, fraction_of_zero=0.1): + y, _ = torch.sort(x.view(-1).abs().clone()) + num_params = torch.numel(x) + thresh_index = int(num_params * fraction_of_zero) + threshold = y[thresh_index] + mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) + return mask * x + class GPTQ: def __init__(self, layer): self.layer = layer + self.original_weight = layer.weight.data.clone() self.dev = self.layer.weight.device W = layer.weight.data.clone() if isinstance(self.layer, nn.Conv2d): @@ -26,26 +35,16 @@ def __init__(self, layer): self.nsamples = 0 def add_batch(self, inp, out): - if DEBUG: - self.inp1 = inp - self.out1 = out + self.inp1 = inp + self.out1 = out if len(inp.shape) == 2: inp = inp.unsqueeze(0) tmp = inp.shape[0] - if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): + if isinstance(self.layer, nn.Linear): if len(inp.shape) == 3: inp = inp.reshape((-1, inp.shape[-1])) inp = inp.t() - if isinstance(self.layer, nn.Conv2d): - unfold = nn.Unfold( - self.layer.kernel_size, - dilation=self.layer.dilation, - padding=self.layer.padding, - stride=self.layer.stride - ) - inp = unfold(inp) - inp = inp.permute([1, 0, 2]) - inp = inp.flatten(1) + self.H *= self.nsamples / (self.nsamples + tmp) self.nsamples += tmp # inp = inp.float() @@ -54,7 +53,7 @@ def add_batch(self, inp, out): self.H += inp.matmul(inp.t()) def fasterquant( - self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False + self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=-1 ): W = self.layer.weight.data.clone() if isinstance(self.layer, nn.Conv2d): @@ -69,7 +68,8 @@ def fasterquant( self.quantizer.find_params(W, weight=True) H = self.H - del self.H + if write: + del self.H dead = torch.diag(H) == 0 H[dead, dead] = 1 W[:, dead] = 0 @@ -141,9 +141,15 @@ def fasterquant( if isinstance(self.layer, transformers.Conv1D): Q = Q.t() - self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) - if DEBUG: - print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + # here report the loss of the quantized layer vs. the original layer + new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype) + if sparsity >= 0: + sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=sparsity) + else: + sparsed_new_weight = new_weight + if write: + self.layer.weight.data = sparsed_new_weight + return torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2) def free(self): if DEBUG: diff --git a/opt_delta.py b/opt_delta.py index 8195606..38e3547 100644 --- a/opt_delta.py +++ b/opt_delta.py @@ -42,7 +42,7 @@ def opt_sequential_delta(model, delta_model, dataloader, dev): layers = model.model.decoder.layers delta_layers = delta_model.model.decoder.layers - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: @@ -506,13 +506,13 @@ def main(args): comp_time = time.time()-tick else: quantizers = opt_sequential(model, dataloader, DEV) - print(quantizers) + if args.delta and args.wbits<16: for idx, (base_p, finetuned_p) in enumerate(zip(base_model.parameters(), model.parameters())): - if args.sparsify_hard_threshold: - print('Hard Thresholding...') - W = finetuned_p.data - finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) + # if args.sparsify_hard_threshold: + # print('Hard Thresholding...') + # W = finetuned_p.data + # finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) # if args.rank>0 and len(finetuned_p.shape) == 2: # print('Finding Low Rank Approximation...') # A = finetuned_p.data.float() @@ -521,11 +521,15 @@ def main(args): # finetuned_p.data = A.half() # num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) num_params += torch.numel(finetuned_p.data) - # here we save a copy to pack, and save the delta only on disk finetuned_p.data = (base_p.data + finetuned_p.data).clone() + if args.save_delta: - new_weights = model_packing(model, quantizers, bits=args.wbits) - torch.save(new_weights, f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz") + new_weights, scale = model_packing(model, quantizers, bits=args.wbits) + torch.save({ + 'weight': new_weights, + 'scale': scale, + }, f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz") + if args.benchmark: gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] if len(gpus) > 1: diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py new file mode 100644 index 0000000..db2b8a3 --- /dev/null +++ b/opt_delta_autotune.py @@ -0,0 +1,632 @@ +import copy +import time +import json +import torch +import pickle +from gptq import * +from quant import * +import torch.nn as nn +from modelutils import * +from loguru import logger +from tensorio import TensorIO, model_packing +from transformers import AutoTokenizer, AutoModel +# from prettytable import PrettyTable + +def get_opt(model): + import torch + + def skip(*args, **kwargs): + pass + + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import OPTForCausalLM + + # model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto') + model = OPTForCausalLM.from_pretrained(model, torch_dtype=torch.float16) + model.seqlen = model.config.max_position_embeddings + return model + + +def hard_threshold(x, fraction_of_zero=0.1): + y, _ = torch.sort(x.view(-1).abs().clone()) + num_params = torch.numel(x) + thresh_index = int(num_params * fraction_of_zero) + threshold = y[thresh_index] + mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) + return mask * x + + +@torch.no_grad() +def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.1): + search_space = { + "wbits": [2,3,4], + "sparsities": [-1, 0.33, 0.5, 0.67, 0.9] + } + base_floats = 16 + compression_rates = {} + for wbit in search_space['wbits']: + for sparsity in search_space['sparsities']: + compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity) if sparsity >=0 else base_floats / wbit + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.decoder.layers + delta_layers = delta_model.model.decoder.layers + + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) + model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) + + if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {"i": 0, "attention_mask": None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + + def forward(self, inp, **kwargs): + inps[cache["i"]] = inp + cache["i"] += 1 + cache["attention_mask"] = kwargs["attention_mask"] + raise ValueError + + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() + model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() + if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.cpu() + if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache["attention_mask"] + + print("Ready.") + tuned_params = {} + tuned_configs = {} + quantizers = {} + for i in range(len(delta_layers)): + layer = delta_layers[i].to(dev) + original_layer = layers[i].to(dev) + subset = find_layers(layer) + for name in subset: + tuned_params[f'{i}_{name}'] = {} + tuned_configs[f'{i}_{name}'] = {} + for wbit in search_space['wbits']: + for sparsity in search_space['sparsities']: + tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}'] = { + 'gptq': GPTQ(subset[name]) + } + + tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].quantizer = Quantizer() + + tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].quantizer.configure( + wbit, perchannel=True, sym=args.sym, mse=False, trits=args.trits + ) + + def add_batch(name): + def tmp(_, inp, out): + for wbit in search_space['wbits']: + for sparsity in search_space['sparsities']: + tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].add_batch(inp[0].data, out.data) + return tmp + + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer( + inps[j].unsqueeze(0), attention_mask=attention_mask + )[0] + + for h in handles: + h.remove() + + for name in subset: + logger.info(f"Quantizing {i}.{name} ...") + for wbit in search_space['wbits']: + for sparsity in search_space['sparsities']: + loss=tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].fasterquant( + percdamp=args.percdamp, + groupsize=args.groupsize, + actorder=args.act_order, + write=False + ) + tuned_configs[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}'] = { + 'loss': loss.item() + } + logger.info(f"wbit: {wbit}; sparsity: {sparsity}; loss: {loss}") + # within the tol, pick the minimal wbit and maximal sparsity + best_wbit = None + best_sparsity = None + best_loss = None + # starting from the minimal compression rate + compression_rates = sorted(compression_rates.items(), key=lambda x: x[1], reverse=True) + # loop through all compression rates: + for cr in compression_rates: + config = cr[0] + wbit = int(config.split('_')[0].split('.')[1]) + sparsity = float(config.split('_')[1].replace('sparsity.','')) + # find the corresponding loss + loss = tuned_configs[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['loss'] + # if the loss is within the tolerance + if loss <= tol: + best_wbit = wbit + best_sparsity = sparsity + break + # if not, pick the lowest compression rate + if best_wbit is None: + best_wbit = int(compression_rates[-1][0].split('_')[0].split('.')[1]) + best_sparsity = float(compression_rates[-1][0].split('_')[1].replace('sparsity.','')) + + best_loss = tuned_configs[f'{i}_{name}'][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['loss'] + # redo the actual work + logger.info(f"Applying wbit={best_wbit}, sparsity={best_sparsity} ...") + tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['gptq'].fasterquant( + percdamp=args.percdamp, + groupsize=args.groupsize, + actorder=args.act_order, + write=True + ) + quantizers["model.decoder.layers.%d.%s" % (i, name)] = tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}_sparsity.{sparsity}']['gptq'].quantizer + tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['gptq'].free() + tuned_configs[f'{i}_{name}']['choice'] = { + 'best_wbit': best_wbit, + 'best_sparsity': best_sparsity, + 'best_loss': best_loss + } + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer( + inps[j].unsqueeze(0), attention_mask=attention_mask + )[0] + + layers[i] = layer.cpu() + del layer + for key in tuned_params.keys(): + if key.startswith(f'{i}_'): + for wbit in search_space['wbits']: + for sparsity in search_space['sparsities']: + del tuned_params[key][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'] + torch.cuda.empty_cache() + + inps, outs = original_outs, inps + + model.config.use_cache = use_cache + + return quantizers, tuned_configs + +@torch.no_grad() +def opt_eval(model, testenc, dev): + print("Evaluating ...") + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.decoder.layers + + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) + model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) + if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {"i": 0, "attention_mask": None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + + def forward(self, inp, **kwargs): + inps[cache["i"]] = inp + cache["i"] += 1 + cache["attention_mask"] = kwargs["attention_mask"] + raise ValueError + + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() + model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() + if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.cpu() + if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache["attention_mask"] + + for i in range(len(layers)): + # print(i) + layer = layers[i].to(dev) + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.decoder.final_layer_norm is not None: + model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to( + dev + ) + if model.model.decoder.project_out is not None: + model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.decoder.final_layer_norm is not None: + hidden_states = model.model.decoder.final_layer_norm(hidden_states) + if model.model.decoder.project_out is not None: + hidden_states = model.model.decoder.project_out(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) + ) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + model.config.use_cache = use_cache + return ppl.item() + + +# TODO: perform packing on GPU +def opt_pack3(model, quantizers): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant3(model, quantizers, faster=args.faster_kernel) + qlayers = find_layers(model, [Quant3Linear]) + print("Packing ...") + for name in qlayers: + print(name) + quantizers[name] = quantizers[name].cpu() + qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero) + print("Done.") + return model + + +def load_quant3(model, checkpoint): + from transformers import OPTConfig, OPTForCausalLM + + config = OPTConfig.from_pretrained(model) + + def noop(*args, **kwargs): + pass + + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = OPTForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ["model.decoder.project_out", "model.decoder.project_in", "lm_head"]: + if name in layers: + del layers[name] + make_quant3(model, layers, faster=args.faster_kernel) + + print("Loading model ...") + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = model.config.max_position_embeddings + print("Done.") + + return model + +def benchmark(model, input_ids, check=False): + input_ids = input_ids.to(model.gpus[0] if hasattr(model, "gpus") else DEV) + torch.cuda.synchronize() + cache = {"past": None} + + def clear_past(i): + def tmp(layer, inp, out): + if cache["past"]: + cache["past"][i] = None + + return tmp + + for i, layer in enumerate(model.model.decoder.layers): + layer.register_forward_hook(clear_past(i)) + + print("Benchmarking ...") + + if check: + loss = nn.CrossEntropyLoss() + tot = 0.0 + + def sync(): + if hasattr(model, "gpus"): + for gpu in model.gpus: + torch.cuda.synchronize(gpu) + else: + torch.cuda.synchronize() + + with torch.no_grad(): + attention_mask = torch.ones((1, input_ids.numel()), device=DEV) + times = [] + for i in range(input_ids.numel()): + tick = time.time() + out = model( + input_ids[:, i].reshape(-1), + past_key_values=cache["past"], + attention_mask=attention_mask[:, : (i + 1)].reshape((1, -1)), + ) + sync() + times.append(time.time() - tick) + print(i, times[-1]) + if check and i != input_ids.numel() - 1: + tot += loss( + out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV) + ).float() + cache["past"] = list(out.past_key_values) + del out + sync() + import numpy as np + + print("Median:", np.median(times)) + if check: + print("PPL:", torch.exp(tot / (input_ids.numel() - 1)).item()) + + +def main(args): + print(args) + num_params = 0 + if args.load: + model = load_quant3(args.model, args.load) + else: + if args.delta and args.wbits < 16: + model = get_opt(args.model) + model.eval() + base_model = get_opt(args.base_model) + base_model.eval() + dataloader, testloader = get_loaders( + args.dataset, + nsamples=args.nsamples, + seed=args.seed, + model=args.model, + seqlen=model.seqlen, + ) + original_finetuned_model = copy.deepcopy(model) + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + finetuned_p.data = (finetuned_p.data - base_p.data).clone() + else: + model = get_opt(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, + nsamples=args.nsamples, + seed=args.seed, + model=args.model, + seqlen=model.seqlen, + ) + + if args.wbits < 16: + if args.delta: + tick = time.time() + quantizers, tuned_params = opt_sequential_delta( + original_finetuned_model, model, dataloader, DEV + ) + with open(".cache/tuned_params.json", "w+") as f: + json.dump(tuned_params, f) + comp_time = time.time() - tick + else: + raise NotImplementedError + if args.delta and args.wbits < 16: + for idx, (base_p, finetuned_p) in enumerate( + zip(base_model.parameters(), model.parameters()) + ): + num_params += torch.numel(finetuned_p.data) + finetuned_p.data = (base_p.data + finetuned_p.data).clone() + + if args.save_delta: + new_weights, scale = model_packing(model, quantizers, bits=args.wbits) + torch.save( + { + "weight": new_weights, + "scale": scale, + }, + f"delta_outputs/{args.model.replace('/', '.')}_delta_{args.wbits}bits_sparsify.{args.sparsify_hard_threshold}_{args.fraction_of_zero}foz", + ) + + if args.benchmark: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, : args.benchmark] + benchmark(model, input_ids, check=args.check) + if args.load: + exit() + + dataset = args.dataset + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + ppl = opt_eval(model, testloader, DEV) + print(ppl) + + if args.save_hf: + if args.delta: + hf_path = f"outputs/{args.model.replace('/', '.')}_delta_autotune" + else: + hf_path = f"outputs/{args.model.replace('/', '.')}_{args.wbits}bits" + model.save_pretrained(hf_path) + tokenizer = AutoTokenizer.from_pretrained(args.model) + tokenizer.save_pretrained(hf_path) + else: + opt_pack3(model, quantizers) + torch.save(model.state_dict(), args.save) + + +if __name__ == "__main__": + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model", + type=str, + default="lnair/opt-1.3b-wikitext2", + help="OPT model to load; pass `facebook/opt-X`.", + ) + parser.add_argument( + "--dataset", + type=str, + choices=["wikitext2", "ptb", "c4"], + default="wikitext2", + help="Where to extract calibration data from.", + ) + parser.add_argument( + "--base-model", + type=str, + default="facebook/opt-1.3b", + help="base OPT model to load", + ) + parser.add_argument( + "--seed", type=int, default=0, help="Seed for sampling the calibration data." + ) + parser.add_argument( + "--nsamples", type=int, default=128, help="Number of calibration data samples." + ) + parser.add_argument( + "--percdamp", + type=float, + default=0.01, + help="Percent of the average Hessian diagonal to use for dampening.", + ) + parser.add_argument( + "--wbits", + type=int, + default=2, + choices=[2, 3, 4, 16], + help="#bits to use for quantization; use 16 for evaluating base model.", + ) + parser.add_argument( + "--trits", action="store_true", help="Whether to use trits for quantization." + ) + parser.add_argument( + "--groupsize", + type=int, + default=-1, + help="Groupsize to use for quantization; default uses full row.", + ) + parser.add_argument( + "--sym", action="store_true", help="Whether to perform symmetric quantization." + ) + parser.add_argument( + "--save", + type=str, + default="", + help="Save quantized checkpoint under this name.", + ) + parser.add_argument("--load", type=str, default="", help="Load quantized model.") + parser.add_argument( + "--benchmark", + type=int, + default=0, + help="Number of tokens to use for benchmarking.", + ) + parser.add_argument( + "--check", + action="store_true", + help="Whether to compute perplexity during benchmarking for verification.", + ) + parser.add_argument( + "--new-eval", + action="store_true", + help="Whether to use the new PTB and C4 eval.", + ) + parser.add_argument( + "--faster-kernel", + action="store_true", + help="Whether to use the new faster kernel for benchmarking.", + ) + parser.add_argument( + "--act-order", + action="store_true", + help="Whether to apply the activation order GPTQ heuristic", + ) + parser.add_argument( + "--delta", action="store_true", help="Whether to use delta compression" + ) + parser.add_argument( + "--sparsify_hard_threshold", action="store_true", help="Whether to add sparsity" + ) + parser.add_argument( + "--save-hf", + action="store_true", + default=False, + help="Whether to save a huggingface model", + ) + parser.add_argument( + "--save-delta", + action="store_true", + default=False, + ) + parser.add_argument( + "--fraction_of_zero", type=float, default=0.99, help="Sparsity ratio" + ) + parser.add_argument( + "--rank", + type=int, + default=0, + help="The rank to use for decomposing each matrices", + ) + args = parser.parse_args() + + # results = PrettyTable() + + main(args) + + print("finished.") diff --git a/pack_utils_test.py b/pack_utils_test.py index a4130e3..58f1ad0 100644 --- a/pack_utils_test.py +++ b/pack_utils_test.py @@ -5,7 +5,7 @@ from safetensors.torch import save_file from opt_delta import hard_threshold -QUANTIZED_BITS = 3 +QUANTIZED_BITS = 4 if __name__=="__main__": """ diff --git a/playground.py b/playground.py index ebeecac..60557c9 100644 --- a/playground.py +++ b/playground.py @@ -1,90 +1,20 @@ -import torch -import numpy as np -import torch.nn as nn - -def quantize(x, scale, zero, maxq): - if maxq < 0: - return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero - q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) - return scale * (q - zero) - -class Quantizer(nn.Module): - - def __init__(self, shape=1): - super(Quantizer, self).__init__() - self.register_buffer('maxq', torch.tensor(0)) - self.register_buffer('scale', torch.zeros(shape)) - self.register_buffer('zero', torch.zeros(shape)) - - def configure( - self, - bits, perchannel=False, sym=True, - mse=False, norm=2.4, grid=100, maxshrink=.8, - ): - self.maxq = torch.tensor(2 ** bits - 1) - self.perchannel = perchannel - self.sym = sym - self.mse = mse - self.norm = norm - self.grid = grid - self.maxshrink = maxshrink - - def find_params(self, x, weight=False): - dev = x.device - self.maxq = self.maxq.to(dev) - shape = x.shape - if self.perchannel: - if weight: - x = x.flatten(1) - - tmp = torch.zeros(x.shape[0], device=dev) - xmin = torch.minimum(x.min(1)[0], tmp) - xmax = torch.maximum(x.max(1)[0], tmp) - - if self.sym: - xmax = torch.maximum(torch.abs(xmin), xmax) - tmp = xmin < 0 - if torch.any(tmp): - xmin[tmp] = -xmax[tmp] - tmp = (xmin == 0) & (xmax == 0) - xmin[tmp] = -1 - xmax[tmp] = +1 - - if self.maxq < 0: - self.scale = xmax - self.zero = xmin - else: - self.scale = (xmax - xmin) / self.maxq - if self.sym: - self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) - else: - self.zero = torch.round(-xmin / self.scale) - - if weight: - shape = [-1] + [1] * (len(shape) - 1) - self.scale = self.scale.reshape(shape) - self.zero = self.zero.reshape(shape) - return - - def quantize(self, x): - if self.ready(): - return quantize(x, self.scale, self.zero, self.maxq) - return x - - def enabled(self): - return self.maxq > 0 - - def ready(self): - return torch.all(self.scale != 0) - -q = torch.tensor([[1,2,3,4,5,6,7,8]]).float() -quantizer = Quantizer() -quantizer.configure( - 3, perchannel=True, sym=False, mse=False -) -quantizer.find_params(q, weight=True) -b_q = quantizer.quantize(q) -# now since b_q is 8 3-bit floats, we can pack them into 3 8-bit integers -packed_b_q = torch.zeros(3, dtype=torch.uint8) -for i in range(3): - packed_b_q[i] = b_q[0][i*8:(i+1)*8].byte().sum() +base_floats = 16 +search_space = { + "wbits": [2,3,4], + "sparsities": [-1, 0.33, 0.5, 0.67, 0.9] +} +compression_rates = {} +for wbit in search_space['wbits']: + for sparsity in search_space['sparsities']: + if sparsity == -1: + sparsity = 0 + compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity) + +compression_rates = sorted(compression_rates.items(), key=lambda x: x[1]) + +for cr in compression_rates: + config = cr[0] + print(config) + wbit = int(config.split('_')[0].split('.')[1]) + sparsity = float(config.split('_')[1].replace('sparsity.','')) + print(f'wbit: {wbit}, sparsity: {sparsity}, compression rate: {cr[1]}') \ No newline at end of file diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh index f397417..cb93a55 100644 --- a/scripts/gptq_delta.sh +++ b/scripts/gptq_delta.sh @@ -1,5 +1,5 @@ ts -S 8 -CUDA_VISIBLE_DEVICES=0 python opt_delta.py \ +CUDA_VISIBLE_DEVICES=0 python opt_delta_autotune.py \ --dataset wikitext2 \ --wbits 2 \ --delta \ diff --git a/tensorio.py b/tensorio.py index 26b5fab..46cdd2f 100644 --- a/tensorio.py +++ b/tensorio.py @@ -4,6 +4,7 @@ from safetensors.torch import save_model from modelutils import find_layers from pack_utils import pack_to_bits +from compress_utils import compress_flexible_nbits, decompress_flexible_nbits class TensorIO(): def __init__(self, format: str, tensors=None) -> None: @@ -12,6 +13,7 @@ def __init__(self, format: str, tensors=None) -> None: self.tensors = {} else: self.tensors = tensors + def add_tensor(self, idx, tensor): tensor = tensor.flatten() # assume that the tensor is sparse @@ -20,7 +22,7 @@ def add_tensor(self, idx, tensor): self.tensors[f"{idx}_indices"] = indices self.tensors[f"{idx}_values"] = values self.tensors[f"{idx}_size"] = torch.tensor(tensor.size()) - + def to_disk(self, path): torch.save(self.tensors, path) @@ -33,7 +35,7 @@ def from_disk(cls, path): # restore the tensors for key in tensors.keys(): m = torch.zeros(math.prod(tensors[f"{key}_size"]), dtype=tensors[f'{key}_values'].dtype) - + m[tensors[f"{key}_indices"]] = tensors[f"{key}_values"] tensors[f"{key}_size"] = tensors[f"{key}_size"].tolist() m = m.reshape(tensors[f"{key}_size"]) @@ -41,14 +43,14 @@ def from_disk(cls, path): tensors[key] = m return cls('sparse', tensors=tensors) -def model_packing(model, quantizers, bits): +def model_packing(model, quantizers, bits, reformat='none'): layers = find_layers(model) layers = {n: layers[n] for n in quantizers} qlayers = find_layers(model, ) print('Packing ...') - new_weights = {} for name in qlayers: - quantizers[name] = quantizers[name].cpu() - new_weights[name] = pack_to_bits(layers[name].weight.data, quantizers[name], bits, groupsize=-1) - print('Done.') - return new_weights \ No newline at end of file + if name in quantizers: + quantizers[name] = quantizers[name].cpu() + x, scale = compress_flexible_nbits(layers[name].weight.data.cuda(), bits) + return x, scale + From dc05d074f0568f13afd92b16e90558cbce9761ab Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Fri, 12 May 2023 15:06:12 +0000 Subject: [PATCH 19/23] auto tuning --- gptq.py | 20 ++++-- opt_delta_autotune.py | 61 +++++++++-------- replay.py | 0 scripts/gptq_delta.sh | 4 +- scripts/playground.ipynb | 143 ++------------------------------------- 5 files changed, 50 insertions(+), 178 deletions(-) create mode 100644 replay.py diff --git a/gptq.py b/gptq.py index f57edb0..f798eda 100644 --- a/gptq.py +++ b/gptq.py @@ -3,7 +3,7 @@ import torch import transformers import torch.nn as nn - +from loguru import logger from quant import quantize DEBUG = False @@ -16,7 +16,7 @@ def hard_threshold(x, fraction_of_zero=0.1): num_params = torch.numel(x) thresh_index = int(num_params * fraction_of_zero) threshold = y[thresh_index] - mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) + mask = x.abs().clone().gt(threshold).type(torch.cuda.HalfTensor) return mask * x class GPTQ: @@ -53,7 +53,7 @@ def add_batch(self, inp, out): self.H += inp.matmul(inp.t()) def fasterquant( - self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=-1 + self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=None ): W = self.layer.weight.data.clone() if isinstance(self.layer, nn.Conv2d): @@ -143,13 +143,19 @@ def fasterquant( Q = Q.t() # here report the loss of the quantized layer vs. the original layer new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype) - if sparsity >= 0: - sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=sparsity) - else: + losses = {} + if sparsity is None: sparsed_new_weight = new_weight + losses[0] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2) + else: + for s_sity in sparsity: + if write: + logger.info(f"HT with: sparsity={s_sity}") + sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=s_sity) + losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2) if write: self.layer.weight.data = sparsed_new_weight - return torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2) + return losses def free(self): if DEBUG: diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py index db2b8a3..02f6334 100644 --- a/opt_delta_autotune.py +++ b/opt_delta_autotune.py @@ -39,16 +39,18 @@ def hard_threshold(x, fraction_of_zero=0.1): @torch.no_grad() -def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.1): +def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2): search_space = { "wbits": [2,3,4], - "sparsities": [-1, 0.33, 0.5, 0.67, 0.9] + "sparsities": [0.0, 0.33, 0.5, 0.67, 0.9, 0.95] } base_floats = 16 compression_rates = {} for wbit in search_space['wbits']: for sparsity in search_space['sparsities']: compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity) if sparsity >=0 else base_floats / wbit + compression_rates = sorted(compression_rates.items(), key=lambda x: x[1], reverse=True) + use_cache = model.config.use_cache model.config.use_cache = False layers = model.model.decoder.layers @@ -113,22 +115,20 @@ def forward(self, inp, **kwargs): tuned_params[f'{i}_{name}'] = {} tuned_configs[f'{i}_{name}'] = {} for wbit in search_space['wbits']: - for sparsity in search_space['sparsities']: - tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}'] = { - 'gptq': GPTQ(subset[name]) - } + tuned_params[f'{i}_{name}'][f'wbit.{wbit}'] = { + 'gptq': GPTQ(subset[name]) + } - tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].quantizer = Quantizer() + tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].quantizer = Quantizer() - tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].quantizer.configure( - wbit, perchannel=True, sym=args.sym, mse=False, trits=args.trits - ) + tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].quantizer.configure( + wbit, perchannel=True, sym=args.sym, mse=False, trits=args.trits + ) def add_batch(name): def tmp(_, inp, out): for wbit in search_space['wbits']: - for sparsity in search_space['sparsities']: - tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].add_batch(inp[0].data, out.data) + tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].add_batch(inp[0].data, out.data) return tmp handles = [] @@ -147,23 +147,23 @@ def tmp(_, inp, out): for name in subset: logger.info(f"Quantizing {i}.{name} ...") for wbit in search_space['wbits']: - for sparsity in search_space['sparsities']: - loss=tuned_params[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'].fasterquant( - percdamp=args.percdamp, - groupsize=args.groupsize, - actorder=args.act_order, - write=False - ) - tuned_configs[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{sparsity}'] = { - 'loss': loss.item() + losses=tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].fasterquant( + percdamp=args.percdamp, + groupsize=args.groupsize, + actorder=args.act_order, + sparsity = search_space['sparsities'], + write=False, + ) + for s_sity in losses.keys(): + tuned_configs[f'{i}_{name}'][f'wbit.{wbit}_sparsity.{s_sity}'] = { + 'loss': losses[s_sity].item() } - logger.info(f"wbit: {wbit}; sparsity: {sparsity}; loss: {loss}") + logger.info(f"wbit: {wbit}; sparsity: {s_sity}; loss: {losses[s_sity].item()}") # within the tol, pick the minimal wbit and maximal sparsity best_wbit = None best_sparsity = None best_loss = None # starting from the minimal compression rate - compression_rates = sorted(compression_rates.items(), key=lambda x: x[1], reverse=True) # loop through all compression rates: for cr in compression_rates: config = cr[0] @@ -180,18 +180,20 @@ def tmp(_, inp, out): if best_wbit is None: best_wbit = int(compression_rates[-1][0].split('_')[0].split('.')[1]) best_sparsity = float(compression_rates[-1][0].split('_')[1].replace('sparsity.','')) - + if best_sparsity == -1: + best_sparsity = -1 best_loss = tuned_configs[f'{i}_{name}'][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['loss'] # redo the actual work logger.info(f"Applying wbit={best_wbit}, sparsity={best_sparsity} ...") - tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['gptq'].fasterquant( + tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}']['gptq'].fasterquant( percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order, - write=True + write=True, + sparsity = [best_sparsity], ) - quantizers["model.decoder.layers.%d.%s" % (i, name)] = tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}_sparsity.{sparsity}']['gptq'].quantizer - tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['gptq'].free() + quantizers["model.decoder.layers.%d.%s" % (i, name)] = tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}']['gptq'].quantizer + tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}']['gptq'].free() tuned_configs[f'{i}_{name}']['choice'] = { 'best_wbit': best_wbit, 'best_sparsity': best_sparsity, @@ -208,8 +210,7 @@ def tmp(_, inp, out): for key in tuned_params.keys(): if key.startswith(f'{i}_'): for wbit in search_space['wbits']: - for sparsity in search_space['sparsities']: - del tuned_params[key][f'wbit.{wbit}_sparsity.{sparsity}']['gptq'] + del tuned_params[key][f'wbit.{wbit}']['gptq'] torch.cuda.empty_cache() inps, outs = original_outs, inps diff --git a/replay.py b/replay.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh index cb93a55..0795956 100644 --- a/scripts/gptq_delta.sh +++ b/scripts/gptq_delta.sh @@ -2,9 +2,9 @@ ts -S 8 CUDA_VISIBLE_DEVICES=0 python opt_delta_autotune.py \ --dataset wikitext2 \ --wbits 2 \ + --base-model facebook/opt-2.7b \ + --model lnair/opt-2.7b-wikitext2 \ --delta \ - --sparsify_hard_threshold \ - --fraction_of_zero 0.95 \ --save-delta \ --save-hf \ --groupsize 1024 diff --git a/scripts/playground.ipynb b/scripts/playground.ipynb index 9939233..1751996 100644 --- a/scripts/playground.ipynb +++ b/scripts/playground.ipynb @@ -1,150 +1,15 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "seed=42\n", - "target_model_name = \"lnair/opt-1.3b-wikitext2\"\n", - "base_model_name = \"facebook/opt-1.3b\"\n", - "n_samples = 128\n", - "dataset = 'wikitext2'\n", - "sys.path.append('..')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/xiayao/miniconda3/envs/fmzip/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "data": { - "text/plain": [ - "OPTForCausalLM(\n", - " (model): OPTModel(\n", - " (decoder): OPTDecoder(\n", - " (embed_tokens): Embedding(50272, 2048, padding_idx=1)\n", - " (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)\n", - " (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", - " (layers): ModuleList(\n", - " (0-23): 24 x OPTDecoderLayer(\n", - " (self_attn): OPTAttention(\n", - " (k_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " (v_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " (q_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " (out_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", - " )\n", - " (activation_fn): ReLU()\n", - " (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", - " (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n", - " (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n", - " (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (lm_head): Linear(in_features=2048, out_features=50272, bias=False)\n", - ")" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from modelutils import get_opt\n", - "base_model = get_opt(base_model_name)\n", - "target_model = get_opt(target_model_name)\n", - "base_model.to('cuda')\n", - "target_model.to('cuda')\n", - "base_model.eval()\n", - "target_model.eval()" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset wikitext (/home/xiayao/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n", - "Found cached dataset wikitext (/home/xiayao/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n" - ] - } - ], - "source": [ - "from datautils import get_loaders\n", - "trainloader, loader_enc = get_loaders(\n", - " dataset,\n", - " nsamples = n_samples,\n", - " seed=seed,\n", - " model=target_model_name,\n", - " seqlen=base_model.seqlen,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from cli import quantize_with_lowrank\n", - "r_quantizer, l_quantizer, lr_tensors = quantize_with_lowrank(\n", - " base_model,\n", - " target_model,\n", - " trainloader,\n", - " 32\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." - ] - } - ], "source": [ - "import torch\n", - "from safetensors import safe_open\n", - "from safetensors.torch import save_file\n", - "\n", - "# iterate over all keys in lr_tensors\n", - "for k in lr_tensors.keys():\n", - " lr_tensors[k] = lr_tensors[k].contiguous() # make sure they are contiguous\n", - "# save them to a file\n", - "\n", - "save_file(lr_tensors, \"model.safetensors\")" + "import json\n", + "with open(\"../.cache/tuned_params.json\", \"r\") as fp:\n", + " data = json.load(fp)\n", + "print(data.keys())" ] } ], From 7ce5f6cc625cbca0b8759e79124fdaea6e739528 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Sat, 13 May 2023 18:14:36 +0000 Subject: [PATCH 20/23] packing, bitmap, etc.... --- .gitignore | 3 +- docs/number.md | 20 ++++++ gptq.py | 20 ++++-- opt_delta_autotune.py | 21 +++--- playground.py | 90 ++++++++++++++++++++----- scripts/gptq_delta.sh | 69 ++----------------- scripts/playground.ipynb | 7 +- submit.py | 24 ++----- utilities/analyze.py | 16 +++++ utilities/compression_rate_estimator.py | 53 +++++++++++++++ utilities/tuning_analyser.py | 6 ++ 11 files changed, 208 insertions(+), 121 deletions(-) create mode 100644 utilities/analyze.py create mode 100644 utilities/compression_rate_estimator.py create mode 100644 utilities/tuning_analyser.py diff --git a/.gitignore b/.gitignore index 87c07aa..f73b20e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ outputs/ outputs_past/ packed_delta .cache -delta_outputs/ \ No newline at end of file +delta_outputs/ +.io/ \ No newline at end of file diff --git a/docs/number.md b/docs/number.md index e69de29..ec5bac9 100644 --- a/docs/number.md +++ b/docs/number.md @@ -0,0 +1,20 @@ +In theory: + With a matrix of size 2048 * 2048, 10% elements are non-zero. The original bits is 2048 * 2048 * 16 = 16 * 4M + + To store the indices of non-zero elements, it takes 2048 * 2048 * 10% * log2(2048 * 2048) ~= 2.2 * 4M + + Considering indices only, we achieve 16 / 2.2 ~= 7.3x compression ratio + +In practice: + Saving a matrix of size 2048 * 2048, 10% elements are non-zero takes 8M bytes on disk (with torch.save). + + Saving packed indices takes 1.9M on disk, achieving 17 / 1.9 ~= 8.9x compression ratio. + + With zip, the packed indices takes 1.1M on disk, achieving 17 / 1.1 ~= 15.5x compression ratio. + + +256 x 256 -> 64k fp 16 -> 128k on disk ok. +log2(2048*2048) = 22. 3 int8 for each index. + +0.4M * 3 = 1.2M +""" \ No newline at end of file diff --git a/gptq.py b/gptq.py index f798eda..34e987d 100644 --- a/gptq.py +++ b/gptq.py @@ -6,12 +6,14 @@ from loguru import logger from quant import quantize -DEBUG = False +DEBUG = False torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False def hard_threshold(x, fraction_of_zero=0.1): + if fraction_of_zero == 0: + return x y, _ = torch.sort(x.view(-1).abs().clone()) num_params = torch.numel(x) thresh_index = int(num_params * fraction_of_zero) @@ -124,15 +126,16 @@ def fasterquant( W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) if DEBUG: - self.layer.weight.data[:, :i2] = Q[:, :i2] - self.layer.weight.data[:, i2:] = W[:, i2:] - print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - print(torch.sum(Losses)) + pass + #self.layer.weight.data[:, :i2] = Q[:, :i2] + #self.layer.weight.data[:, i2:] = W[:, i2:] + #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + #print(torch.sum(Losses)) torch.cuda.synchronize() total_time = time.time() - tick # print('time %.2f' % total_time) - error = torch.sum(Losses).item() + # error = torch.sum(Losses).item() # print('error', error) if actorder: @@ -153,6 +156,11 @@ def fasterquant( logger.info(f"HT with: sparsity={s_sity}") sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=s_sity) losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2) + if losses[s_sity] > 100: + logger.info(f"{sparsed_new_weight}") + logger.info(f"{new_weight}") + logger.info(f"{sparsed_new_weight.shape}") + logger.info(f"{torch.max(torch.abs(self.inp1 @ (sparsed_new_weight.T) - self.out1))}") if write: self.layer.weight.data = sparsed_new_weight return losses diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py index 02f6334..4e2911e 100644 --- a/opt_delta_autotune.py +++ b/opt_delta_autotune.py @@ -37,7 +37,6 @@ def hard_threshold(x, fraction_of_zero=0.1): mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) return mask * x - @torch.no_grad() def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2): search_space = { @@ -48,7 +47,7 @@ def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2): compression_rates = {} for wbit in search_space['wbits']: for sparsity in search_space['sparsities']: - compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity) if sparsity >=0 else base_floats / wbit + compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity) compression_rates = sorted(compression_rates.items(), key=lambda x: x[1], reverse=True) use_cache = model.config.use_cache @@ -163,7 +162,7 @@ def tmp(_, inp, out): best_wbit = None best_sparsity = None best_loss = None - # starting from the minimal compression rate + # starting from the maximal compression rate # loop through all compression rates: for cr in compression_rates: config = cr[0] @@ -183,7 +182,7 @@ def tmp(_, inp, out): if best_sparsity == -1: best_sparsity = -1 best_loss = tuned_configs[f'{i}_{name}'][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['loss'] - # redo the actual work + # redo the actual work, and write to the layer logger.info(f"Applying wbit={best_wbit}, sparsity={best_sparsity} ...") tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}']['gptq'].fasterquant( percdamp=args.percdamp, @@ -456,9 +455,9 @@ def main(args): if args.delta: tick = time.time() quantizers, tuned_params = opt_sequential_delta( - original_finetuned_model, model, dataloader, DEV + original_finetuned_model, model, dataloader, DEV, args.tol ) - with open(".cache/tuned_params.json", "w+") as f: + with open(f".cache/{args.model.replace('/', '.')}_delta_tol={args.tol}.json", "w+") as f: json.dump(tuned_params, f) comp_time = time.time() - tick else: @@ -498,9 +497,9 @@ def main(args): if args.save_hf: if args.delta: - hf_path = f"outputs/{args.model.replace('/', '.')}_delta_autotune" + hf_path = f"outputs/{args.model.replace('/', '.')}_delta_autotune_tol={args.tol}" else: - hf_path = f"outputs/{args.model.replace('/', '.')}_{args.wbits}bits" + hf_path = f"outputs/{args.model.replace('/', '.')}_autotuned_tol={args.tol}" model.save_pretrained(hf_path) tokenizer = AutoTokenizer.from_pretrained(args.model) tokenizer.save_pretrained(hf_path) @@ -534,6 +533,12 @@ def main(args): default="facebook/opt-1.3b", help="base OPT model to load", ) + parser.add_argument( + "--tol", + type=float, + default=0.2, + help="Tolerance of the loss per layer", + ) parser.add_argument( "--seed", type=int, default=0, help="Seed for sampling the calibration data." ) diff --git a/playground.py b/playground.py index 60557c9..3b15c6d 100644 --- a/playground.py +++ b/playground.py @@ -1,20 +1,74 @@ -base_floats = 16 -search_space = { - "wbits": [2,3,4], - "sparsities": [-1, 0.33, 0.5, 0.67, 0.9] -} -compression_rates = {} -for wbit in search_space['wbits']: - for sparsity in search_space['sparsities']: - if sparsity == -1: - sparsity = 0 - compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity) +import math +import torch +import numpy as np +import torchvision.transforms as T -compression_rates = sorted(compression_rates.items(), key=lambda x: x[1]) +def bin_array(num, m): + """Convert a positive integer num into an m-bit bit vector""" + return np.array(list(np.binary_repr(num).zfill(m))).astype(np.int8) -for cr in compression_rates: - config = cr[0] - print(config) - wbit = int(config.split('_')[0].split('.')[1]) - sparsity = float(config.split('_')[1].replace('sparsity.','')) - print(f'wbit: {wbit}, sparsity: {sparsity}, compression rate: {cr[1]}') \ No newline at end of file +def hard_threshold(x, fraction_of_zero=0.1): + if fraction_of_zero == 0: + return x + y, _ = torch.sort(x.view(-1).abs().clone()) + num_params = torch.numel(x) + thresh_index = int(num_params * fraction_of_zero) + threshold = y[thresh_index] + mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) + transform = T.ToPILImage() + + # convert the tensor to PIL image using above transform + binmask = transform(mask) + binmask = binmask.convert('1') + binmask.save('.io/binmask.bmp') + return mask * x + +def packing_indices(x): + matrix_size = x.shape[0] * x.shape[1] + y = torch.zeros(x.shape) + y = y.flatten() + # find indices of non-zero elements + x = x.clone().flatten() + indices = torch.nonzero(x) + # assume matrix is a power of 2 + bit_width = int(math.log2(matrix_size)) + # turn into a python tensor with boolean values + indices_binary = torch.tensor(np.array([bin_array(i, bit_width) for i in indices])) + packed_indices = torch.tensor(np.packbits(indices_binary, axis=1), dtype=torch.uint8) + return packed_indices + +def unpacking_indices(packed_indices): + # unpack with numpy + unpacked_indices = np.unpackbits(packed_indices, axis=1) + # convert bits back to indices + unpacked_indices = torch.tensor(np.array([int("".join(map(str, i)), 2) for i in unpacked_indices])) + return unpacked_indices + +def compression_rate_calc(msize, wbit, sparsity): + original_bit_used = msize * 16 + nonzeros = msize * sparsity + to_store_value = nonzeros * wbit + to_store_index = nonzeros * math.log2(msize) + print("original_bit_used: ", original_bit_used) + print("to_store_value: ", to_store_value) + print("to_store_index: ", to_store_index) + print("compression rate: ", original_bit_used / (to_store_value + to_store_index)) + return original_bit_used, to_store_value, to_store_index + +if __name__=="__main__": + base_floats = 16 + wbits = 3 + m_size = 2048 + nonsparsity = 0.9 + x = torch.randn((m_size, m_size), dtype=torch.float16) + torch.save(x, ".io/x.pt") + x = hard_threshold(x, nonsparsity) + # 10% x 4M indices -> 800k on disk + packed_indices = packing_indices(x) + print(packed_indices.shape) + print(packed_indices.shape) + print(packed_indices.dtype) + torch.save(packed_indices, ".io/packed_indices.pt") + unpacked_indices = unpacking_indices(packed_indices) + + compression_rate_calc(2048*2048, 3, 0.1) \ No newline at end of file diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh index 0795956..07e84be 100644 --- a/scripts/gptq_delta.sh +++ b/scripts/gptq_delta.sh @@ -1,69 +1,10 @@ -ts -S 8 -CUDA_VISIBLE_DEVICES=0 python opt_delta_autotune.py \ +python opt_delta_autotune.py \ --dataset wikitext2 \ - --wbits 2 \ - --base-model facebook/opt-2.7b \ - --model lnair/opt-2.7b-wikitext2 \ + --base-model facebook/opt-350m \ + --model lnair/opt-350m-wikitext2 \ --delta \ + --wbits 2 \ + --tol 2 \ --save-delta \ --save-hf \ --groupsize 1024 - -# CUDA_VISIBLE_DEVICES=1 python opt_delta.py \ -# --dataset wikitext2 \ -# --wbits 3 \ -# --delta \ -# --sparsify_hard_threshold \ -# --fraction_of_zero 0.95 \ -# --save-hf \ -# --groupsize 1024 & - -# CUDA_VISIBLE_DEVICES=2 python opt_delta.py \ -# --dataset wikitext2 \ -# --wbits 4 \ -# --delta \ -# --sparsify_hard_threshold \ -# --fraction_of_zero 0.95 \ -# --save-hf \ -# --groupsize 1024 & - -# CUDA_VISIBLE_DEVICES=3 python opt_delta.py \ -# --dataset wikitext2 \ -# --wbits 2 \ -# --delta \ -# --sparsify_hard_threshold \ -# --fraction_of_zero 0.99 \ -# --save-hf \ -# --groupsize 1024 & - -# CUDA_VISIBLE_DEVICES=4 python opt_delta.py \ -# --dataset wikitext2 \ -# --wbits 3 \ -# --delta \ -# --sparsify_hard_threshold \ -# --fraction_of_zero 0.99 \ -# --save-hf \ -# --groupsize 1024 & - -# CUDA_VISIBLE_DEVICES=5 python opt_delta.py \ -# --dataset wikitext2 \ -# --wbits 4 \ -# --delta \ -# --sparsify_hard_threshold \ -# --fraction_of_zero 0.99 \ -# --save-hf \ -# --groupsize 1024 & - -# CUDA_VISIBLE_DEVICES=6 python opt_delta.py \ -# --dataset wikitext2 \ -# --wbits 3 \ -# --delta \ -# --save-hf \ -# --groupsize 1024 & - -# CUDA_VISIBLE_DEVICES=7 python opt_delta.py \ -# --dataset wikitext2 \ -# --wbits 4 \ -# --delta \ -# --save-hf \ -# --groupsize 1024 & \ No newline at end of file diff --git a/scripts/playground.ipynb b/scripts/playground.ipynb index 1751996..86c66d6 100644 --- a/scripts/playground.ipynb +++ b/scripts/playground.ipynb @@ -5,12 +5,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "import json\n", - "with open(\"../.cache/tuned_params.json\", \"r\") as fp:\n", - " data = json.load(fp)\n", - "print(data.keys())" - ] + "source": [] } ], "metadata": { diff --git a/submit.py b/submit.py index ce84f1f..1830099 100644 --- a/submit.py +++ b/submit.py @@ -1,30 +1,18 @@ import os model_relations = { - # 'facebook/opt-350m': ['lnair/opt-350m-wikitext2'], + #'facebook/opt-350m': ['lnair/opt-350m-wikitext2'], # 'facebook/opt-1.3b': ['lnair/opt-1.3b-wikitext2'], # 'facebook/opt-2.7b': ['lnair/opt-2.7b-wikitext2'], 'facebook/opt-6.7b': ['mit-han-lab/opt-6.7b-smoothquant'], - # 'facebook/opt-13b': ['KoboldAI/OPT-13B-Erebus'], - # 'facebook/opt-30b': ['KoboldAI/OPT-30B-Erebus'] - # 'facebook/opt-1.3b': ['facebook/opt-iml-1.3b', 'facebook/opt-iml-max-1.3b', 'mit-han-lab/opt-1.3b-smoothquant', 'pszemraj/opt-peter-1.3B', 'opentensor/bt-opt-1.3b'] + 'facebook/opt-1.3b': ['facebook/opt-iml-1.3b', 'facebook/opt-iml-max-1.3b', 'mit-han-lab/opt-1.3b-smoothquant', 'pszemraj/opt-peter-1.3B', 'opentensor/bt-opt-1.3b'] } -wbits_settings = [2,3,4] +tols = [4.5, 5, 6.5, 8.0] -sparsity_settings = [0, 0.95, 0.99] -os.system("ts -S 8") -for model in model_relations.keys(): - for target_model in model_relations[model]: - for wbits in wbits_settings: - for sparsity in sparsity_settings: - if sparsity == 0: - cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --delta --wbits {wbits} --model {target_model} --base-model {model} --save-hf --groupsize 1024" - else: - cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --delta --wbits {wbits} --model {target_model} --base-model {model} --sparsify_hard_threshold --fraction_of_zero {sparsity} --save-hf --groupsize 1024" - os.system(cmd) +os.system("ts -S 7") for model in model_relations.keys(): for target_model in model_relations[model]: - for wbits in wbits_settings: - cmd = f"ts --gpus 1 python opt_delta.py --dataset wikitext2 --model {target_model} --base-model {model} --save-hf --groupsize 1024" + for tol in tols: + cmd = f"TS_VISIBLE_DEVICES=0,2,3,4,5,6,7 ts --gpus 1 python opt_delta_autotune.py --dataset wikitext2 --delta --tol {tol} --model {target_model} --base-model {model} --save-hf --groupsize 1024" os.system(cmd) \ No newline at end of file diff --git a/utilities/analyze.py b/utilities/analyze.py new file mode 100644 index 0000000..57c69d1 --- /dev/null +++ b/utilities/analyze.py @@ -0,0 +1,16 @@ +import json +import matplotlib.pyplot as plt + +with open(".cache/lnair.opt-350m-wikitext2_delta_tol=2.0.json", "r") as fp: + data = json.load(fp) + +all_best_losses = [] +for layer_name in data.keys(): + best_loss = data[layer_name]['choice']['best_loss'] + all_best_losses.append(best_loss) + if (best_loss > 100): + print(f"{layer_name} large loss!") +print(all_best_losses) +# plot a histogram of the best losses +plt.hist(all_best_losses, bins=100) +plt.savefig('.cache/lnair.opt-350m-wikitext2_delta_tol=2.0.png') \ No newline at end of file diff --git a/utilities/compression_rate_estimator.py b/utilities/compression_rate_estimator.py new file mode 100644 index 0000000..90adc48 --- /dev/null +++ b/utilities/compression_rate_estimator.py @@ -0,0 +1,53 @@ +import json +import math +from modelutils import get_opt, find_layers +from compression_scripts.model_utils import get_opt, find_layers + + +base_floats = 16 + + +base_floats = 16 + +def calc_compression(path: str, base_model: str): + base_model = get_opt(base_model) + with open(path, "r") as f: + data = json.load(f) + + base_layers = base_model.model.decoder.layers + + total_original_bits = 0 + total_used_bits = 0 + sparsity_lists = [] + total_stats = {} + for i in range(len(base_layers)): + layer = base_layers[i] + subset = find_layers(layer) + for name in subset: + original_weight = subset[name].weight.data + original_weight_count = original_weight.numel() + total_original_bits += original_weight_count * base_floats + if f"{i}_{name}" in data: + config = data[f"{i}_{name}"]["choice"] + # save them as indices + values pair + nonzeros = (1-config["best_sparsity"]) * original_weight_count + # to store values + used_bits = nonzeros * config["best_wbit"] + # to store indices + used_bits += nonzeros * 2 * math.log2(original_weight_count) * 8 + + sparsity_lists.append(config["best_sparsity"]) + + total_used_bits += used_bits + else: + raise ValueError(f"Layer {i}_{name} not found in {path}") + + total_stats['compresion_rate'] = total_original_bits / total_used_bits + total_stats['sparsity'] = sum(sparsity_lists) / len(sparsity_lists) + return total_stats + +if __name__=="__main__": + path = ".cache/lnair.opt-1.3b-wikitext2_delta_tol=1.0.json" + base_model = "facebook/opt-1.3b" + stats = calc_compression(path, base_model) + print(stats) \ No newline at end of file diff --git a/utilities/tuning_analyser.py b/utilities/tuning_analyser.py new file mode 100644 index 0000000..a7a658b --- /dev/null +++ b/utilities/tuning_analyser.py @@ -0,0 +1,6 @@ +import json +path = ".cache/lnair.opt-350m-wikitext2_delta_tol=0.2.json" + +with open(path, "r") as fp: + data = json.load(fp) + From 07739c4dbd73e8cff9926b2965d239256463b4aa Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Sun, 14 May 2023 02:13:08 +0000 Subject: [PATCH 21/23] updates on autotuning --- autotune_gptq.py | 175 +++++++++++ compress_utils.py | 1 + gptj_delta_autotuned.py | 624 ++++++++++++++++++++++++++++++++++++++++ gptq.py | 15 +- opt_delta.py | 1 - opt_delta_autotune.py | 138 +++------ pack_utils_test.py | 36 +-- quant.py | 10 +- tensorio.py | 3 +- 9 files changed, 865 insertions(+), 138 deletions(-) create mode 100644 autotune_gptq.py create mode 100644 gptj_delta_autotuned.py diff --git a/autotune_gptq.py b/autotune_gptq.py new file mode 100644 index 0000000..34e987d --- /dev/null +++ b/autotune_gptq.py @@ -0,0 +1,175 @@ +import math +import time +import torch +import transformers +import torch.nn as nn +from loguru import logger +from quant import quantize + +DEBUG = False + +torch.backends.cuda.matmul.allow_tf32 = False +torch.backends.cudnn.allow_tf32 = False + +def hard_threshold(x, fraction_of_zero=0.1): + if fraction_of_zero == 0: + return x + y, _ = torch.sort(x.view(-1).abs().clone()) + num_params = torch.numel(x) + thresh_index = int(num_params * fraction_of_zero) + threshold = y[thresh_index] + mask = x.abs().clone().gt(threshold).type(torch.cuda.HalfTensor) + return mask * x + +class GPTQ: + def __init__(self, layer): + self.layer = layer + self.original_weight = layer.weight.data.clone() + self.dev = self.layer.weight.device + W = layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + self.rows = W.shape[0] + self.columns = W.shape[1] + self.H = torch.zeros((self.columns, self.columns), device=self.dev) + self.nsamples = 0 + + def add_batch(self, inp, out): + self.inp1 = inp + self.out1 = out + if len(inp.shape) == 2: + inp = inp.unsqueeze(0) + tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear): + if len(inp.shape) == 3: + inp = inp.reshape((-1, inp.shape[-1])) + inp = inp.t() + + self.H *= self.nsamples / (self.nsamples + tmp) + self.nsamples += tmp + # inp = inp.float() + inp = math.sqrt(2 / self.nsamples) * inp.float() + # self.H += 2 / self.nsamples * inp.matmul(inp.t()) + self.H += inp.matmul(inp.t()) + + def fasterquant( + self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, write=True, sparsity=None + ): + W = self.layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + W = W.float() + + tick = time.time() + + if not self.quantizer.ready(): + self.quantizer.find_params(W, weight=True) + + H = self.H + if write: + del self.H + dead = torch.diag(H) == 0 + H[dead, dead] = 1 + W[:, dead] = 0 + + if actorder: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + + Losses = torch.zeros_like(W) + Q = torch.zeros_like(W) + + damp = percdamp * torch.mean(torch.diag(H)) + diag = torch.arange(self.columns, device=self.dev) + H[diag, diag] += damp + H = torch.linalg.cholesky(H) + H = torch.cholesky_inverse(H) + H = torch.linalg.cholesky(H, upper=True) + Hinv = H + + for i1 in range(0, self.columns, blocksize): + i2 = min(i1 + blocksize, self.columns) + count = i2 - i1 + + W1 = W[:, i1:i2].clone() + Q1 = torch.zeros_like(W1) + Err1 = torch.zeros_like(W1) + Losses1 = torch.zeros_like(W1) + Hinv1 = Hinv[i1:i2, i1:i2] + + for i in range(count): + w = W1[:, i] + d = Hinv1[i, i] + + if groupsize != -1: + if (i1 + i) % groupsize == 0: + self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True) + + q = quantize( + w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq + ).flatten() + Q1[:, i] = q + Losses1[:, i] = (w - q) ** 2 / d ** 2 + + err1 = (w - q) / d + W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + Err1[:, i] = err1 + + Q[:, i1:i2] = Q1 + Losses[:, i1:i2] = Losses1 / 2 + + W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) + + if DEBUG: + pass + #self.layer.weight.data[:, :i2] = Q[:, :i2] + #self.layer.weight.data[:, i2:] = W[:, i2:] + #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + #print(torch.sum(Losses)) + + torch.cuda.synchronize() + total_time = time.time() - tick + # print('time %.2f' % total_time) + # error = torch.sum(Losses).item() + # print('error', error) + + if actorder: + invperm = torch.argsort(perm) + Q = Q[:, invperm] + + if isinstance(self.layer, transformers.Conv1D): + Q = Q.t() + # here report the loss of the quantized layer vs. the original layer + new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype) + losses = {} + if sparsity is None: + sparsed_new_weight = new_weight + losses[0] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2) + else: + for s_sity in sparsity: + if write: + logger.info(f"HT with: sparsity={s_sity}") + sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=s_sity) + losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2) + if losses[s_sity] > 100: + logger.info(f"{sparsed_new_weight}") + logger.info(f"{new_weight}") + logger.info(f"{sparsed_new_weight.shape}") + logger.info(f"{torch.max(torch.abs(self.inp1 @ (sparsed_new_weight.T) - self.out1))}") + if write: + self.layer.weight.data = sparsed_new_weight + return losses + + def free(self): + if DEBUG: + self.inp1 = None + self.out1 = None + self.H = None + self.Losses = None + self.Trace = None + torch.cuda.empty_cache() \ No newline at end of file diff --git a/compress_utils.py b/compress_utils.py index 20e20f1..341704f 100644 --- a/compress_utils.py +++ b/compress_utils.py @@ -4,6 +4,7 @@ import numpy as np from torch.utils.dlpack import to_dlpack, from_dlpack from quant import Quantizer + def cupy_to_tensor(x): return from_dlpack(x.toDlpack()) diff --git a/gptj_delta_autotuned.py b/gptj_delta_autotuned.py new file mode 100644 index 0000000..bd883b9 --- /dev/null +++ b/gptj_delta_autotuned.py @@ -0,0 +1,624 @@ + +import time +import math + +import torch +import torch.nn as nn +import transformers + +from gptq import * +from modelutils import * +from quant import * +import os +import copy + +def get_gptj(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import GPTJForCausalLM + model = GPTJForCausalLM.from_pretrained(model, torch_dtype=torch.float16) + model.seqlen = model.config.max_position_embeddings + print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad)) + return model + +@torch.no_grad() +def gptj_sequential(model, dataloader, dev, means=None, stds=None): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + #print(model.transformer.h) + layers = model.transformer.h + print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_sequential_delta(model, delta_model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.transformer.h + delta_layers = delta_model.transformer.h + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(delta_layers)): + layer = delta_layers[i].to(dev) + original_layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = original_outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + # print(model.transformer.h) + layers = model.transformer.h + print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache ['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev) + try: + # print(batch.shape) + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + model.transformer.ln_f = model.transformer.ln_f.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + hidden_states = model.transformer.ln_f(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + + model.config.use_cache = use_cache + +def gptj_pack(model, quantizers, wbits, groupsize): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant(model, quantizers, wbits, groupsize) + qlayers = find_layers(model, [QuantLinear]) + print('Packing ...') + for name in qlayers: + print(name) + quantizers[name],scale,zero = quantizers[name] + quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu() + qlayers[name].pack(layers[name], scale, zero) + print('Done!') + return model + +def load_quant(model, checkpoint, wbits, groupsize): + from transformers import GPTJConfig, GPTJForCausalLM + config = GPTJConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = GPTJForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits, groupsize) + + print('Loading model ...') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done!') + + return model + +def gptj_multigpu(model, gpus): + model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) + if hasattr(model.model, 'norm') and model.model.norm: + model.model.norm = model.model.norm.to(gpus[-1]) + import copy + model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) + + cache = {'mask': None} + + class MoveModule(nn.Module): + def __init__(self, module): + super().__init__() + self_module = module + self.dev = next(iter(self.module.parameters())).device + def forward(self, *inp, **kwargs): + inp = list(inp) + if inp[0].device != self.dev: + inp[0] = inp[0].to(self.dev) + if cache['mask'] is None or cache ['mask'].device != self.dev: + cache['mask'] = kwargs['attention_mask'].to(self.dev) + kwargs['attention_mask'] = cache['mask'] + tmp = self.module(*inp, **kwargs) + return tmp + + layers = model.model.layers + pergpu = math.ceil(len(layers) / len(gpus)) + for i in range(len(layers)): + layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) + + model.gpus = gpus + +def benchmark(model, input_ids, check=False): + input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) + torch.cuda.synchronize() + + cache = {'past': None} + def clear_past(i): + def tmp(layer, inp, out): + if cache['past']: + cache['past'][i] = None + return tmp + for i, layer in enumerate(model.model.layers): + layer.register_forward_hook(clear_past(i)) + + print('Benchmarking ...') + + if check: + loss = nn.CrossEntropyLoss() + tot = 0. + + def sync(): + if hasattr(model, 'gpus'): + for gpu in model.gpus: + torch.cuda.synchronize(gpu) + else: + torch.cuda.synchronize() + max_memory = 0 + with torch.no_grad(): + attention_mask = torch.ones((1, input_ids.numel()), device=DEV) + times = [] + for i in range(input_ids.numel()): + tick = time.time() + + out = model( + input_ids[:, i:i+1], + past_key_values=cache['past'], + attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)) + ) + sync() + times.append(time.time() - tick) + print(i, times[-1]) + max_memory = max(max_memory, torch, torch.cuda.memory_allocated() / 1024 /1024) + if check and i != input_ids.numel() - 1: + tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float() + cache['past'] = list(out.past_keys_values) + del out + sync() + import numpy as np + print('Median:', np.median(times)) + if check: + print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) + print('max memory(MiB):',max_memory) + + + +def main(args): + print(args) + num_params_saved_lr = 0 + num_params = 0 + if args.load: + model = load_quant3(args.model, args.load) + else: + if args.delta and args.wbits<16: + model = get_gptj(args.model) + model.eval() + base_model = get_gptj(args.base_model) + base_model.eval() + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + original_finetuned_model = copy.deepcopy(model) + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + finetuned_p.data = (finetuned_p.data-base_p.data).clone() + else: + model = get_gptj(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if args.wbits < 16 and not args.nearest: + if args.delta: + tick = time.time() + quantizers = gptj_sequential_delta(original_finetuned_model, model, dataloader, DEV) + + comp_time = time.time()-tick + else: + quantizers = gptj_sequential(model, dataloader, DEV) + + if args.delta and args.wbits<16: + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + if args.sparsify_hard_threshold: + print('Hard Thresholding...') + W = finetuned_p.data + finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) + if args.rank>0 and len(finetuned_p.shape) == 2: + print('Finding Low Rank Approximation...') + A = finetuned_p.data.float() + U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5) + A = U @ torch.diag_embed(S) @ Vh.T + finetuned_p.data = A.half() + num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) + num_params += torch.numel(finetuned_p.data) + finetuned_p.data = (base_p.data + finetuned_p.data).clone() + + if args.benchmark: + gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] + if len(gpus) > 1: + gptj_multigpu(model, gpus) + else: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, :args.benchmark] + benchmark(model, input_ids, check=args.check) + if args.load: + exit() + + dataset = args.dataset + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + ppl = gptj_eval(model, testloader, DEV) + print(ppl) + + if args.rank > 0: + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print("Number of params without low rank ", n_params) + print("Number of params with low rank", n_params - num_params_saved_lr) + if args.save: + gptj_pack(model, quantizers, args.wbits, args.groupsize) + torch.save(model.state_dict(), args.save) + return ppl + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + '--model', type=str, default='togethercomputer/GPT-JT-6B-v1', + help='GPT-J finetuned model to load; pass `togethercomputer/GPT-JT-6B-v1`.' + ) + parser.add_argument( + '--base_model', type=str, default='EleutherAI/gpt-j-6b', + help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.' + ) + parser.add_argument( + '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--save', type=str, default='', + help='Save the quantized GPT-J model under this name.' + ) + parser.add_argument( + '--save_safetensors', type=str, default='', + help='Save the quantized GPT-J model as a `.safetensors` ckpt' + ) + parser.add_argument( + '--load', type=str, default='', + help='Load the quantized GPT-J model' + ) + parser.add_argument( + '--benchmark', type=int, default=0, + help='Number of tokens to use for benchmarking.' + ) + parser.add_argument( + '--check', action='store_true', + help='Whether to compute perpexity during benchmarking for verification.' + ) + parser.add_argument( + '--delta', action='store_true', + help='Whether to use delta compression' + ) + parser.add_argument( + '--sparsify_hard_threshold', action='store_true', + help='Whether to add sparsity' + ) + parser.add_argument( + '--fraction_of_zero', type=float, default=0.99, + help='Sparsity ratio' + ) + parser.add_argument( + '--benchmark_results', type=str, default='', + help='store benchmark results' + ) + parser.add_argument( + '--sym', action='store_true', default=True, + help='Whether to use symmetric quantization' + ) + parser.add_argument( + '--trits', action='store_true', default=False, + help='Whether to use trits' + ) + parser.add_argument('--act_order', type=str, default=False) + + args = parser.parse_args() + + results = PrettyTable() + results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] + for n_bits in [4, 3, 2]: + ppls = [] + for dataset in ['wikitext2', 'ptb', 'c4']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits) + ppl = main(args) + ppls.append(ppl) + results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) + print(results) + with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: + f.write(str(results)) + print('finished.') \ No newline at end of file diff --git a/gptq.py b/gptq.py index 34e987d..80ea2d1 100644 --- a/gptq.py +++ b/gptq.py @@ -13,13 +13,13 @@ def hard_threshold(x, fraction_of_zero=0.1): if fraction_of_zero == 0: - return x + return x, None y, _ = torch.sort(x.view(-1).abs().clone()) num_params = torch.numel(x) thresh_index = int(num_params * fraction_of_zero) threshold = y[thresh_index] mask = x.abs().clone().gt(threshold).type(torch.cuda.HalfTensor) - return mask * x + return mask * x, mask class GPTQ: def __init__(self, layer): @@ -147,23 +147,20 @@ def fasterquant( # here report the loss of the quantized layer vs. the original layer new_weight = Q.reshape(self.layer.weight.shape).to(self.layer.weight.dtype) losses = {} + mask = None if sparsity is None: sparsed_new_weight = new_weight losses[0] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2) else: for s_sity in sparsity: + sparsed_new_weight, mask = hard_threshold(new_weight, fraction_of_zero=s_sity) if write: logger.info(f"HT with: sparsity={s_sity}") - sparsed_new_weight = hard_threshold(new_weight, fraction_of_zero=s_sity) losses[s_sity] = torch.sum((self.inp1 @ (sparsed_new_weight.T) - self.out1) ** 2) - if losses[s_sity] > 100: - logger.info(f"{sparsed_new_weight}") - logger.info(f"{new_weight}") - logger.info(f"{sparsed_new_weight.shape}") - logger.info(f"{torch.max(torch.abs(self.inp1 @ (sparsed_new_weight.T) - self.out1))}") + if write: self.layer.weight.data = sparsed_new_weight - return losses + return losses, mask def free(self): if DEBUG: diff --git a/opt_delta.py b/opt_delta.py index 38e3547..10a34d3 100644 --- a/opt_delta.py +++ b/opt_delta.py @@ -473,7 +473,6 @@ def sync(): def main(args): print(args) - tensor_io = TensorIO('sparse') num_params_saved_lr = 0 num_params = 0 if args.load: diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py index 4e2911e..ad49111 100644 --- a/opt_delta_autotune.py +++ b/opt_delta_autotune.py @@ -1,3 +1,4 @@ +import os import copy import time import json @@ -10,6 +11,7 @@ from loguru import logger from tensorio import TensorIO, model_packing from transformers import AutoTokenizer, AutoModel +import torchvision.transforms as T # from prettytable import PrettyTable def get_opt(model): @@ -28,15 +30,6 @@ def skip(*args, **kwargs): model.seqlen = model.config.max_position_embeddings return model - -def hard_threshold(x, fraction_of_zero=0.1): - y, _ = torch.sort(x.view(-1).abs().clone()) - num_params = torch.numel(x) - thresh_index = int(num_params * fraction_of_zero) - threshold = y[thresh_index] - mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) - return mask * x - @torch.no_grad() def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2): search_space = { @@ -45,10 +38,15 @@ def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2): } base_floats = 16 compression_rates = {} + masks = {} for wbit in search_space['wbits']: for sparsity in search_space['sparsities']: compression_rates[f'wbit.{wbit}_sparsity.{sparsity}'] = (base_floats / wbit) / (1 - sparsity) - compression_rates = sorted(compression_rates.items(), key=lambda x: x[1], reverse=True) + compression_rates = sorted( + compression_rates.items(), + key=lambda x: x[1], + reverse=True + ) use_cache = model.config.use_cache model.config.use_cache = False @@ -146,7 +144,7 @@ def tmp(_, inp, out): for name in subset: logger.info(f"Quantizing {i}.{name} ...") for wbit in search_space['wbits']: - losses=tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].fasterquant( + losses, _ =tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].fasterquant( percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order, @@ -184,13 +182,16 @@ def tmp(_, inp, out): best_loss = tuned_configs[f'{i}_{name}'][f'wbit.{best_wbit}_sparsity.{best_sparsity}']['loss'] # redo the actual work, and write to the layer logger.info(f"Applying wbit={best_wbit}, sparsity={best_sparsity} ...") - tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}']['gptq'].fasterquant( + loss, mask = tuned_params[f'{i}_{name}'][f'wbit.{best_wbit}']['gptq'].fasterquant( percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order, write=True, sparsity = [best_sparsity], ) + if mask is not None: + masks[f'{i}_{name}'] = mask + quantizers["model.decoder.layers.%d.%s" % (i, name)] = tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}']['gptq'].quantizer tuned_params[f"{i}_{name}"][f'wbit.{best_wbit}']['gptq'].free() tuned_configs[f'{i}_{name}']['choice'] = { @@ -216,7 +217,7 @@ def tmp(_, inp, out): model.config.use_cache = use_cache - return quantizers, tuned_configs + return quantizers, tuned_configs, masks @torch.no_grad() def opt_eval(model, testenc, dev): @@ -316,53 +317,6 @@ def forward(self, inp, **kwargs): model.config.use_cache = use_cache return ppl.item() - -# TODO: perform packing on GPU -def opt_pack3(model, quantizers): - layers = find_layers(model) - layers = {n: layers[n] for n in quantizers} - make_quant3(model, quantizers, faster=args.faster_kernel) - qlayers = find_layers(model, [Quant3Linear]) - print("Packing ...") - for name in qlayers: - print(name) - quantizers[name] = quantizers[name].cpu() - qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero) - print("Done.") - return model - - -def load_quant3(model, checkpoint): - from transformers import OPTConfig, OPTForCausalLM - - config = OPTConfig.from_pretrained(model) - - def noop(*args, **kwargs): - pass - - torch.nn.init.kaiming_uniform_ = noop - torch.nn.init.uniform_ = noop - torch.nn.init.normal_ = noop - - torch.set_default_dtype(torch.half) - transformers.modeling_utils._init_weights = False - torch.set_default_dtype(torch.half) - model = OPTForCausalLM(config) - torch.set_default_dtype(torch.float) - model = model.eval() - layers = find_layers(model) - for name in ["model.decoder.project_out", "model.decoder.project_in", "lm_head"]: - if name in layers: - del layers[name] - make_quant3(model, layers, faster=args.faster_kernel) - - print("Loading model ...") - model.load_state_dict(torch.load(checkpoint)) - model.seqlen = model.config.max_position_embeddings - print("Done.") - - return model - def benchmark(model, input_ids, check=False): input_ids = input_ids.to(model.gpus[0] if hasattr(model, "gpus") else DEV) torch.cuda.synchronize() @@ -421,47 +375,45 @@ def sync(): def main(args): print(args) num_params = 0 - if args.load: - model = load_quant3(args.model, args.load) + if args.delta and args.wbits < 16: + model = get_opt(args.model) + model.eval() + base_model = get_opt(args.base_model) + base_model.eval() + dataloader, testloader = get_loaders( + args.dataset, + nsamples=args.nsamples, + seed=args.seed, + model=args.model, + seqlen=model.seqlen, + ) + original_finetuned_model = copy.deepcopy(model) + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + finetuned_p.data = (finetuned_p.data - base_p.data).clone() else: - if args.delta and args.wbits < 16: - model = get_opt(args.model) - model.eval() - base_model = get_opt(args.base_model) - base_model.eval() - dataloader, testloader = get_loaders( - args.dataset, - nsamples=args.nsamples, - seed=args.seed, - model=args.model, - seqlen=model.seqlen, - ) - original_finetuned_model = copy.deepcopy(model) - for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): - finetuned_p.data = (finetuned_p.data - base_p.data).clone() - else: - model = get_opt(args.model) - model.eval() - - dataloader, testloader = get_loaders( - args.dataset, - nsamples=args.nsamples, - seed=args.seed, - model=args.model, - seqlen=model.seqlen, - ) + model = get_opt(args.model) + model.eval() if args.wbits < 16: if args.delta: tick = time.time() - quantizers, tuned_params = opt_sequential_delta( + quantizers, tuned_params, masks = opt_sequential_delta( original_finetuned_model, model, dataloader, DEV, args.tol ) - with open(f".cache/{args.model.replace('/', '.')}_delta_tol={args.tol}.json", "w+") as f: + data_dir = os.path.join(".cache", args.model.replace('/', '.')) + os.makedirs(data_dir, exist_ok=True) + with open(f".cache/{args.model.replace('/', '.')}/delta_tol={args.tol}_tuned_params.json", "w+") as f: json.dump(tuned_params, f) - comp_time = time.time() - tick + # iterate over all the dict keys in masks + transforms = T.ToPILImage() + for key in masks.keys(): + logger.info(f"Saving mask for {key}") + binmask = transforms(masks[key]) + binmask = binmask.convert("1") + binmask.save(os.path.join(data_dir, f"delta_tol={args.tol}_mask_{key}.bmp")) else: raise NotImplementedError + if args.delta and args.wbits < 16: for idx, (base_p, finetuned_p) in enumerate( zip(base_model.parameters(), model.parameters()) @@ -503,10 +455,6 @@ def main(args): model.save_pretrained(hf_path) tokenizer = AutoTokenizer.from_pretrained(args.model) tokenizer.save_pretrained(hf_path) - else: - opt_pack3(model, quantizers) - torch.save(model.state_dict(), args.save) - if __name__ == "__main__": import argparse diff --git a/pack_utils_test.py b/pack_utils_test.py index 58f1ad0..efc7963 100644 --- a/pack_utils_test.py +++ b/pack_utils_test.py @@ -1,25 +1,14 @@ import torch -from quant import quantize, Quantizer -from safetensors import safe_open -from pack_utils import SparseTensor, pack_to_bits, unpack_from_bits -from safetensors.torch import save_file +from quant import Quantizer from opt_delta import hard_threshold +from safetensors.torch import save_file QUANTIZED_BITS = 4 if __name__=="__main__": - """ - The process: - 1. Given a weight, quantize it first - 2. Then do sparsification - - To test our pack/unpack, we need to do the following: - 1. After the sparsification, we pack the weight and store on disk - 2. Compare the original weight with the unpacked weight - """ - torch.set_printoptions(precision=4) - b = torch.rand((2048, 2048), dtype=torch.float32) + b = torch.rand((1, 1), dtype=torch.float32) + print(b) # save b save_file({'wb1': b}, '.cache/original_b.safetensor') quantizer = Quantizer() @@ -28,18 +17,5 @@ ) quantizer.find_params(b, weight=True) b_q = quantizer.quantize(b) - sparsed_b_q = hard_threshold(b_q, 0.01) - - q_weight = pack_to_bits(sparsed_b_q, quantizer, QUANTIZED_BITS, groupsize=sparsed_b_q.shape[0]) - sparse_t = SparseTensor(q_weight, 'wb1', minifloats=-1) - sparse_t.to_disk('.cache/sparse_b.safetensor') - # now load it back - restored_sparse_t = SparseTensor.from_disk('.cache/sparse_b.safetensor') - restored_weight = restored_sparse_t.tensor - # this is what we restored from disk - restored_weight = unpack_from_bits(restored_weight, quantizer, QUANTIZED_BITS, groupsize=b_q.shape[0]) - print(f"Original weight: {sparsed_b_q}") - print(f"Restored weight: {restored_weight}") - # count the number of non-zero elements - print(f"Original weight: {sparsed_b_q.nonzero().shape[0]}") - print(f"Restored weight: {restored_weight.nonzero().shape[0]}") \ No newline at end of file + print(b_q) + \ No newline at end of file diff --git a/quant.py b/quant.py index f23099a..f4a4983 100644 --- a/quant.py +++ b/quant.py @@ -4,6 +4,9 @@ import torch.nn as nn def quantize(x, scale, zero, maxq): + print(scale) + print(zero) + print(maxq) if maxq < 0: return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) @@ -83,7 +86,12 @@ def find_params(self, x, weight=False): xmax1 = p * xmax scale1 = (xmax1 - xmin1) / self.maxq zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero - q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) + q = quantize( + x, + scale1.unsqueeze(1), + zero1.unsqueeze(1), + self.maxq + ) q -= x q.abs_() q.pow_(self.norm) diff --git a/tensorio.py b/tensorio.py index 46cdd2f..c5526f8 100644 --- a/tensorio.py +++ b/tensorio.py @@ -52,5 +52,4 @@ def model_packing(model, quantizers, bits, reformat='none'): if name in quantizers: quantizers[name] = quantizers[name].cpu() x, scale = compress_flexible_nbits(layers[name].weight.data.cuda(), bits) - return x, scale - + return x, scale \ No newline at end of file From db0dc5193dc03bc58a89f906dae88cb3bc29f030 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Mon, 15 May 2023 16:44:32 +0000 Subject: [PATCH 22/23] add datautils for generic jsonl --- .gitignore | 3 +- compress_utils.py | 1 - datautils.py | 65 +++++++++++- gptq.py | 11 +- opt_delta_autotune.py | 8 +- opt_eval_ppl.py | 128 ++++++++++++++++++++++++ quant.py | 5 +- scripts/opt_delta_exp.sh | 10 ++ utilities/compression_rate_estimator.py | 4 - utilities/convert_to_hf.py | 6 +- utilities/cr_cal.py | 0 utilities/to_csv.py | 19 ++++ 12 files changed, 237 insertions(+), 23 deletions(-) create mode 100644 opt_eval_ppl.py create mode 100644 scripts/opt_delta_exp.sh create mode 100644 utilities/cr_cal.py create mode 100644 utilities/to_csv.py diff --git a/.gitignore b/.gitignore index f73b20e..73ab46b 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ outputs_past/ packed_delta .cache delta_outputs/ -.io/ \ No newline at end of file +.io/ +outputs_exp/ \ No newline at end of file diff --git a/compress_utils.py b/compress_utils.py index 341704f..143f017 100644 --- a/compress_utils.py +++ b/compress_utils.py @@ -283,7 +283,6 @@ def _compress_nbits_by_bucket(x, bits, scale_method='max', bucket_size=512, return x, scale - def compress_flexible_nbits_by_bucket(x, bits, scale_method='max', bucket_size=512, stochastic=False, minimum_stochastic_distance=0.2): # support any bits diff --git a/datautils.py b/datautils.py index 045121a..51003f6 100644 --- a/datautils.py +++ b/datautils.py @@ -1,5 +1,8 @@ -import numpy as np +import json import torch +import random +import numpy as np +from transformers import AutoTokenizer def set_seed(seed): np.random.seed(seed) @@ -157,6 +160,46 @@ def __init__(self, input_ids): return trainloader, valenc +def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_size=None, val_seq_len=256): + """ + train_path: path to train jsonl file + test_path: path to test jsonl file + """ + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) + with open(train_path, 'r') as f: + traindata = [json.loads(line) for line in f.readlines()] + with open(val_path, 'r') as f: + valdata = [json.loads(line) for line in f.readlines()] + traindata = {"text": [d['text'] for d in traindata]} + testdata = {"text": [d['text'] for d in testdata]} + set_seed(seed) + + trainloader = [] + for _ in range(n_samples): + # for all datasets, we take the samples that are longer than seq_len + while True: + i = random.randint(0, len(traindata) - 1) + trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') + if trainenc.input_ids.shape[1] >= seq_len: + break + # then clip the samples to seq_len + i = random.randint(0, trainenc.input_ids.shape[1] - seq_len - 1) + j = i + seq_len + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + if val_size is not None: + valenc = tokenizer(' '.join(valdata[:val_size]['text']), return_tensors='pt') + else: + valenc = tokenizer(' '.join(valdata['text']), return_tensors='pt') + valenc = valenc.input_ids[:, :(val_seq_len * seq_len)] + + class TokenizerWrapper: + def __init__(self, input_ids): + self.input_ids = input_ids + valenc = TokenizerWrapper(valenc) + return trainloader, valenc def get_loaders( name, nsamples=128, seed=0, seqlen=2048, model='' @@ -171,3 +214,23 @@ def get_loaders( if 'new' in name: return get_c4_new(nsamples, seed, seqlen, model) return get_c4(nsamples, seed, seqlen, model) + if name == "answer_verification": + return get_jsonl(".cache/ni_calib/train/answer_verification.jsonl", ".cache/ni_calib/val/answer_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000) + if name == "coherence_classification": + return get_jsonl(".cache/ni_calib/test/coherence_classification.jsonl", ".cache/ni_calib/test/coherence_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000) + if name == "commonsense_classification": + return get_jsonl(".cache/ni_calib/train/commonsense_classification.jsonl", ".cache/ni_calib/test/commonsense_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000) + if name == "dialogue_state_tracking": + return get_jsonl(".cache/ni_calib/train/dialogue_state_tracking.jsonl", ".cache/ni_calib/test/dialogue_state_tracking.jsonl", nsamples, seed, seqlen, model, val_size=1000) + if name == "fact_verification": + return get_jsonl(".cache/ni_calib/train/fact_verification.jsonl", ".cache/ni_calib/test/fact_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000) + if name == "gender_classification": + return get_jsonl(".cache/ni_calib/train/gender_classification.jsonl", ".cache/ni_calib/test/gender_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000) + if name == "irony_detection": + return get_jsonl(".cache/ni_calib/train/irony_detection.jsonl", ".cache/ni_calib/test/irony_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000) + if name == "stance_detection": + return get_jsonl(".cache/ni_calib/train/stance_detection.jsonl", ".cache/ni_calib/test/stance_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000) + if name == "toxic_language_detection": + return get_jsonl(".cache/ni_calib/train/toxic_language_detection.jsonl", ".cache/ni_calib/test/toxic_language_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000) + if name == "word_semantics": + return get_jsonl(".cache/ni_calib/train/word_semantics.jsonl", ".cache/ni_calib/test/word_semantics.jsonl", nsamples, seed, seqlen, model, val_size=1000) \ No newline at end of file diff --git a/gptq.py b/gptq.py index 80ea2d1..87dd8cf 100644 --- a/gptq.py +++ b/gptq.py @@ -11,12 +11,18 @@ torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False -def hard_threshold(x, fraction_of_zero=0.1): +def hard_threshold(x, fraction_of_zero=0.1, random_sparsification=0.5): if fraction_of_zero == 0: return x, None + # randomly set random_sparsification of the weights to zero + if random_sparsification > 0: + logger.info(f"Randomly sparsifying the weights with {random_sparsification}") + mask = torch.rand(x.shape, device=x.device) > random_sparsification + x = x * mask y, _ = torch.sort(x.view(-1).abs().clone()) num_params = torch.numel(x) - thresh_index = int(num_params * fraction_of_zero) + + thresh_index = int(num_params * fraction_of_zero * (1/random_sparsification)) threshold = y[thresh_index] mask = x.abs().clone().gt(threshold).type(torch.cuda.HalfTensor) return mask * x, mask @@ -46,7 +52,6 @@ def add_batch(self, inp, out): if len(inp.shape) == 3: inp = inp.reshape((-1, inp.shape[-1])) inp = inp.t() - self.H *= self.nsamples / (self.nsamples + tmp) self.nsamples += tmp # inp = inp.float() diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py index ad49111..ac88b37 100644 --- a/opt_delta_autotune.py +++ b/opt_delta_autotune.py @@ -47,7 +47,7 @@ def opt_sequential_delta(model, delta_model, dataloader, dev, tol=0.2): key=lambda x: x[1], reverse=True ) - + use_cache = model.config.use_cache model.config.use_cache = False layers = model.model.decoder.layers @@ -144,7 +144,7 @@ def tmp(_, inp, out): for name in subset: logger.info(f"Quantizing {i}.{name} ...") for wbit in search_space['wbits']: - losses, _ =tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].fasterquant( + losses, _ = tuned_params[f'{i}_{name}'][f'wbit.{wbit}']['gptq'].fasterquant( percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order, @@ -449,9 +449,9 @@ def main(args): if args.save_hf: if args.delta: - hf_path = f"outputs/{args.model.replace('/', '.')}_delta_autotune_tol={args.tol}" + hf_path = f"outputs_exp/{args.model.replace('/', '.')}_delta_autotune_tol={args.tol}" else: - hf_path = f"outputs/{args.model.replace('/', '.')}_autotuned_tol={args.tol}" + hf_path = f"outputs_exp/{args.model.replace('/', '.')}_autotuned_tol={args.tol}" model.save_pretrained(hf_path) tokenizer = AutoTokenizer.from_pretrained(args.model) tokenizer.save_pretrained(hf_path) diff --git a/opt_eval_ppl.py b/opt_eval_ppl.py new file mode 100644 index 0000000..bf41c5d --- /dev/null +++ b/opt_eval_ppl.py @@ -0,0 +1,128 @@ +import os +import json +import torch +import torch.nn as nn +from modelutils import get_opt +from datautils import get_loaders + +BENCHMARK = 2048 + +dataset = 'wikitext2' + +nsamples = 128 + +@torch.no_grad() +def opt_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.decoder.layers + + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) + model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() + model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.cpu() + if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.decoder.final_layer_norm is not None: + model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev) + if model.model.decoder.project_out is not None: + model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.decoder.final_layer_norm is not None: + hidden_states = model.model.decoder.final_layer_norm(hidden_states) + if model.model.decoder.project_out is not None: + hidden_states = model.model.decoder.project_out(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + model.config.use_cache = use_cache + return ppl.item() + +models = os.listdir("outputs") +res = {} +models = [ + # 'facebook/opt-1.3b', + # 'facebook/opt-350m', + 'facebook/opt-2.7b', + # 'lnair/opt-350m-wikitext2', + # 'lnair/opt-1.3b-wikitext2', + 'lnair/opt-2.7b-wikitext2' +] +for model_name in models: + # model_path = os.path.join("outputs", model_name) + model = get_opt(model_name) + model.to("cuda") + _, testloader = get_loaders( + dataset, nsamples=128, seed=0, model=model_name, seqlen=model.seqlen + ) + ppl = opt_eval(model, testloader, model.device) + res[model_name] = ppl + print(res) + with open("ppl_res.json", "w") as f: + json.dump(res, f) \ No newline at end of file diff --git a/quant.py b/quant.py index f4a4983..386845c 100644 --- a/quant.py +++ b/quant.py @@ -1,12 +1,9 @@ import math -import numpy as np import torch +import numpy as np import torch.nn as nn def quantize(x, scale, zero, maxq): - print(scale) - print(zero) - print(maxq) if maxq < 0: return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) diff --git a/scripts/opt_delta_exp.sh b/scripts/opt_delta_exp.sh new file mode 100644 index 0000000..2b41218 --- /dev/null +++ b/scripts/opt_delta_exp.sh @@ -0,0 +1,10 @@ +python opt_delta_autotune.py \ + --dataset wikitext2 \ + --base-model facebook/opt-1.3b \ + --model lnair/opt-1.3b-wikitext2 \ + --delta \ + --wbits 2 \ + --tol 2 \ + --save-delta \ + --save-hf \ + --groupsize 1024 \ No newline at end of file diff --git a/utilities/compression_rate_estimator.py b/utilities/compression_rate_estimator.py index 90adc48..2b8a564 100644 --- a/utilities/compression_rate_estimator.py +++ b/utilities/compression_rate_estimator.py @@ -3,10 +3,6 @@ from modelutils import get_opt, find_layers from compression_scripts.model_utils import get_opt, find_layers - -base_floats = 16 - - base_floats = 16 def calc_compression(path: str, base_model: str): diff --git a/utilities/convert_to_hf.py b/utilities/convert_to_hf.py index 8111506..36f7262 100644 --- a/utilities/convert_to_hf.py +++ b/utilities/convert_to_hf.py @@ -1,16 +1,12 @@ +import os import torch import torch.nn as nn from transformers import GPTJForCausalLM - from transformers import AutoConfig, AutoTokenizer - from transformers.modeling_utils import no_init_weights -import os - def create_emtpy_gptj(config): - import torch import torch.nn as nn diff --git a/utilities/cr_cal.py b/utilities/cr_cal.py new file mode 100644 index 0000000..e69de29 diff --git a/utilities/to_csv.py b/utilities/to_csv.py new file mode 100644 index 0000000..db809a2 --- /dev/null +++ b/utilities/to_csv.py @@ -0,0 +1,19 @@ +import json +import pandas as pd +with open('ppl_res.json') as f: + res = json.load(f) +# convert to csv +sizes_group = ['350m', '1.3b', '2.7b'] +results = [] +for key in res.keys(): + results.append({ + 'model': key, + 'perplexity': res[key], + }) +df = pd.DataFrame(results) + +# pivot table such that columns is different models, rows is different perplexity +for size in sizes_group: + subdf = df[df['model'].str.contains(size)] + subdf = subdf.pivot_table(values='perplexity', columns='model') + subdf.to_csv(f'ppl_res_{size}.csv', index=False) \ No newline at end of file From 5d2c0861b1c87ce2f12cd428080427ce31fd5cb1 Mon Sep 17 00:00:00 2001 From: Xiaozhe Yao Date: Mon, 15 May 2023 21:59:06 +0000 Subject: [PATCH 23/23] generic data loader --- datautils.py | 70 ++++++++++++++++++++++++++++++------------- opt_delta_autotune.py | 2 -- opt_eval_ppl.py | 29 +++++++++--------- pack_utils_test.py | 1 - ppl_res.json | 1 + scripts/gptq_delta.sh | 6 ++-- 6 files changed, 69 insertions(+), 40 deletions(-) create mode 100644 ppl_res.json diff --git a/datautils.py b/datautils.py index 51003f6..71a8616 100644 --- a/datautils.py +++ b/datautils.py @@ -2,6 +2,7 @@ import torch import random import numpy as np +from datasets import Dataset from transformers import AutoTokenizer def set_seed(seed): @@ -160,7 +161,7 @@ def __init__(self, input_ids): return trainloader, valenc -def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_size=None, val_seq_len=256): +def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_size=None, val_seq_len=256, padding=False): """ train_path: path to train jsonl file test_path: path to test jsonl file @@ -171,7 +172,9 @@ def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_si with open(val_path, 'r') as f: valdata = [json.loads(line) for line in f.readlines()] traindata = {"text": [d['text'] for d in traindata]} - testdata = {"text": [d['text'] for d in testdata]} + valdata = {"text": [d['text'] for d in valdata]} + traindata = Dataset.from_dict(traindata) + valdata = Dataset.from_dict(valdata) set_seed(seed) trainloader = [] @@ -179,16 +182,25 @@ def get_jsonl(train_path, val_path, n_samples, seed, seq_len, model_name, val_si # for all datasets, we take the samples that are longer than seq_len while True: i = random.randint(0, len(traindata) - 1) - trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') + if padding: + trainenc = tokenizer(traindata[i]['text'], padding='max_length', truncation=True, max_length=seq_len, return_tensors='pt') + else: + trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') if trainenc.input_ids.shape[1] >= seq_len: break - # then clip the samples to seq_len - i = random.randint(0, trainenc.input_ids.shape[1] - seq_len - 1) - j = i + seq_len - inp = trainenc.input_ids[:, i:j] - tar = inp.clone() - tar[:, :-1] = -100 - trainloader.append((inp, tar)) + if not padding: + # then clip the samples to seq_len + i = random.randint(0, trainenc.input_ids.shape[1] - seq_len - 1) + j = i + seq_len + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + else: + inp = trainenc.input_ids + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) if val_size is not None: valenc = tokenizer(' '.join(valdata[:val_size]['text']), return_tensors='pt') else: @@ -215,22 +227,40 @@ def get_loaders( return get_c4_new(nsamples, seed, seqlen, model) return get_c4(nsamples, seed, seqlen, model) if name == "answer_verification": - return get_jsonl(".cache/ni_calib/train/answer_verification.jsonl", ".cache/ni_calib/val/answer_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000) + return get_jsonl( + ".cache/ni_calib/train/answer_verification.jsonl", + ".cache/ni_calib/test/answer_verification.jsonl", + nsamples, + seed, + seqlen, + model, + val_size=1000, + padding=True + ) if name == "coherence_classification": - return get_jsonl(".cache/ni_calib/test/coherence_classification.jsonl", ".cache/ni_calib/test/coherence_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000) + return get_jsonl(".cache/ni_calib/test/coherence_classification.jsonl", ".cache/ni_calib/test/coherence_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) if name == "commonsense_classification": - return get_jsonl(".cache/ni_calib/train/commonsense_classification.jsonl", ".cache/ni_calib/test/commonsense_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000) + return get_jsonl(".cache/ni_calib/train/commonsense_classification.jsonl", ".cache/ni_calib/test/commonsense_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) if name == "dialogue_state_tracking": - return get_jsonl(".cache/ni_calib/train/dialogue_state_tracking.jsonl", ".cache/ni_calib/test/dialogue_state_tracking.jsonl", nsamples, seed, seqlen, model, val_size=1000) + return get_jsonl(".cache/ni_calib/train/dialogue_state_tracking.jsonl", ".cache/ni_calib/test/dialogue_state_tracking.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) if name == "fact_verification": - return get_jsonl(".cache/ni_calib/train/fact_verification.jsonl", ".cache/ni_calib/test/fact_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000) + return get_jsonl(".cache/ni_calib/train/fact_verification.jsonl", ".cache/ni_calib/test/fact_verification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) if name == "gender_classification": - return get_jsonl(".cache/ni_calib/train/gender_classification.jsonl", ".cache/ni_calib/test/gender_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000) + return get_jsonl(".cache/ni_calib/train/gender_classification.jsonl", ".cache/ni_calib/test/gender_classification.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) if name == "irony_detection": - return get_jsonl(".cache/ni_calib/train/irony_detection.jsonl", ".cache/ni_calib/test/irony_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000) + return get_jsonl(".cache/ni_calib/train/irony_detection.jsonl", ".cache/ni_calib/test/irony_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) if name == "stance_detection": - return get_jsonl(".cache/ni_calib/train/stance_detection.jsonl", ".cache/ni_calib/test/stance_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000) + return get_jsonl(".cache/ni_calib/train/stance_detection.jsonl", ".cache/ni_calib/test/stance_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) if name == "toxic_language_detection": - return get_jsonl(".cache/ni_calib/train/toxic_language_detection.jsonl", ".cache/ni_calib/test/toxic_language_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000) + return get_jsonl(".cache/ni_calib/train/toxic_language_detection.jsonl", ".cache/ni_calib/test/toxic_language_detection.jsonl", nsamples, seed, seqlen, model, val_size=1000, padding=True) if name == "word_semantics": - return get_jsonl(".cache/ni_calib/train/word_semantics.jsonl", ".cache/ni_calib/test/word_semantics.jsonl", nsamples, seed, seqlen, model, val_size=1000) \ No newline at end of file + return get_jsonl( + ".cache/ni_calib/train/word_semantics.jsonl", + ".cache/ni_calib/test/word_semantics.jsonl", + nsamples, + seed, + seqlen, + model, + val_size=1000, + padding=True + ) \ No newline at end of file diff --git a/opt_delta_autotune.py b/opt_delta_autotune.py index ac88b37..937ab7f 100644 --- a/opt_delta_autotune.py +++ b/opt_delta_autotune.py @@ -371,7 +371,6 @@ def sync(): if check: print("PPL:", torch.exp(tot / (input_ids.numel() - 1)).item()) - def main(args): print(args) num_params = 0 @@ -471,7 +470,6 @@ def main(args): parser.add_argument( "--dataset", type=str, - choices=["wikitext2", "ptb", "c4"], default="wikitext2", help="Where to extract calibration data from.", ) diff --git a/opt_eval_ppl.py b/opt_eval_ppl.py index bf41c5d..ca290b5 100644 --- a/opt_eval_ppl.py +++ b/opt_eval_ppl.py @@ -7,8 +7,6 @@ BENCHMARK = 2048 -dataset = 'wikitext2' - nsamples = 128 @torch.no_grad() @@ -104,23 +102,26 @@ def forward(self, inp, **kwargs): model.config.use_cache = use_cache return ppl.item() -models = os.listdir("outputs") +models = os.listdir(".cache/models") res = {} -models = [ - # 'facebook/opt-1.3b', - # 'facebook/opt-350m', - 'facebook/opt-2.7b', - # 'lnair/opt-350m-wikitext2', - # 'lnair/opt-1.3b-wikitext2', - 'lnair/opt-2.7b-wikitext2' -] +# models = [ +# # 'facebook/opt-1.3b', +# # 'facebook/opt-350m', +# 'facebook/opt-2.7b', +# # 'lnair/opt-350m-wikitext2', +# # 'lnair/opt-1.3b-wikitext2', +# 'lnair/opt-2.7b-wikitext2' +# ] for model_name in models: - # model_path = os.path.join("outputs", model_name) - model = get_opt(model_name) + dataset = model_name + model_path = os.path.join(".cache", "models", model_name) + model = get_opt(model_path) model.to("cuda") + print("model loaded") _, testloader = get_loaders( - dataset, nsamples=128, seed=0, model=model_name, seqlen=model.seqlen + dataset, nsamples=128, seed=0, model=model_path, seqlen=model.seqlen ) + print("data loaded") ppl = opt_eval(model, testloader, model.device) res[model_name] = ppl print(res) diff --git a/pack_utils_test.py b/pack_utils_test.py index efc7963..563f31d 100644 --- a/pack_utils_test.py +++ b/pack_utils_test.py @@ -8,7 +8,6 @@ if __name__=="__main__": torch.set_printoptions(precision=4) b = torch.rand((1, 1), dtype=torch.float32) - print(b) # save b save_file({'wb1': b}, '.cache/original_b.safetensor') quantizer = Quantizer() diff --git a/ppl_res.json b/ppl_res.json new file mode 100644 index 0000000..f86c4f0 --- /dev/null +++ b/ppl_res.json @@ -0,0 +1 @@ +{"fact_verification": 7.487515449523926} \ No newline at end of file diff --git a/scripts/gptq_delta.sh b/scripts/gptq_delta.sh index 07e84be..ec3d1f2 100644 --- a/scripts/gptq_delta.sh +++ b/scripts/gptq_delta.sh @@ -1,7 +1,7 @@ python opt_delta_autotune.py \ - --dataset wikitext2 \ - --base-model facebook/opt-350m \ - --model lnair/opt-350m-wikitext2 \ + --dataset answer_verification \ + --base-model facebook/opt-1.3b \ + --model .cache/models/answer_verification \ --delta \ --wbits 2 \ --tol 2 \