4/8-bit with bitsandbytes
datquocnguyen opened this issue · 1 comment
datquocnguyen commented
See the bitsandbytes section of the Transformers quantization docs: https://huggingface.co/docs/transformers/main/en/quantization#bitsandbytes
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Store the weights in 4-bit; run compute in float16
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model_4bit = AutoModelForCausalLM.from_pretrained(
    "vinai/PhoGPT-4B-Chat", quantization_config=quantization_config, device_map="auto", trust_remote_code=True
)
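A minimal smoke test for the 4-bit model follows. The "### Câu hỏi: ... ### Trả lời:" prompt template and the sample instruction are taken from the PhoGPT model card, so verify them against the current README; the generation settings are illustrative defaults, not tuned values:

tokenizer = AutoTokenizer.from_pretrained("vinai/PhoGPT-4B-Chat", trust_remote_code=True)
# Prompt template as documented on the vinai/PhoGPT-4B-Chat model card
input_prompt = "### Câu hỏi: Viết bài văn nghị luận xã hội về an toàn giao thông\n### Trả lời:"
inputs = tokenizer(input_prompt, return_tensors="pt").to(model_4bit.device)
outputs = model_4bit.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])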
Or, initializing the model directly on the GPU via the model config:
import torch
from transformers import AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

config = AutoConfig.from_pretrained("vinai/PhoGPT-4B-Chat", trust_remote_code=True)
config.init_device = "cuda"  # build the model weights directly on the GPU
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model_4bit = AutoModelForCausalLM.from_pretrained(
    "vinai/PhoGPT-4B-Chat", quantization_config=quantization_config, config=config, trust_remote_code=True
)
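Either way, a quick sanity check that quantization took effect is to print the loaded footprint; get_memory_footprint() is a standard PreTrainedModel helper, and a 4-bit ~4B-parameter model should come in at very roughly 2-3 GB versus ~8 GB in float16:

# Weight footprint in GB; expect roughly 2-3 GB for 4-bit PhoGPT-4B
print(f"{model_4bit.get_memory_footprint() / 1024**3:.2f} GB")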
datquocnguyen commented
# 8-bit variant of the same loading pattern
model_8bit = AutoModelForCausalLM.from_pretrained("vinai/PhoGPT-4B-Chat", device_map="auto", load_in_8bit=True, trust_remote_code=True)
Or, likewise via the model config:
config = AutoConfig.from_pretrained("vinai/PhoGPT-4B-Chat", trust_remote_code=True)
config.init_device = "cuda"
model_8bit = AutoModelForCausalLM.from_pretrained(
    "vinai/PhoGPT-4B-Chat", config=config, load_in_8bit=True, trust_remote_code=True
)
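Note: newer transformers releases warn that passing load_in_8bit directly to from_pretrained is deprecated in favor of a BitsAndBytesConfig. A minimal sketch of the equivalent call, assuming such a version:

from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
    "vinai/PhoGPT-4B-Chat",
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)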