missing file local_embedding_path: embed_llama2.pt
Opened this issue · 17 comments
There is an error when trying to load the model.
The error comes from the model code itself: checkpoint = torch.load(local_embedding_path, map_location="cpu")['weight']
This apparently expects embed_llama2.pt, which is not included in the repository and is also not among the Llama 2 model files.
Please refer to
wget https://huggingface.co/Dongchao/UniAudio/resolve/main/embed_llama2.pt
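Once it is downloaded, here is a quick sanity check (just a sketch) that the file matches what the loader expects, since the model only does torch.load(local_embedding_path, map_location="cpu")['weight']:

import torch

# embed_llama2.pt should contain a dict with a 'weight' tensor; for the
# Llama 2 7B embedding table that is expected to be 32000 x 4096.
emb = torch.load("embed_llama2.pt", map_location="cpu")["weight"]
print(emb.shape)

Then set local_embedding_path (presumably in the codec config) to wherever the file was saved.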
Thanks for uploading the file and for the quick reply!
I believe there is an issue in the checkpoint you provided. According to the paper and the GitHub code, the first codebook has size 3248 (it is not initialized from the Llama 2 vocabulary). However, the size of 'quantizer.quantizers.0.codebook.weight' is 32000, the same as the Llama 2 vocabulary (see parameter_dict['codec_model']['quantizer.quantizers.0.codebook.weight'].shape). This results in an error. Maybe this is not the correct checkpoint? https://huggingface.co/Dongchao/UniAudio/blob/main/llm3_codec_uni.pth
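For reference, this is how I checked the shapes in the downloaded checkpoint (the path is wherever llm3_codec_uni.pth was saved locally):

import torch

# List every codebook weight stored in the codec checkpoint.
parameter_dict = torch.load("llm3_codec_uni.pth", map_location="cpu")
for name, tensor in parameter_dict["codec_model"].items():
    if "codebook.weight" in name:
        print(name, tuple(tensor.shape))
# quantizer.quantizers.0.codebook.weight shows up as (32000, 4096) here,
# while the paper/code expect 3248 entries for the first level.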
Hi, sorry for the delay. Let me check it again tomorrow.
Thanks! It works with this checkpoint, but it uses the same codebook as Llama for all 3 levels. It would be good to have the model with the semantic RVQ.
Please refer to wget https://huggingface.co/Dongchao/UniAudio/resolve/main/3layer.pth
I think this one is the same as the above: level 0 has 32000 tokens, not 3248.
Hi, I have the same problem. Please give us the correct weights to try!
Thank you.
Same issue.
I also have the same issue
How did you solve it? I'm getting the same error.
I checked the shapes and all the codebooks have the same shape, so you can just remove the if-branch for the first codebook. Below is the function I use (and, after it, roughly how I call it):
import os
import torch

def test_llm_codec(wav, config):
    from models.codec.MSCodec import MSCodecLM
    from llama_inference.llama import Tokenizer
    from transformers import LlamaTokenizer  # imported in the original, not used below
    import torchaudio

    llama_model_path = '/home/marius/data/vicuna-7b-v1.5'  # download llama 2 7B from https://github.com/meta-llama/llama-recipes/tree/main
    text_tokenizer = Tokenizer(model_path=os.path.join(llama_model_path, "tokenizer.model"))

    # load model
    # codec_ckpt = '/home/marius/data/codecs/llm3_codec_uni.pth'  # set the ckpt path
    codec_ckpt = '/home/marius/data/codecs/3layer.pth'  # set the ckpt path
    device = 'cuda'
    model = MSCodecLM(**config.model.generator.config)
    parameter_dict = torch.load(codec_ckpt)
    model.load_state_dict(parameter_dict['codec_model'])  # load model
    model.to(device)
    model.eval()

    # vq1_texts = np.load(os.path.join("models", "codec", "layer1.npy"), allow_pickle=True)
    torchaudio.save(os.path.join("output", "wav_original.wav"), wav, sample_rate=16000)
    wav = wav.unsqueeze(1).to(device)
    my_code = []
    setence = ''

    # encode
    with torch.no_grad():
        x, codes, _, _, _, _ = model(wav)
        for kk, code in enumerate(codes):
            for j in range(code.shape[1]):
                # if kk == 0:
                #     tmp = code[0, j].item()  # index
                #     wo = vq1_texts[tmp]  # get word
                #     real_code = text_tokenizer.encode(str(wo), bos=False, eos=False)
                #     my_code += real_code
                #     setence += ' ' + str(wo)
                # else:
                tmp = code[0, j].item()
                wo = text_tokenizer.decode(tmp)
                setence += ' ' + str(wo)
                my_code.append(tmp)

        # decode back to a waveform
        x = model.decode(codes)

    # write the reconstruction to disk
    torchaudio.save(os.path.join("output", "wav_decoded.wav"), x.squeeze(0).cpu(), sample_rate=16000)
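And this is roughly how I drive it. config.yaml and example.wav are placeholders for my local paths; the config is assumed to expose model.generator.config as used inside test_llm_codec, and output/ has to exist before the saves.

import os
import torchaudio
from omegaconf import OmegaConf

os.makedirs("output", exist_ok=True)       # the function writes wavs into output/
config = OmegaConf.load("config.yaml")     # the repo's codec config
wav, sr = torchaudio.load("example.wav")   # any mono test clip
if sr != 16000:
    wav = torchaudio.functional.resample(wav, sr, 16000)  # the codec expects 16 kHz
test_llm_codec(wav, config)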
Does this work well? What are the results like?
I was not happy at all with the reconstruction on my dataset, but it probably needs to be retrained for your problem/data.
I'm still getting the same mismatch error:
(ptca) root@1a8b85fbcd974ccbae73a8b4e81bc95c0004KU:/eph/nvme0/azureml/cr/j/802a86daee5540feb2a8dab7a6911b94/exe/wd/sara/LLM-Codec# python infer.py
Traceback (most recent call last):
File "infer.py", line 15, in <module>
model.load_state_dict(parameter_dict['codec_model']) # load model
File "/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/nn/modules/module.py", line 2041, in load_state_dict
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for MSCodecLM:
size mismatch for quantizer.quantizers.0.codebook.weight: copying a param with shape torch.Size([32000, 4096]) from checkpoint, the shape in current model is torch.Size([3248, 4096]).
And this is my code:
import numpy as np
import torch
import torchaudio
from omegaconf import OmegaConf
from codec.MSCodec import MSCodecLM
from llama_inference.llama import Tokenizer, ModelArgs
# convert_audio (the resampling/mono helper) also needs to be imported from the repo's utilities

llama_model_path = 'Llama-2-7b'  # download llama 2 7B from https://github.com/meta-llama/llama-recipes/tree/main
text_tokenizer = Tokenizer(model_path=llama_model_path + "/tokenizer.model")

# load model
vq_config_path = 'config.yaml'
codec_ckpt = 'llm3_codec_uni.pth'  # set the ckpt path
device = 'cuda'
exp_model_config = OmegaConf.load(vq_config_path)
model = MSCodecLM(**exp_model_config.generator.config)
parameter_dict = torch.load(codec_ckpt)
model.load_state_dict(parameter_dict['codec_model'])  # load model
model.to(device)
model.eval()

vq1_texts = np.load("layer1.npy", allow_pickle=True)
wav_root = ''
wav, sr = torchaudio.load(wav_root)
if sr != 16000:
    wav = convert_audio(wav, sr, 16000, 1)
wav = wav.unsqueeze(1).to(device)
my_code = []
setence = ''

# encode
with torch.no_grad():
    x, codes, _, _, _, _ = model(wav)
    for kk, code in enumerate(codes):
        for j in range(code.shape[1]):
            # if kk == 0:
            #     tmp = code[0, j].item()  # index
            #     wo = vq1_texts[tmp]  # get word
            #     real_code = text_tokenizer.encode(str(wo), bos=False, eos=False)
            #     my_code += real_code
            #     setence += ' ' + str(wo)
            # else:
            tmp = code[0, j].item()
            wo = text_tokenizer.decode(tmp)  # was self.text_tokenizer, but this is not inside a class
            setence += ' ' + str(wo)
            my_code.append(tmp)

    # decode to wav
    x = model.decode(codes)
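To narrow it down before load_state_dict, this small sketch compares the freshly built model with the checkpoint:

# Compare shapes between the checkpoint and the freshly built MSCodecLM
# so the offending parameter is printed explicitly.
ckpt_state = torch.load(codec_ckpt, map_location="cpu")['codec_model']
for name, param in model.state_dict().items():
    if name in ckpt_state and tuple(ckpt_state[name].shape) != tuple(param.shape):
        print(name, "checkpoint:", tuple(ckpt_state[name].shape), "model:", tuple(param.shape))
# for me this prints only quantizer.quantizers.0.codebook.weight:
# checkpoint (32000, 4096) vs model (3248, 4096), matching the traceback above.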
Sorry, I did not get the notification from GitHub. I believe this version is right; I have double-checked it.
wget https://huggingface.co/Dongchao/2024/resolve/main/semantic_acoustic.pth
I have updated it again! Thank you all for the issues.
Thanks a lot! The mismatch error has been resolved.