yangdongchao/LLM-Codec

missing file local_embedding_path: embed_llama2.pt

Opened this issue · 17 comments

There is an error when trying to load the model.
The error is in the model code itself: checkpoint = torch.load(local_embedding_path, map_location="cpu")['weight']
This apparently expects embed_llama2.pt, which is not included in the repository and is also not among the Llama 2 model files.

Please refer to
wget https://huggingface.co/Dongchao/UniAudio/resolve/main/embed_llama2.pt
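Once downloaded, a quick sanity check (the path here is just an example, and the [32000, 4096] shape is my expectation for the Llama 2 7B token embedding, not something verified from the file) that it loads the way the repo reads it:

import torch

# example path: point this at wherever you saved the downloaded file
local_embedding_path = "embed_llama2.pt"

# the repo reads the 'weight' entry from this file
weight = torch.load(local_embedding_path, map_location="cpu")["weight"]
print(weight.shape)  # expected: [32000, 4096] (Llama 2 7B vocab size x hidden size)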

Thanks for uploading the file and for the quick reply!
I believe there is an issue in the checkpoint you provided. According to the paper and the GitHub code, the first codebook has size 3248 (it is not initialized from the Llama 2 codebook). However, the size of 'quantizer.quantizers.0.codebook.weight' is 32000, the same as the Llama 2 codebook (see parameter_dict['codec_model']['quantizer.quantizers.0.codebook.weight'].shape). This results in an error. Maybe this is not the correct checkpoint? https://huggingface.co/Dongchao/UniAudio/blob/main/llm3_codec_uni.pth
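For reference, something like this prints the codebook shapes stored in a downloaded checkpoint (the filename is just whichever file you grabbed):

import torch

# inspect the codebook sizes stored in the checkpoint
ckpt = torch.load("llm3_codec_uni.pth", map_location="cpu")["codec_model"]
for name, tensor in ckpt.items():
    if name.endswith("codebook.weight"):
        print(name, tuple(tensor.shape))
# quantizer.quantizers.0.codebook.weight comes out as (32000, 4096) here,
# while the config expects 3248 entries for the first level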

Hi, sorry for the delay. Let me check it again tomorrow.

Thanks! It works with this checkpoint, but it uses the same codebook as Llama for all 3 levels. It would be good to have the model with the semantic RVQ.

Please refer to wget https://huggingface.co/Dongchao/UniAudio/resolve/main/3layer.pth

I think this one is the same as the above: level 0 has 32000 tokens, not 3248.

Hi, I have the same problem. Please give us the correct weights to try!
Thank you.

Same issue.

I also have the same issue.

Thanks! It works with this checkpoint, but it uses the same codebook as Llama for all 3 levels. It would be good to have the model with the semantic RVQ.

How did you solve it? I'm getting the same error.

I checked the shapes and all codebooks have the same shape, so you can just remove the if-branch for the first codebook:

def test_llm_codec(wav, config):
    import os
    import torch
    import torchaudio
    from models.codec.MSCodec import MSCodecLM
    from llama_inference.llama import Tokenizer

    llama_model_path = '/home/marius/data/vicuna-7b-v1.5'  # download llama 2 7B from https://github.com/meta-llama/llama-recipes/tree/main
    text_tokenizer = Tokenizer(model_path=os.path.join(llama_model_path, "tokenizer.model"))
    # load the codec model
    # codec_ckpt = '/home/marius/data/codecs/llm3_codec_uni.pth'  # set the ckpt path
    codec_ckpt = '/home/marius/data/codecs/3layer.pth'  # set the ckpt path
    device = 'cuda'
    model = MSCodecLM(**config.model.generator.config)
    parameter_dict = torch.load(codec_ckpt, map_location='cpu')
    model.load_state_dict(parameter_dict['codec_model'])  # load weights
    model.to(device)
    model.eval()
    # vq1_texts = np.load(os.path.join("models", "codec", "layer1.npy"), allow_pickle=True)
    torchaudio.save(os.path.join("output", "wav_original.wav"), wav, sample_rate=16000)
    wav = wav.unsqueeze(1).to(device)
    my_code = []
    sentence = ''
    # encode: codes is a list with one tensor per quantizer level
    with torch.no_grad():
        x, codes, _, _, _, _ = model(wav)
        for kk, code in enumerate(codes):
            for j in range(code.shape[1]):
                # the original "if kk == 0" branch mapped level-0 indices through
                # layer1.npy; it is removed here because in this checkpoint all
                # levels share the 32000-entry llama codebook
                tmp = code[0, j].item()          # token index
                wo = text_tokenizer.decode(tmp)  # corresponding llama token string
                sentence += ' ' + str(wo)
                my_code.append(tmp)
    # decode back to a waveform
    x = model.decode(codes)
    # write to disk
    torchaudio.save(os.path.join("output", "wav_decoded.wav"), x.squeeze(0).cpu(), sample_rate=16000)
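For reference, a rough usage sketch (paths are placeholders; it assumes a config where model.generator.config holds the MSCodecLM arguments and a wav that is already 16 kHz mono):

import torchaudio
from omegaconf import OmegaConf

config = OmegaConf.load("my_config.yaml")   # placeholder; must expose model.generator.config
wav, sr = torchaudio.load("example.wav")    # placeholder path; expected to already be 16 kHz mono
test_llm_codec(wav, config)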

Does this work well? What are the results like?

I was not happy at all with the reconstruction on my dataset, but it probably needs to be retrained for your problem/data.

Still having the same mismatch error.

(ptca) root@1a8b85fbcd974ccbae73a8b4e81bc95c0004KU:/eph/nvme0/azureml/cr/j/802a86daee5540feb2a8dab7a6911b94/exe/wd/sara/LLM-Codec# python infer.py
Traceback (most recent call last):
  File "infer.py", line 15, in <module>
    model.load_state_dict(parameter_dict['codec_model']) # load model
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/nn/modules/module.py", line 2041, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for MSCodecLM:
        size mismatch for quantizer.quantizers.0.codebook.weight: copying a param with shape torch.Size([32000, 4096]) from checkpoint, the shape in current model is torch.Size([3248, 4096]).

And this is my code:

import numpy as np
import torch
import torchaudio
from omegaconf import OmegaConf
from codec.MSCodec import MSCodecLM
from llama_inference.llama import Tokenizer
# convert_audio (resample/downmix helper) also needs to be imported from the repo's utilities

llama_model_path = 'Llama-2-7b'  # download llama 2 7B from https://github.com/meta-llama/llama-recipes/tree/main
text_tokenizer = Tokenizer(model_path=llama_model_path + "/tokenizer.model")
# load model
vq_config_path = 'config.yaml'
codec_ckpt = 'llm3_codec_uni.pth'  # set the ckpt path
device = 'cuda'
exp_model_config = OmegaConf.load(vq_config_path)
model = MSCodecLM(**exp_model_config.generator.config)
parameter_dict = torch.load(codec_ckpt, map_location='cpu')
model.load_state_dict(parameter_dict['codec_model'])  # load model
model.to(device)
model.eval()
vq1_texts = np.load("layer1.npy", allow_pickle=True)
wav_root = ''  # set the wav path
wav, sr = torchaudio.load(wav_root)
if sr != 16000:
    wav = convert_audio(wav, sr, 16000, 1)
wav = wav.unsqueeze(1).to(device)
my_code = []
sentence = ''
# encode
with torch.no_grad():
    x, codes, _, _, _, _ = model(wav)
    for kk, code in enumerate(codes):
        for j in range(code.shape[1]):
            # if kk == 0:
            #     tmp = code[0, j].item()  # index
            #     wo = vq1_texts[tmp]  # get word
            #     real_code = text_tokenizer.encode(str(wo), bos=False, eos=False)
            #     my_code += real_code
            #     sentence += ' ' + str(wo)
            # else:
            tmp = code[0, j].item()
            wo = text_tokenizer.decode(tmp)  # fixed: the tokenizer is a module-level object, not self.text_tokenizer
            sentence += ' ' + str(wo)
            my_code.append(tmp)
# decode to wav
x = model.decode(codes)
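
In case it helps, a small debugging sketch (reusing the same config and checkpoint names as above) that lists every key whose shape differs between the checkpoint and a freshly built model:

import torch
from omegaconf import OmegaConf
from codec.MSCodec import MSCodecLM

# compare checkpoint tensor shapes against the model built from config.yaml
exp_model_config = OmegaConf.load('config.yaml')
model = MSCodecLM(**exp_model_config.generator.config)
ckpt = torch.load('llm3_codec_uni.pth', map_location='cpu')['codec_model']
model_state = model.state_dict()
for name, tensor in ckpt.items():
    if name in model_state and model_state[name].shape != tensor.shape:
        print(name, 'checkpoint', tuple(tensor.shape), 'model', tuple(model_state[name].shape))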

I think this one is the same as the above: level 0 has 32000 tokens, not 3248.

Sorry, I did not get the notification from GitHub. I believe this version is right, I have double-checked it.
wget https://huggingface.co/Dongchao/2024/resolve/main/semantic_acoustic.pth
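If you want to verify the download before loading, something like this should show 3248 entries for the first level (assuming the key layout matches the earlier checkpoints):

import torch

ckpt = torch.load("semantic_acoustic.pth", map_location="cpu")["codec_model"]
print(ckpt["quantizer.quantizers.0.codebook.weight"].shape)  # expected: torch.Size([3248, 4096])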

I have updated it again! Thank you all for the issues.

Thanks a lot! The mismatch error has been resolved.