RuntimeError: CUDA error: device-side assert triggered when running 2048.ipynb
Opened this issue · 1 comment
watemailunpi commented
A RuntimeError occurs when running the 2048.ipynb notebook from the link below.
link: https://colab.research.google.com/github/openpipe/art/blob/main/examples/2048/2048.ipynb
loading model from .art/2048-multi-turn/models/agent-002/0010
==((====))== Unsloth 2025.5.1: Fast Qwen2 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
\\ /| Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \ Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\ / Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
"-____-" Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
[/tmp/ipython-input-3745976913.py](https://localhost:8080/#) in <cell line: 0>()
9 print(f"loading model from {lora_model_path}\n")
10
---> 11 peft_model, tokenizer = FastLanguageModel.from_pretrained(
12 model_name=lora_model_path,
13 max_seq_length=16384,
9 frames
[/usr/local/lib/python3.11/dist-packages/unsloth/models/llama.py](https://localhost:8080/#) in _set_cos_sin_cache(self, seq_len, device, dtype)
1266 # Different from paper, but it uses a different permutation in order to obtain the same calculation
1267 emb = torch.cat((freqs, freqs), dim=-1)
-> 1268 self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
1269 self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
1270 pass
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
watemailunpi commented
The complete logs are as follows:
loading model from .art/2048-multi-turn/models/agent-002/0010
==((====))== Unsloth 2025.5.1: Fast Qwen2 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
\\ /| Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \ Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\ / Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
"-____-" Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
[/tmp/ipython-input-3745976913.py](https://localhost:8080/#) in <cell line: 0>()
9 print(f"loading model from {lora_model_path}\n")
10
---> 11 peft_model, tokenizer = FastLanguageModel.from_pretrained(
12 model_name=lora_model_path,
13 max_seq_length=16384,
9 frames
[/usr/local/lib/python3.11/dist-packages/unsloth/models/loader.py](https://localhost:8080/#) in from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, load_in_8bit, full_finetuning, token, device_map, rope_scaling, fix_tokenizer, trust_remote_code, use_gradient_checkpointing, resize_model_vocab, revision, use_exact_model_name, fast_inference, gpu_memory_utilization, float8_kv_cache, random_state, max_lora_rank, disable_log_stats, *args, **kwargs)
374 pass
375
--> 376 model, tokenizer = dispatch_model.from_pretrained(
377 model_name = model_name,
378 max_seq_length = max_seq_length,
[/usr/local/lib/python3.11/dist-packages/unsloth/models/qwen2.py](https://localhost:8080/#) in from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, token, device_map, rope_scaling, fix_tokenizer, model_patcher, tokenizer_name, trust_remote_code, **kwargs)
85 **kwargs,
86 ):
---> 87 return FastLlamaModel.from_pretrained(
88 model_name = model_name,
89 max_seq_length = max_seq_length,
[/usr/local/lib/python3.11/dist-packages/unsloth/models/llama.py](https://localhost:8080/#) in from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, token, device_map, rope_scaling, fix_tokenizer, model_patcher, tokenizer_name, trust_remote_code, fast_inference, gpu_memory_utilization, float8_kv_cache, random_state, max_lora_rank, disable_log_stats, **kwargs)
1786
1787 if not fast_inference:
-> 1788 model = AutoModelForCausalLM.from_pretrained(
1789 model_name,
1790 device_map = device_map,
[/usr/local/lib/python3.11/dist-packages/transformers/models/auto/auto_factory.py](https://localhost:8080/#) in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
569 if model_class.config_class == config.sub_configs.get("text_config", None):
570 config = config.get_text_config()
--> 571 return model_class.from_pretrained(
572 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
573 )
[/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py](https://localhost:8080/#) in _wrapper(*args, **kwargs)
277 old_dtype = torch.get_default_dtype()
278 try:
--> 279 return func(*args, **kwargs)
280 finally:
281 torch.set_default_dtype(old_dtype)
[/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py](https://localhost:8080/#) in from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
4340 with ContextManagers(model_init_context):
4341 # Let's make sure we don't run the init function of buffer modules
-> 4342 model = cls(config, *model_args, **model_kwargs)
4343
4344 # Make sure to tie the weights correctly
[/usr/local/lib/python3.11/dist-packages/transformers/models/qwen2/modeling_qwen2.py](https://localhost:8080/#) in __init__(self, config)
740 def __init__(self, config):
741 super().__init__(config)
--> 742 self.model = Qwen2Model(config)
743 self.vocab_size = config.vocab_size
744 self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
[/usr/local/lib/python3.11/dist-packages/transformers/models/qwen2/modeling_qwen2.py](https://localhost:8080/#) in __init__(self, config)
456 )
457 self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
--> 458 self.rotary_emb = Qwen2RotaryEmbedding(config=config)
459 self.gradient_checkpointing = False
460
[/usr/local/lib/python3.11/dist-packages/unsloth/models/llama.py](https://localhost:8080/#) in __init__(self, dim, max_position_embeddings, base, device, config)
1251
1252 # Build here to make `torch.jit.trace` work.
-> 1253 self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype())
1254 pass
1255
[/usr/local/lib/python3.11/dist-packages/unsloth/models/llama.py](https://localhost:8080/#) in _set_cos_sin_cache(self, seq_len, device, dtype)
1266 # Different from paper, but it uses a different permutation in order to obtain the same calculation
1267 emb = torch.cat((freqs, freqs), dim=-1)
-> 1268 self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
1269 self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
1270 pass
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.