camenduru/Qwen-VL-Chat-colab

torch.cuda.OutOfMemoryError: CUDA out of memory while running Colab demo

Maulik1528 opened this issue · 0 comments

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 388, in call_prediction
    output = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 217, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1553, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1191, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 807, in run
    result = context.run(func, *args)
  File "/usr/local/lib/python3.10/dist-packages/gradio/utils.py", line 659, in wrapper
    response = f(*args, **kwargs)
  File "/content/app.py", line 133, in predict
    response, history = model.chat(tokenizer, message, history=history)
  File "/root/.cache/huggingface/modules/transformers_modules/83e84ace0177efafe5bd87674eebd2bb3b43f7a8/modeling_qwen.py", line 939, in chat
    outputs = self.generate(
  File "/root/.cache/huggingface/modules/transformers_modules/83e84ace0177efafe5bd87674eebd2bb3b43f7a8/modeling_qwen.py", line 1058, in generate
    return super().generate(
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 1648, in generate
    return self.sample(
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2730, in sample
    outputs = self(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/83e84ace0177efafe5bd87674eebd2bb3b43f7a8/modeling_qwen.py", line 848, in forward
    transformer_outputs = self.transformer(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/83e84ace0177efafe5bd87674eebd2bb3b43f7a8/modeling_qwen.py", line 565, in forward
    images = self.visual.encode(images)
  File "/root/.cache/huggingface/modules/transformers_modules/83e84ace0177efafe5bd87674eebd2bb3b43f7a8/visual.py", line 426, in encode
    return self(images)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/83e84ace0177efafe5bd87674eebd2bb3b43f7a8/visual.py", line 407, in forward
    x = self.transformer(x)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/83e84ace0177efafe5bd87674eebd2bb3b43f7a8/visual.py", line 328, in forward
    x = r(x, attn_mask=attn_mask)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/83e84ace0177efafe5bd87674eebd2bb3b43f7a8/visual.py", line 295, in forward
    x = q_x + self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)
  File "/root/.cache/huggingface/modules/transformers_modules/83e84ace0177efafe5bd87674eebd2bb3b43f7a8/visual.py", line 283, in attention
    return self.attn(q_x, k_x, v_x, attn_mask=attn_mask)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/83e84ace0177efafe5bd87674eebd2bb3b43f7a8/visual.py", line 194, in forward
    mixed_x_layer = self.in_proj(query)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 14.75 GiB total capacity; 13.17 GiB already allocated; 18.81 MiB free; 13.47 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF