How can I use zeronvs-diffusion (feed-forward model) directly without SDS distillation?
thucz opened this issue · 5 comments
thucz commented
How can I use zeronvs-diffusion (feed-forward model) directly without SDS distillation?
hey0chen commented
Same question; hoping for a reply.
hey0chen commented
Specifically, I only want to use the DDIM module mentioned in the paper. Can you tell me how to do that?
kylesargent commented
Can you try the following snippet?
from ldm.models.diffusion import options

# Set this flag before importing the guidance module.
options.LDM_DISTILLATION_ONLY = True

from threestudio.models.guidance import zero123_guidance
from omegaconf import OmegaConf
from PIL import Image
import numpy as np
import torch

image_path = "your_path_here"
guidance_cfg = dict(
    pretrained_model_name_or_path="your_path_here",
    pretrained_config="your_path_here",
    guidance_scale=7.5,
    cond_image_path=image_path,
    min_step_percent=[0, 0.75, 0.02, 1000],
    max_step_percent=[1000, 0.98, 0.025, 2500],
    vram_O=False,
)
guidance = zero123_guidance.Zero123Guidance(OmegaConf.create(guidance_cfg))

# Load the conditioning image and compute its embeddings.
cond_image_pil = Image.open(image_path).convert("RGB")
cond_image = torch.from_numpy(np.array(cond_image_pil)).cuda() / 255.0
c_crossattn, c_concat = guidance.get_img_embeds(
    cond_image.permute((2, 0, 1))[None])

# Camera-to-world poses: identity for the conditioning view, and a
# slightly translated copy for the target view.
cond_camera = np.eye(4)  # identity camera pose
target_camera = cond_camera.copy()
target_camera[:3, -1] = np.array([0.125, 0.125, 0.125])  # perturb the cond pose
target_camera = torch.from_numpy(target_camera[None]).cuda().to(torch.float32)
cond_camera = torch.from_numpy(cond_camera[None]).cuda().to(torch.float32)
camera_batch = {
    "target_cam2world": target_camera,
    "cond_cam2world": cond_camera,
    "fov_deg": torch.from_numpy(np.array([45.0])).cuda().to(torch.float32),
}

# Use a fixed scene scale rather than estimating it.
guidance.cfg.precomputed_scale = 0.7
cond = guidance.get_cond_from_known_camera(
    camera_batch,
    c_crossattn=c_crossattn,
    c_concat=c_concat,
)

# Sample a novel view and convert it back to a PIL image.
novel_view = guidance.gen_from_cond(cond)
novel_view_pil = Image.fromarray(np.clip(novel_view[0] * 255, 0, 255).astype(np.uint8))
display(cond_image_pil)
display(novel_view_pil)
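Note that display is the IPython/Jupyter helper and is only defined inside a notebook session; in a plain script you could save the images to disk instead (the filenames here are illustrative):

cond_image_pil.save("cond_view.png")
novel_view_pil.save("novel_view.png")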
DaLi-Jack commented
Yes, this is indeed helpful! I do have another question though: What is the coordinate system used for cond_camera and target_camera? For instance, are they based on 'blender', 'opencv', 'colmap', or 'opengl'? I've experimented with both 'blender' and pytorch3d coordinates (since CO3D employs pytorch3d coordinates), but the results appear somewhat peculiar.
Looking forward to your response!
kylesargent commented
The convention is OpenGL (x-axis right, y-axis up, z-axis pointing toward the viewer), in camera-to-world format.
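For reference, a common way to convert a camera-to-world pose from the OpenCV convention (x right, y down, z forward) into this OpenGL convention is to flip the camera-frame y and z axes. A minimal sketch, assuming a standard column-vector cam2world matrix (the variable names are illustrative):

import numpy as np

# cam2world pose in the OpenCV convention (x right, y down, z forward).
cam2world_cv = np.eye(4)

# Flipping the camera-frame y and z axes (negating the 2nd and 3rd
# rotation columns) yields the OpenGL convention described above.
cv_to_gl = np.diag([1.0, -1.0, -1.0, 1.0])
cam2world_gl = cam2world_cv @ cv_to_gl

PyTorch3D poses (as used by CO3D) need extra care: PyTorch3D stores world-to-view transforms in a row-vector layout, so they must first be converted to column-vector camera-to-world matrices before applying such an axis flip.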