How to back-project an image to a 3D point cloud and render new views of the scene?
ugoleone opened this issue · 1 comment
I would like to render new views of a scene, starting from a single image, its depth map, and camera poses.
I have an image taken from the LLFF dataset, the metric depth map for that image (in meters), and 5 camera poses provided by the LLFF dataset in the poses_bounds.npy file (including the pose of the camera from which the image was taken). All files are attached.
Poses, as documented, are stored in the poses_bounds.npy file in this way:

The pose matrix is a 3x4 camera-to-world affine transform concatenated with a 3x1 column [image height, image width, focal length] along axis=1. The rotation (first 3x3 block in the camera-to-world transform) is stored in a somewhat unusual order, which is why there are the transposes. From the point of view of the camera, the three axes are [ down, right, backwards ], which some people might consider to be [-y, x, z].
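Concretely, following that description, each row of poses_bounds.npy can be unpacked like this (a minimal sketch; the variable names are illustrative, and the 15/2 split into the flattened 3x5 block plus the two depth bounds is the usual LLFF layout):

import numpy as np

data = np.load("poses_bounds.npy")            # shape (N, 17), one row per image
poses_hwf = data[:, :15].reshape(-1, 3, 5)    # N blocks of 3x5
bounds = data[:, 15:]                         # near/far depth bounds per image

c2w = poses_hwf[:, :, :4]   # 3x4 camera-to-world [R | t], camera axes [down, right, backwards]
hwf = poses_hwf[:, :, 4]    # [image height, image width, focal length]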
What I'm trying to do
- Create a camera for each pose (extrinsic and intrinsic camera parameters provided in poses_bounds.npy)
- Back-project the image to a 3D point cloud (I have a metric depth value for each pixel)
- Render an image of the scene from each of the previously defined cameras
What I got
The images I'm obtaining are upside down; I suppose this is due to an incorrect RT matrix conversion/multiplication.
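One assumption on my side that may be relevant: as far as I understand, PyTorch3D's PerspectiveCameras take R and T defining the world-to-view transform, applied to row vectors as X_cam = X_world @ R + T, with +X left, +Y up and +Z pointing into the scene in view space, while the LLFF pose above is a camera-to-world matrix with [down, right, backwards] axes. A rough, unverified sketch of the inversion step only (c2w_to_pytorch3d and axis_fix are hypothetical names; the identity axis_fix is certainly not the right flip, it only marks where the convention change would go):

import numpy as np
import torch

def c2w_to_pytorch3d(c2w_3x4, axis_fix=np.eye(3)):
    # Re-express the camera axes (placeholder permutation/flip)
    R_c2w = c2w_3x4[:, :3] @ axis_fix
    t_c2w = c2w_3x4[:, 3]
    # Row-vector convention X_cam = X_world @ R + T: R is the (axis-fixed)
    # camera-to-world rotation, T = -R_c2w^T @ t is the world-to-view translation
    R = torch.tensor(R_c2w, dtype=torch.float32)
    T = torch.tensor(-R_c2w.T @ t_c2w, dtype=torch.float32)
    return R.unsqueeze(0), T.unsqueeze(0)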
My code
I read methods to convert poses for the LLFF dataset here and here, but using them I got worse results.
import cv2
import numpy as np
import open3d as o3d
import torch
from pytorch3d.renderer import (
    PerspectiveCameras,
    PointsRasterizationSettings,
    PointsRenderer,
    PointsRasterizer,
    AlphaCompositor
)
from pytorch3d.structures import Pointclouds
from pytorch3d.io import IO
############################### Utility Functions ###############################
def convert_camera_pose(pose):
    # pose: 3x4 camera-to-world matrix from poses_bounds.npy
    pose = torch.tensor(pose, dtype=torch.float32, device=torch.device("cuda:0"))
    # Change of basis that maps (x, y, z) -> (-y, x, z)
    conv_matrix = torch.tensor([[0, -1, 0],
                                [1, 0, 0],
                                [0, 0, 1]
                                ], dtype=torch.float32, device=torch.device("cuda:0"))
    R = pose[:, :3]
    T = pose[:, 3]
    # Apply the change of basis to the rotation block
    R = conv_matrix.T @ R @ conv_matrix
    return R.unsqueeze(0), T.unsqueeze(0)

def image_generator(renderer, point_cloud):
    # Render the point cloud and convert the result to an HxWx3 uint8 array
    gen_img = renderer(point_cloud.cuda()).permute(0, 3, 1, 2)
    gen_img = gen_img[0].permute(1, 2, 0).cpu().numpy().astype(np.uint8)
    print(f"gen_img max: {gen_img.max()} | gen_img min: {gen_img.min()} | gen_img shape: {gen_img.shape}")
    return gen_img
############################### ################# ###############################
if __name__ == "__main__":
    # Paths to image and depth file
    image_path = "src/roasted_beef/0000.png"
    depth_path = "src/roasted_beef/0000.npz"
    pose_path = "src/roasted_beef/poses_bounds.npy"

    # Load image
    image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    h, w = image.shape[:2]
    print(f"Image shape: {image.shape}")

    # Load depth map (from meters to millimeters)
    depthmap = np.load(depth_path)['depth'] * 1000
    print(f"Depth map shape: {depthmap.shape}")

    # Load pose data
    # Poses are stored using [ down, right, backwards ] or [-y,x,z] in camera-to-world coordinates
    pose_data = np.load(pose_path)
    RT_0 = pose_data[:, :-2].reshape([-1, 3, 5])[0][:, :-1]
    RT_1 = pose_data[:, :-2].reshape([-1, 3, 5])[1][:, :-1]
    RT_2 = pose_data[:, :-2].reshape([-1, 3, 5])[2][:, :-1]
    RT_3 = pose_data[:, :-2].reshape([-1, 3, 5])[3][:, :-1]
    RT_4 = pose_data[:, :-2].reshape([-1, 3, 5])[4][:, :-1]

    # Convert to pytorch3d camera-to-world pose
    R_0, T_0 = convert_camera_pose(RT_0)
    R_1, T_1 = convert_camera_pose(RT_1)
    R_2, T_2 = convert_camera_pose(RT_2)
    R_3, T_3 = convert_camera_pose(RT_3)
    R_4, T_4 = convert_camera_pose(RT_4)
    # Define camera parameters
    focal_length = 731.3691627864987
    principal_point = ((image.shape[1] / 2, image.shape[0] / 2),)  # 676.0 507.0
    # principal_point = torch.FloatTensor([[0.0, 0.0]])
    image_size = ((image.shape[0], image.shape[1]),)
    print(f"Focal length: {focal_length} | Principal point: {principal_point} | Image size: {image_size}")

    # Create Perspective cameras
    camera = PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        image_size=image_size,
        in_ndc=False,
        R=R_0,
        T=T_0,
        device=torch.device("cuda:0"),
    )
    camera_1 = PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        image_size=image_size,
        in_ndc=False,
        R=R_1,
        T=T_1,
        device=torch.device("cuda:0"),
    )
    camera_2 = PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        image_size=image_size,
        in_ndc=False,
        R=R_2,
        T=T_2,
        device=torch.device("cuda:0"),
    )
    camera_3 = PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        image_size=image_size,
        in_ndc=False,
        R=R_3,
        T=T_3,
        device=torch.device("cuda:0"),
    )
    camera_4 = PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        image_size=image_size,
        in_ndc=False,
        R=R_4,
        T=T_4,
        device=torch.device("cuda:0"),
    )
    print("Perspective cameras created.")
    # Image coordinates (u, v)
    h, w = image.shape[:2]
    u, v = np.meshgrid(np.arange(w), np.arange(h), indexing='xy')

    # Flatten the depth map and the mesh grid
    depth_flat = torch.tensor(depthmap.flatten(), dtype=torch.float32, device=torch.device("cuda:0"))
    u_flat = torch.tensor(u.flatten(), dtype=torch.float32, device=torch.device("cuda:0"))
    v_flat = torch.tensor(v.flatten(), dtype=torch.float32, device=torch.device("cuda:0"))

    # Points in screen coordinates (u, v, depth) [B, N, 3]
    xy_depth = torch.stack((u_flat, v_flat, depth_flat)).permute(1, 0).unsqueeze(0)

    # Unproject from screen coordinates to world coordinates through camera 0
    xyz_unproj_world = camera.unproject_points(xy_depth, world_coordinates=True, from_ndc=False)

    # Create a point cloud using pytorch3d, with one RGB color per point
    colors_tensor = torch.tensor(image, dtype=torch.float32, device=torch.device("cuda:0")).view(-1, 3)
    point_cloud = Pointclouds(points=[xyz_unproj_world[0, :, :]], features=[colors_tensor])

    # Save the point cloud to a file
    IO().save_pointcloud(point_cloud, "outs/TEST_point_cloud.ply")
    # Rasterizer setup
    raster_settings = PointsRasterizationSettings(
        image_size=(h, w),
        radius=0.01,
        points_per_pixel=10
    )

    # Renderer setup
    renderer = PointsRenderer(
        rasterizer=PointsRasterizer(cameras=camera, raster_settings=raster_settings),
        compositor=AlphaCompositor()
    )
    renderer_1 = PointsRenderer(
        rasterizer=PointsRasterizer(cameras=camera_1, raster_settings=raster_settings),
        compositor=AlphaCompositor()
    )
    renderer_2 = PointsRenderer(
        rasterizer=PointsRasterizer(cameras=camera_2, raster_settings=raster_settings),
        compositor=AlphaCompositor()
    )
    renderer_3 = PointsRenderer(
        rasterizer=PointsRasterizer(cameras=camera_3, raster_settings=raster_settings),
        compositor=AlphaCompositor()
    )
    renderer_4 = PointsRenderer(
        rasterizer=PointsRasterizer(cameras=camera_4, raster_settings=raster_settings),
        compositor=AlphaCompositor()
    )
    # Image generation
    gen_img_0 = image_generator(renderer, point_cloud)
    gen_img_1 = image_generator(renderer_1, point_cloud)
    gen_img_2 = image_generator(renderer_2, point_cloud)
    gen_img_3 = image_generator(renderer_3, point_cloud)
    gen_img_4 = image_generator(renderer_4, point_cloud)

    # Save rendered images
    cv2.imwrite("outs/rendered_image_0.png", cv2.cvtColor(gen_img_0, cv2.COLOR_RGB2BGR))
    cv2.imwrite("outs/rendered_image_1.png", cv2.cvtColor(gen_img_1, cv2.COLOR_RGB2BGR))
    cv2.imwrite("outs/rendered_image_2.png", cv2.cvtColor(gen_img_2, cv2.COLOR_RGB2BGR))
    cv2.imwrite("outs/rendered_image_3.png", cv2.cvtColor(gen_img_3, cv2.COLOR_RGB2BGR))
    cv2.imwrite("outs/rendered_image_4.png", cv2.cvtColor(gen_img_4, cv2.COLOR_RGB2BGR))
I believe it is just a matter of the pose matrices and the use of the back-projection functions. I would appreciate any help on this, as I have not found a tutorial in the documentation covering this case.
The function get_rgbd_point_cloud is the recommended and friendly interface for back-projecting RGBD data through a camera.
By looking at the file https://github.com/facebookresearch/pytorch3d/blob/main/pytorch3d/implicitron/dataset/llff_dataset_map_provider.py and the functions it calls, you should have an example of how to load the LLFF cameras. You may well be doing this correctly anyway.
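For reference, a minimal sketch of how get_rgbd_point_cloud might be wired into the script above, reusing image, depthmap and camera from it. The batched (1, 3, H, W) / (1, 1, H, W) tensor layouts and the argument order (camera, image_rgb, depth_map) are my reading of pytorch3d/implicitron/tools/point_cloud_utils.py, so please double-check against the docstring:

import torch
from pytorch3d.implicitron.tools.point_cloud_utils import get_rgbd_point_cloud

device = torch.device("cuda:0")
# (H, W, 3) uint8 image -> (1, 3, H, W) float tensor, keeping the 0-255 range used above
image_t = torch.tensor(image, dtype=torch.float32, device=device).permute(2, 0, 1).unsqueeze(0)
# (H, W) depth map -> (1, 1, H, W)
depth_t = torch.tensor(depthmap, dtype=torch.float32, device=device).unsqueeze(0).unsqueeze(0)

# Back-project the RGB-D image through camera 0 into a Pointclouds object
point_cloud = get_rgbd_point_cloud(camera, image_t, depth_t)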