Reconstruction results of the iMAP mode are not good
bilibilijin opened this issue · 3 comments
#14 Hi! Thanks for the steps you gave in the previous question; I tried following those suggestions, but some problems are still not solved.
Question 1: I tried to reconstruct the TUM_RGBD dataset in iMAP mode. Since vMAP has no processing for this dataset, I referred to the TUM_RGBD dataloader in NICE-SLAM, and I am not sure whether my processing is correct. The result is shown below: there are a lot of overlapping shadows, and I don't know which step went wrong.
Here is the code:
```python
import os

import cv2
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision import transforms

# image_transforms, get_bbox2d_batch, enlarge_bbox and readEXR_onlydepth are
# helpers from the vMAP / NICE-SLAM code bases; adjust the imports to your repo layout.
import image_transforms


class TUM_RGBD(Dataset):
    def __init__(self, cfg):
        self.imap_mode = cfg.imap_mode
        self.root_dir = cfg.dataset_dir
        self.color_paths, self.depth_paths, self.poses = self.loadtum(
            self.root_dir, frame_rate=32)
        self.inst_path = '../Detic/datasets/TUM_RGBD/output_dataset_freiburg2_xyz_instance/'
        self.n_img = len(self.color_paths)
        self.depth_transform = transforms.Compose(
            [image_transforms.DepthScale(cfg.depth_scale),   # depth * 1/1000.0
             image_transforms.DepthFilter(cfg.max_depth)])   # 6.0 m: pixels with depth > 6 are marked far_mask and set to 0
        self.distortion = np.array([0.2312, -0.7849, -0.0033, -0.0001, 0.9172])
        self.W = cfg.W
        self.H = cfg.H
        self.fx = cfg.fx
        self.fy = cfg.fy
        self.cx = cfg.cx
        self.cy = cfg.cy
        self.edge = cfg.mw
        self.crop_size = None
        self.scale = 1.0
        # background semantic classes: undefined--1, undefined-0 etc.
        ################## vmap_part ##################
        self.background_cls_list = [255]
        self.bbox_scale = 0.2
        self.inst_dict = {}
        ###############################################

    def parse_list(self, filepath, skiprows=0):
        """ read list data """
        data = np.loadtxt(filepath, delimiter=' ',
                          dtype=np.unicode_, skiprows=skiprows)
        return data

    def associate_frames(self, tstamp_image, tstamp_depth, tstamp_pose, max_dt=0.08):
        """ pair images, depths, and poses """
        associations = []
        for i, t in enumerate(tstamp_image):
            if tstamp_pose is None:
                j = np.argmin(np.abs(tstamp_depth - t))
                if np.abs(tstamp_depth[j] - t) < max_dt:
                    associations.append((i, j))
            else:
                j = np.argmin(np.abs(tstamp_depth - t))
                k = np.argmin(np.abs(tstamp_pose - t))
                if (np.abs(tstamp_depth[j] - t) < max_dt) and \
                        (np.abs(tstamp_pose[k] - t) < max_dt):
                    associations.append((i, j, k))
        return associations

    def loadtum(self, datapath, frame_rate=-1):
        """ read video data in tum-rgbd format """
        if os.path.isfile(os.path.join(datapath, 'groundtruth.txt')):
            pose_list = os.path.join(datapath, 'groundtruth.txt')
        elif os.path.isfile(os.path.join(datapath, 'pose.txt')):
            pose_list = os.path.join(datapath, 'pose.txt')

        image_list = os.path.join(datapath, 'rgb.txt')
        depth_list = os.path.join(datapath, 'depth.txt')

        image_data = self.parse_list(image_list)
        depth_data = self.parse_list(depth_list)
        pose_data = self.parse_list(pose_list, skiprows=1)
        pose_vecs = pose_data[:, 1:].astype(np.float64)

        tstamp_image = image_data[:, 0].astype(np.float64)
        tstamp_depth = depth_data[:, 0].astype(np.float64)
        tstamp_pose = pose_data[:, 0].astype(np.float64)
        associations = self.associate_frames(
            tstamp_image, tstamp_depth, tstamp_pose)

        indicies = [0]
        for i in range(1, len(associations)):
            t0 = tstamp_image[associations[indicies[-1]][0]]
            t1 = tstamp_image[associations[i][0]]
            if t1 - t0 > 1.0 / frame_rate:
                indicies += [i]

        images, poses, depths, intrinsics = [], [], [], []
        inv_pose = None
        for ix in indicies:
            (i, j, k) = associations[ix]
            images += [os.path.join(datapath, image_data[i, 1])]
            depths += [os.path.join(datapath, depth_data[j, 1])]
            c2w = self.pose_matrix_from_quaternion(pose_vecs[k])
            if inv_pose is None:
                inv_pose = np.linalg.inv(c2w)
                c2w = np.eye(4)
            else:
                c2w = inv_pose @ c2w
            # c2w[:3, 1] *= -1
            # c2w[:3, 2] *= -1
            c2w = torch.from_numpy(c2w).float()
            poses += [c2w]

        return images, depths, poses

    def __len__(self):
        # return len(os.listdir(os.path.join(self.root_dir, "depth")))
        return self.n_img

    def as_intrinsics_matrix(self, intrinsics):
        """
        Get matrix representation of intrinsics.
        """
        K = np.eye(3)
        K[0, 0] = intrinsics[0]
        K[1, 1] = intrinsics[1]
        K[0, 2] = intrinsics[2]
        K[1, 2] = intrinsics[3]
        return K

    def __getitem__(self, index):
        color_path = self.color_paths[index]
        depth_path = self.depth_paths[index]
        (filepath, tempfilename) = os.path.split(color_path)
        inst_path = self.inst_path + tempfilename
        color_data = cv2.imread(color_path)
        if '.png' in depth_path:
            depth_data = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
        elif '.exr' in depth_path:
            depth_data = readEXR_onlydepth(depth_path)
        if self.distortion is not None:
            K = self.as_intrinsics_matrix([self.fx, self.fy, self.cx + self.edge, self.cy + self.edge])
            # undistortion is only applied on color image, not depth!
            color_data = cv2.undistort(color_data, K, self.distortion)

        color_data = cv2.cvtColor(color_data, cv2.COLOR_BGR2RGB).transpose(1, 0, 2)
        # color_data = color_data / 255.
        depth_data = depth_data.astype(np.float32).transpose(1, 0)
        depth_data = np.nan_to_num(depth_data, nan=0.)
        inst = cv2.imread(inst_path, cv2.IMREAD_UNCHANGED).astype(np.int32).transpose(1, 0)
        bbox_scale = self.bbox_scale

        T = None
        if self.poses is not None:
            T = self.poses[index]
            T = T.numpy()
            if np.any(np.isinf(T)):
                if index + 1 == self.__len__():
                    print("pose inf!")
                    return None
                return self.__getitem__(index + 1)

        H, W = depth_data.shape
        # color_data = cv2.resize(color_data, (W, H), interpolation=cv2.INTER_LINEAR)
        if self.edge:
            edge = self.edge  # crop image edge, there are invalid values on the edge of the color image
            color_data = color_data[edge:-edge, edge:-edge]
            depth_data = depth_data[edge:-edge, edge:-edge]
        if self.depth_transform:
            depth_data = self.depth_transform(depth_data)

        bbox_dict = {}
        if self.imap_mode:
            obj = np.zeros_like(depth_data)
        else:
            inst_list = []
            batch_masks = []
            if self.edge:
                edge = self.edge
                inst = inst[edge:-edge, edge:-edge]
            obj_ = np.zeros_like(inst)
            for inst_id in np.unique(inst):
                if inst_id in self.background_cls_list:
                    continue
                obj_mask = inst == inst_id
                batch_masks.append(obj_mask)
                inst_list.append(inst_id)
            if len(batch_masks) > 0:
                batch_masks = torch.from_numpy(np.stack(batch_masks))
                cmins, cmaxs, rmins, rmaxs = get_bbox2d_batch(batch_masks)
                for i in range(batch_masks.shape[0]):
                    w = rmaxs[i] - rmins[i]
                    h = cmaxs[i] - cmins[i]
                    if w <= 10 or h <= 10:  # skip boxes that are too small  # todo
                        continue
                    bbox_enlarged = enlarge_bbox([rmins[i], cmins[i], rmaxs[i], cmaxs[i]], scale=bbox_scale,
                                                 w=inst.shape[1], h=inst.shape[0])
                    inst_id = inst_list[i]
                    obj_[batch_masks[i]] = 1  # mark pixels where batch_masks is True as 1 in obj_
                    bbox_dict.update({inst_id: torch.from_numpy(np.array(
                        [bbox_enlarged[1], bbox_enlarged[3], bbox_enlarged[0],
                         bbox_enlarged[2]]))})  # bbox order; bbox_dict is keyed by obj_id
            inst[obj_ == 0] = 255  # 255 marks background
            obj = inst  # obj holds the per-pixel instance id, 255 for background
        bbox_dict.update({0: torch.from_numpy(np.array([int(0), int(obj.shape[0]), 0, int(obj.shape[1])]))})

        # wrap data to frame dict
        T_obj = np.eye(4)
        depth_data = torch.from_numpy(depth_data) * self.scale
        color_data = torch.from_numpy(color_data)
        sample = {"image": color_data.type(torch.uint8), "depth": depth_data, "T": T, "T_obj": T_obj,
                  "obj": obj, "bbox_dict": bbox_dict, "frame_id": index}
        if color_data is None or depth_data is None:
            print(color_path)
            print(depth_path)
            raise ValueError
        return sample

    def pose_matrix_from_quaternion(self, pvec):
        """ convert a (t, q) vector into a 4x4 pose matrix """
        from scipy.spatial.transform import Rotation
        pose = np.eye(4)
        pose[:3, :3] = Rotation.from_quat(pvec[3:]).as_matrix()
        pose[:3, 3] = pvec[:3]
        return pose
```
Question 2: This is about obtaining the instance id and semantic id from Detic. I have debugged the Detic code, but I can only get the semantic id of each object, i.e. the category id. How should I process the output to get the instance id?
For the above problems, I hope you can give me some corrections or concrete steps. I look forward to your reply and would be very grateful.
Hi,
- I'm not quite sure which part is wrong, but I suggest you accumulate the depth point clouds (or run TSDF-Fusion) and visualise the result to check that the depth scale, camera intrinsics and camera poses are all good (see the sketches after this reply). For our evaluation on TUM RGB-D, we didn't use a dataloader to read frames. Instead, we play the rosbag and use the orb-slam-ros-wrapper to process the frames online, and vMAP subscribes to the rostopic from ORB-SLAM to obtain the posed frames and the updated keyframe poses (usually after ORB-SLAM's local BA and global BA). So the mapping on TUM RGB-D data is real-time and finishes right after the rosbag has finished playing.
- For integrating Detic, I think you can follow this demo; the "instance" info is returned in the results Python dictionary. After that, you should keep both the semantic and instance info and build the data association (based on semantic consistency and 3D overlap) by following this function. Alternatively, I recently tried video segmentation, e.g. XMem, as the data-association tracker, and that also works well.
Hope this helps, and please let me know if it still doesn't work well!
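A minimal sanity-check sketch along those lines, using Open3D and the TUM_RGBD loader posted above: back-project a few depth frames with the loader's poses and see whether the geometry lines up. The intrinsics come from your config, and the 5000 depth factor assumes the standard TUM 16-bit depth PNGs.

```python
# Sanity-check sketch (not the vMAP evaluation pipeline): accumulate a few
# back-projected depth frames and inspect the overlap.
import numpy as np
import open3d as o3d

dataset = TUM_RGBD(cfg)                      # the dataloader from this issue
intrinsic = o3d.camera.PinholeCameraIntrinsic(
    cfg.W, cfg.H, cfg.fx, cfg.fy, cfg.cx, cfg.cy)

pcd_all = o3d.geometry.PointCloud()
for idx in range(0, len(dataset.color_paths), 20):   # every 20th frame
    color = o3d.io.read_image(dataset.color_paths[idx])
    depth = o3d.io.read_image(dataset.depth_paths[idx])
    rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth(
        color, depth,
        depth_scale=5000.0,                  # TUM RGB-D PNGs: raw 5000 == 1 m
        depth_trunc=6.0,
        convert_rgb_to_intensity=False)
    # back-project in the camera frame, then move into world frame with the c2w pose
    pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsic)
    pcd.transform(dataset.poses[idx].numpy())
    pcd_all += pcd

# If depth scale, intrinsics and poses are consistent, walls and desk should line
# up instead of showing "overlapping shadow" artefacts.
o3d.visualization.draw_geometries([pcd_all.voxel_down_sample(0.02)])
```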
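And a hedged sketch of turning Detic's detectron2-style predictions into per-frame instance ids: every detection in the "instances" output is one instance, so its index can serve as the instance id, while `pred_classes` gives the semantic id. Here `predictor`, `bgr_image`, `frame_name` and the output path are placeholders; the predictor is assumed to be set up as in the Detic demo. The result is the kind of instance-id PNG the loader above reads (255 = background).

```python
import cv2
import numpy as np

outputs = predictor(bgr_image)                  # detectron2-style inference on one frame
instances = outputs["instances"].to("cpu")

inst_img = np.full(bgr_image.shape[:2], 255, dtype=np.uint8)   # 255 = background
inst_to_sem = {}
for det_idx in range(len(instances)):
    mask = instances.pred_masks[det_idx].numpy()               # per-detection binary mask
    sem_id = int(instances.pred_classes[det_idx])              # semantic / category id
    inst_id = det_idx + 1                                      # per-frame instance id (avoid 0 and 255)
    inst_img[mask] = inst_id
    inst_to_sem[inst_id] = sem_id   # keep this mapping to associate instances across
                                    # frames via semantic consistency + 3D overlap

cv2.imwrite("output_instance/%s" % frame_name, inst_img)
```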
Thanks for your comments. I changed the depth scale in the TUM_RGBD dataset from 1000 to 5000 and got good results in the subsequent reconstruction, as shown below. I am not sure whether this reconstruction quality matches the effect shown in your paper.
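Concretely, the TUM RGB-D depth PNGs use a factor of 5000 (a raw value of 5000 corresponds to 1 m), so the loader now scales depth like this (the path is just an example):

```python
# TUM RGB-D stores depth as 16-bit PNGs with factor 5000, so use 1/5000 instead of 1/1000.
import cv2
import numpy as np

depth_path = "rgbd_dataset_freiburg2_xyz/depth/xxxx.png"   # example path
depth_raw = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
depth_m = depth_raw.astype(np.float32) / 5000.0            # depth in metres
```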
In addition, I still have some doubts. I tried to capture my own dataset with an RGB-D camera, used COLMAP to estimate the camera extrinsics, and used Track-Anything to prepare the object masks, but both the iMAP and vMAP reconstructions fall short of the expected quality. What I want to ask is:
Hi @bilibilijin, thanks for your updates.
Yeah, I think the results you showed match.
- For Replica and ScanNet we use and process all the frames sequentially. For TUM RGB-D and our own Kinect recordings, we play the ros-bag in real time and report the reconstruction results when the sequence ends. The real-time processing is around 5 Hz, i.e. a frame is taken roughly every 0.2 s, so the total number of frames will be around {sequence_time/0.2s}. For example, the sequence time of our Kinect recordings is around 170 s, so the total number of processed frames will be on the order of 700 (see the quick calculation below).
- I don't think the poses need to be super accurate, as we adopt the continually updated poses from ORB-SLAM3. COLMAP is definitely more than accurate enough.
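As a quick back-of-the-envelope check of that frame budget (the ~5 Hz rate and ~170 s duration are the approximate figures above; the exact count depends on how steadily the online rate is sustained):

```python
# Rough frame-budget estimate for real-time processing (approximate figures from the reply above).
processing_rate_hz = 5.0             # ~5 Hz online mapping, i.e. one frame every ~0.2 s
sequence_time_s = 170.0              # approximate length of the Kinect recording
print(int(sequence_time_s * processing_rate_hz))   # ~850 at full rate; around 700 in practice
```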