magic-research/PLLaVA

The load_video function does not seem right in eval_utils.py

ahmadmobeen opened this issue · 0 comments

Thank you for your contribution.
I found some confusing code here:

def load_video(self, video_path, num_segments=8, return_msg=False):
    """Open a video and return the frames selected by ``self.get_index``.

    Args:
        video_path: Passed unchanged to ``VideoReader``.
        num_segments: Forwarded to ``self.get_index`` together with the
            total frame count; presumably the number of frames to sample
            (``get_index`` is not shown here -- confirm).
        return_msg: When true, also return a sentence listing the
            timestamps (in seconds) of the selected frames.

    Returns:
        ``images_group`` (a list of images built with ``Image.fromarray``),
        or the tuple ``(images_group, msg)`` when ``return_msg`` is true.
    """
    vr = VideoReader(video_path, ctx=cpu(0))
    num_frames = len(vr)
    # The values returned by get_index are used directly as frame positions
    # in the second loop below.
    frame_indices = self.get_index(num_frames, num_segments)
    # Frame count floor-divided by the average fps, i.e. whole seconds.
    duration = len(vr) // vr.get_avg_fps()
    # int(duration) evenly spaced positions across the whole video.
    index = np.linspace(0, len(vr)-1, num=int(duration))
    # NOTE(review): `buffer` only feeds the loop below, whose result is
    # discarded, so this batch decode appears to be wasted work.
    buffer = vr.get_batch(index).asnumpy()
    # transform
    images_group = list()
    for frame in buffer:
        img = Image.fromarray(frame)
        images_group.append(img)
    # NOTE(review): re-initializing the list here throws away every image
    # appended in the loop above.
    images_group = list()
    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].asnumpy())
        images_group.append(img)
    if return_msg:
        fps = float(vr.get_avg_fps())
        # Timestamp of each selected frame, rounded to 0.1 s.
        sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
        # " " should be added in the start and end
        msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds."
        return images_group, msg
    else:
        return images_group

Why do we need `buffer`?
Why is `images_group` initialized a second time, discarding the images that were just built from `buffer`?

Secondly, what is the purpose of `num_segments` in the `get_index` function? In `upload_video`, `num_segments` falls back to the model's `num_frames` when it is not given:

num_segments = self.model.config.num_frames if num_segments is None else num_segments

Moreover, it seems that the offsets returned by `get_index` are used directly as `frame_indices`:
frame_indices = self.get_index(num_frames, num_segments)

However, this is not the case in the other evaluation scenarios, e.g.:

def get_index(self, bound, fps, max_frame, first_idx=0):
    """Pick ``self.num_segments`` frame indices spread evenly over a clip.

    The clip is split into ``self.num_segments`` equal segments and one
    index near the middle of each segment is returned.

    Args:
        bound: Optional ``(start, end)`` pair in seconds. When falsy, the
            whole video is used.
        fps: Frames per second, used to convert seconds to frame indices.
        max_frame: Largest valid frame index.
        first_idx: Smallest frame index that may be selected.

    Returns:
        A NumPy array of ``self.num_segments`` integer frame indices.
    """
    if bound:
        start, end = bound[0], bound[1]
    else:
        # No bound given: wide sentinels intended to cover the whole video
        # once the clamps below are applied.
        start, end = -100000, 100000
    # Clamp the window so that start_idx <= end_idx <= max_frame. Without
    # this, a bound that starts past the end of the video (or ends before
    # it starts) yields a negative segment size and indices outside the
    # video. In-range bounds are unaffected.
    start_idx = min(max(first_idx, round(start * fps)), max_frame)
    end_idx = max(min(round(end * fps), max_frame), start_idx)
    seg_size = float(end_idx - start_idx) / self.num_segments
    frame_indices = np.array([
        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
        for idx in range(self.num_segments)
    ])
    return frame_indices
def read_video(self, video_path, bound=None):
    """Decode the frames chosen by ``self.get_index`` from a video.

    Args:
        video_path: Passed unchanged to ``VideoReader``.
        bound: Optional value forwarded to ``self.get_index`` to restrict
            which part of the video is sampled.

    Returns:
        A list of images, one per selected frame, built with
        ``Image.fromarray``.
    """
    reader = VideoReader(video_path, ctx=cpu(0), num_threads=4)
    last_frame = len(reader) - 1
    frame_rate = float(reader.get_avg_fps())
    selected = self.get_index(bound, frame_rate, last_frame, first_idx=0)
    # One image per selected frame, in the order get_index returned them.
    return [Image.fromarray(reader[position].asnumpy()) for position in selected]