The load_video function does not seem right in eval_utils.py

Thank you for your contribution.
I found some confusing code here:

Lines 445 to 470 in 6f49fd2

    
           def load_video(self, video_path, num_segments=8, return_msg=False): 
               """Open a video and return the frames selected by self.get_index as images.

               Args:
                   video_path: Location of the video, passed straight to VideoReader.
                   num_segments: Forwarded to self.get_index together with the total
                       frame count.
                   return_msg: When truthy, also return a sentence listing the
                       timestamps (frame index / average fps) of the returned frames.

               Returns:
                   The list of images built with Image.fromarray, or the tuple
                   (images, msg) when return_msg is truthy.
               """
        
               vr = VideoReader(video_path, ctx=cpu(0)) 
        
               num_frames = len(vr) 
        
               frame_indices = self.get_index(num_frames, num_segments) 
        
               # NOTE(review): everything from here down to the second
               # `images_group = list()` has no effect on the return value: the
               # frames decoded into `buffer` are turned into images, and that list
               # is then replaced by a fresh empty one.
               duration = len(vr) // vr.get_avg_fps() 
        
               index = np.linspace(0, len(vr)-1, num=int(duration)) 
        
               buffer = vr.get_batch(index).asnumpy() 
        
               # transform 
        
               images_group = list() 
        
               for frame in buffer: 
        
                   img = Image.fromarray(frame) 
        
                   images_group.append(img) 
        
               # Re-initialization: discards the images built from `buffer` above;
               # only the frames at `frame_indices` are returned.
               images_group = list() 
        
               for frame_index in frame_indices: 
        
                   img = Image.fromarray(vr[frame_index].asnumpy()) 
        
                   images_group.append(img) 
        
               if return_msg: 
        
                   fps = float(vr.get_avg_fps()) 
        
                   # Timestamp of each returned frame, rounded to one decimal.
                   sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices]) 
        
                   # " " should be added in the start and end 
        
                   msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds." 
        
                   return images_group, msg 
        
               else: 
        
                   return images_group

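Why do we need `buffer`?
`images_group` is initialized a second time right after the first loop, so the images built from `buffer` are thrown away. Is that intended?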

Secondly, what is the purpose of `num_segments` in the `get_index` function? In `upload_video`, `num_segments` is set to `num_frames`:

PLLaVA/tasks/eval/eval_utils.py

Line 473 in 6f49fd2

    
           num_segments = self.model.config.num_frames if num_segments is None else num_segments

Moreover, it seems that the offsets returned by `get_index` are used directly as `frame_indices`:

PLLaVA/tasks/eval/eval_utils.py

Line 448 in 6f49fd2

frame_indices = self.get_index(num_frames, num_segments)

However, this is not the case in the other evaluation scenarios, e.g.:

PLLaVA/tasks/eval/eval_utils.py

Lines 298 to 322 in 6f49fd2

    
           def get_index(self, bound, fps, max_frame, first_idx=0):
               """Return self.num_segments frame indices spread over a frame window.

               The window runs from max(first_idx, round(start * fps)) to
               min(round(end * fps), max_frame). It is cut into self.num_segments
               equal-width segments and one index near the middle of each segment
               is returned.

               Args:
                   bound: Optional (start, end) pair; both values are multiplied by
                       fps, so they are presumably seconds. When falsy, a very
                       wide range is used so that the clamping selects the whole
                       [first_idx, max_frame] window.
                   fps: Frames per second used to convert bound to frame indices.
                   max_frame: Largest frame index allowed.
                   first_idx: Smallest frame index allowed.

               Returns:
                   A numpy array holding the selected integer frame indices.
               """
               # Default range is wide enough that the clamps below decide the window.
               begin_sec, stop_sec = -100000, 100000
               if bound:
                   begin_sec, stop_sec = bound[0], bound[1]

               window_start = max(first_idx, round(begin_sec * fps))
               window_end = min(round(stop_sec * fps), max_frame)
               segment_width = float(window_end - window_start) / self.num_segments

               return np.array([
                   int(window_start + (segment_width / 2) + np.round(segment_width * k))
                   for k in range(self.num_segments)
               ])
        
           def read_video(self, video_path, bound=None):
               """Decode the frames chosen by self.get_index and return them as images.

               Args:
                   video_path: Location of the video, passed to VideoReader.
                   bound: Optional range forwarded unchanged to self.get_index.

               Returns:
                   A list with one Image.fromarray result per selected frame.
               """
               reader = VideoReader(video_path, ctx=cpu(0), num_threads=4)
               last_frame = len(reader) - 1
               avg_fps = float(reader.get_avg_fps())

               selected = self.get_index(bound, avg_fps, last_frame, first_idx=0)
               return [Image.fromarray(reader[i].asnumpy()) for i in selected]

	def load_video(self, video_path, num_segments=8, return_msg=False):
		"""Open a video and return the frames selected by self.get_index as images.

		The original implementation also decoded roughly one frame per second
		with vr.get_batch, converted those frames to images, and then threw the
		list away by re-initializing it. That pass never influenced the result,
		so it has been removed.

		Args:
			video_path: Location of the video, passed straight to VideoReader.
			num_segments: Forwarded to self.get_index together with the total
				frame count.
			return_msg: When truthy, also return a sentence listing the
				timestamps (frame index / average fps) of the returned frames.

		Returns:
			The list of images built with Image.fromarray, or the tuple
			(images, msg) when return_msg is truthy.
		"""
		vr = VideoReader(video_path, ctx=cpu(0))
		num_frames = len(vr)
		# NOTE(review): this call passes (num_frames, num_segments); confirm it
		# resolves to a get_index that accepts that two-argument form.
		frame_indices = self.get_index(num_frames, num_segments)

		images_group = [
			Image.fromarray(vr[frame_index].asnumpy())
			for frame_index in frame_indices
		]
		if not return_msg:
			return images_group

		fps = float(vr.get_avg_fps())
		# Timestamp of each returned frame, rounded to one decimal.
		sec = ", ".join(str(round(f / fps, 1)) for f in frame_indices)
		# " " should be added in the start and end
		msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds."
		return images_group, msg

	def get_index(self, bound, fps, max_frame, first_idx=0):
		"""Pick self.num_segments frame indices, one near the middle of each segment.

		The frame window is [max(first_idx, round(start * fps)),
		min(round(end * fps), max_frame)], divided into self.num_segments
		equal-width segments.

		Args:
			bound: Optional (start, end) pair; both values are multiplied by
				fps, so they are presumably seconds. When falsy, a very wide
				range is used and the clamping selects the whole window.
			fps: Frames per second used to convert bound to frame indices.
			max_frame: Largest frame index allowed.
			first_idx: Smallest frame index allowed.

		Returns:
			A numpy array holding the selected integer frame indices.
		"""
		lo_sec, hi_sec = (bound[0], bound[1]) if bound else (-100000, 100000)
		first = max(first_idx, round(lo_sec * fps))
		last = min(round(hi_sec * fps), max_frame)
		step = float(last - first) / self.num_segments

		picks = []
		for k in range(self.num_segments):
			# Half a segment in from the window start, plus k whole segments.
			picks.append(int(first + (step / 2) + np.round(step * k)))
		return np.array(picks)

	def read_video(self, video_path, bound=None):
		"""Decode the frames chosen by self.get_index and return them as images.

		Args:
			video_path: Location of the video, passed to VideoReader.
			bound: Optional range forwarded unchanged to self.get_index.

		Returns:
			A list with one Image.fromarray result per selected frame.
		"""
		reader = VideoReader(video_path, ctx=cpu(0), num_threads=4)
		last_frame = len(reader) - 1
		avg_fps = float(reader.get_avg_fps())

		selected = self.get_index(bound, avg_fps, last_frame, first_idx=0)
		return [Image.fromarray(reader[i].asnumpy()) for i in selected]