Rhythmblue/i3d_finetune

Preprocessing videos: extracting RGB images + flow data

Closed this issue · 11 comments

Thank you @Rhythmblue for the great work!
I guess you preprocess the UCF101 videos upfront to extract the RGB frames and flow data. Can you share these as well? I am particularly interested in which tools you use to extract the flow data.
Thank you very much,
Frederik

Hi Frederik,

A. I have used two ways to extract flow:

  1. This repo can extract both flow and warped flow (where RANSAC-based homography estimation removes the global background motion caused by the camera).
    I use the opencv-3.1 branch:
    https://github.com/yjxiong/dense_flow/tree/opencv-3.1
    (CPU or GPU)

  2. The Python API of OpenCV (opencv-python).
    You need to install it with:

pip install opencv-python
pip install opencv-contrib-python

Then, you can use the TV-L1 API to extract optical flow. (Note: opencv-python cannot use the GPU to compute flow, but you can use multiprocessing instead.)
I have attached a Python script below.

B. There are several ways to extract RGB frames.
Sometimes I use ffmpeg, and sometimes the VideoCapture API of opencv-python, as sketched below.
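For reference, a minimal sketch of the VideoCapture approach (the paths and the frame-naming scheme are illustrative, not from the original script):

import os
import cv2

def extract_rgb(video_file, out_dir):
    """Dump every frame of a video as a JPEG named 000000.jpg, 000001.jpg, ..."""
    os.makedirs(out_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_file)
    i = 0
    while True:
        ret, frame = cap.read()   # frame is BGR, following OpenCV convention
        if not ret:               # end of video (or read error)
            break
        cv2.imwrite(os.path.join(out_dir, '{:06d}.jpg'.format(i)), frame)
        i += 1
    cap.release()

# extract_rgb('some_video.avi', 'frames/some_video')   # hypothetical paths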

Finally, we have created a new repo for I3D. You can try it:
https://github.com/USTC-Video-Understanding/I3D_Finetune

import os
import numpy as np
import cv2
from glob import glob
from multiprocessing import Pool


_IMAGE_SIZE = 256  # unused in this script


def cal_for_frames(video_path):
    """Compute TV-L1 flow for every frame in a directory of JPEG images."""
    frames = glob(os.path.join(video_path, '*.jpg'))
    frames.sort()

    flow = []
    prev = cv2.imread(frames[0])
    prev = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY)
    # The loop starts at frame 0, so the first flow field is frame 0 against
    # itself (all zeros); this keeps the flow count equal to the frame count.
    for i, frame_curr in enumerate(frames):
        curr = cv2.imread(frame_curr)
        curr = cv2.cvtColor(curr, cv2.COLOR_BGR2GRAY)
        tmp_flow = compute_TVL1(prev, curr)
        flow.append(tmp_flow)
        prev = curr

    return flow


def compute_TVL1(prev, curr, bound=15):
    """Compute the TV-L1 optical flow and quantize it to [0, 255]."""
    # In OpenCV 4.x this constructor lives in the contrib module:
    # cv2.optflow.DualTVL1OpticalFlow_create() (see the discussion below).
    TVL1 = cv2.DualTVL1OpticalFlow_create()
    flow = TVL1.calc(prev, curr, None)
    assert flow.dtype == np.float32

    # Map flow values from [-bound, bound] to [0, 255]; larger motion saturates.
    flow = (flow + bound) * (255.0 / (2 * bound))
    # Clip before casting so out-of-range values saturate instead of wrapping,
    # and use uint8 so cv2.imwrite can save the result as a grayscale JPEG.
    flow = np.clip(np.round(flow), 0, 255).astype(np.uint8)

    return flow


def save_flow(video_flows, flow_path):
    """Save the horizontal (u) and vertical (v) flow channels as grayscale JPEGs."""
    for i, flow in enumerate(video_flows):
        cv2.imwrite(os.path.join(flow_path.format('u'), "{:06d}.jpg".format(i)),
                    flow[:, :, 0])
        cv2.imwrite(os.path.join(flow_path.format('v'), "{:06d}.jpg".format(i)),
                    flow[:, :, 1])


def gen_video_path():
    """Collect (frame dir, flow dir) pairs, skipping videos whose flow already exists."""
    path = []
    flow_path = []
    length = []
    base = ''       # root directory of the extracted RGB frames
    flow_base = ''  # root directory where the flow images will be written
    for task in ['train', 'dev', 'test']:
        videos = os.listdir(os.path.join(base, task))
        for video in videos:
            tmp_path = os.path.join(base, task, video, '1')
            tmp_flow = os.path.join(flow_base, task, '{:s}', video)
            # The extension here must match the '*.jpg' glob in cal_for_frames.
            tmp_len = len(glob(os.path.join(tmp_path, '*.jpg')))
            u = False
            v = False
            if os.path.exists(tmp_flow.format('u')):
                if len(glob(os.path.join(tmp_flow.format('u'), '*.jpg'))) == tmp_len:
                    u = True
            else:
                os.makedirs(tmp_flow.format('u'))
            if os.path.exists(tmp_flow.format('v')):
                if len(glob(os.path.join(tmp_flow.format('v'), '*.jpg'))) == tmp_len:
                    v = True
            else:
                os.makedirs(tmp_flow.format('v'))
            if u and v:
                print('skip:' + tmp_flow)
                continue

            path.append(tmp_path)
            flow_path.append(tmp_flow)
            length.append(tmp_len)
    return path, flow_path, length


def extract_flow(args):
    """Worker: compute and save the flow for one video."""
    video_path, flow_path = args
    flow = cal_for_frames(video_path)
    save_flow(flow, flow_path)
    print('complete:' + flow_path)
    return


if __name__ == '__main__':
    pool = Pool(2)   # two worker processes; raise this to use more CPU cores

    video_paths, flow_paths, video_lengths = gen_video_path()

    pool.map(extract_flow, zip(video_paths, flow_paths))

Thank you @Rhythmblue - this is exactly what I was looking for! I will re-use it in a project on sign language recognition (for deaf people). Thanks and kind regards, Frederik

But TVL1 = cv2.DualTVL1OpticalFlow_create() does not work in the latest opencv-python (4.0.0):
"AttributeError: module 'cv2.cv2' has no attribute 'DualTVL1OpticalFlow_create'"
Do you know how to solve this problem?
Looking forward to your answer! Thank you

@jianweidong

  1. pip uninstall opencv-python
    pip install opencv-contrib-python
    (reason in section 2.a)
  2. Modify TVL1 = cv2.DualTVL1OpticalFlow_create()
    to TVL1 = cv2.optflow.DualTVL1OpticalFlow_create().
    This works for version 4.x.x; see the version-agnostic sketch below.
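For reference, a minimal version-agnostic sketch (assuming opencv-contrib-python is installed, since the optflow module ships with the contrib packages):

import cv2

# Pick whichever TV-L1 constructor the installed OpenCV build exposes.
if hasattr(cv2, 'DualTVL1OpticalFlow_create'):
    TVL1 = cv2.DualTVL1OpticalFlow_create()          # OpenCV 3.x
else:
    TVL1 = cv2.optflow.DualTVL1OpticalFlow_create()  # OpenCV 4.x + contrib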

Hi Rhythmblue,
I am using the same code that you linked in your reply (https://github.com/USTC-Video-Understanding/I3D_Finetune) to fine-tune on UCF101 and HMDB51. With the fine-tuned UCF101 model, the accuracy is as follows:
RGB data: 0.921, flow data: 0.9984, fused: 0.792

With the fine-tuned HMDB51 model, the accuracy is as follows:
RGB data: 0.7577, flow data: 0.6749, fused: 0.5957
Note: I generated the flow data for HMDB51 using PWC-Net.

I am trying to understand why I get such a low accuracy with HMDB51 and such a high accuracy with the UCF101 flow data. Could you please suggest any directions for investigating this?

Thanks,
Veeru.

@VeeranjaneyuluThoka, sorry for the late response. I was busy running experiments, trying to meet a conference deadline.
At the moment I find it difficult to review the TensorFlow code, so I am considering re-implementing the I3D code in PyTorch.

About your question: I think 0.99 for flow on UCF101 is an unexpectedly high result, so the evaluation code may have a problem. (Sometimes the problem lies in the file lists of video names and ground truths.)

Hi @Rhythmblue, thank you for sharing the code to compute optical flow. Could you explain more about why you use the bound and also convert the flow to int?

@yuanzhedong, I suggest changing bound to 20, following denseflow.

  1. These operations follow the settings of the TSN paper and the denseflow code.
  2. bound limits the maximum per-pixel movement that can be represented. It's an optional setting.
  3. int: before the cast to int, the flow is first mapped to 0-255. The reason is to save storage: an 8-bit JPEG takes far less space than the original float32 array.
  4. The bound also serves to let the flow array be saved as a grayscale image: it limits the range and normalizes the values. A worked example of the quantization follows below.
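For reference, a minimal worked sketch of this quantization and its inverse, assuming bound = 20 as suggested above (the function names are illustrative, not from the original script):

import numpy as np

bound = 20  # maximum per-pixel displacement represented

def flow_to_img(flow):
    # Map float flow in [-bound, bound] linearly onto uint8 values in [0, 255].
    img = (flow + bound) * (255.0 / (2 * bound))
    return np.clip(np.round(img), 0, 255).astype(np.uint8)

def img_to_flow(img):
    # Approximate inverse, applied when the flow JPEGs are loaded for training.
    return img.astype(np.float32) * (2 * bound / 255.0) - bound

flow = np.array([-25.0, -20.0, 0.0, 7.3, 20.0], dtype=np.float32)
img = flow_to_img(flow)   # -> [  0,   0, 128, 174, 255]; motion beyond the bound saturates
back = img_to_flow(img)   # -> approx. [-20.0, -20.0, 0.08, 7.29, 20.0]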

Thank you for the detailed answer, it's very helpful!!