conglu1997/nppac

How to improve the evaluation efficiency?

Closed this issue · 2 comments

I used the following code to evaluate the pretrained model, but found that evaluation is very slow (about 23 min per episode). Is there anything wrong with the code? Or could you please provide a standard evaluation script? Thanks for your help!

import mj_envs
import gym
import numpy as np
import torch
import gpytorch
from gp_models import MultitaskGPModel
from rlkit.torch.pytorch_util import set_gpu_mode
from tqdm import tqdm
import copy
import time
device = torch.device('cuda:1')

def rollout(
        env,
        agent,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
        preprocess_obs_for_policy_fn=None,
        get_action_kwargs=None,
        return_dict_obs=False,
        full_o_postprocess_func=None,
        reset_callback=None,
):
    if render_kwargs is None:
        render_kwargs = {}
    if get_action_kwargs is None:
        get_action_kwargs = {}
    if preprocess_obs_for_policy_fn is None:
        preprocess_obs_for_policy_fn = lambda x: x
    raw_obs = []
    raw_next_obs = []
    observations = []
    actions = []
    rewards = []
    terminals = []
    dones = []
    agent_infos = []
    env_infos = []
    next_observations = []
    path_length = 0
    # agent.reset()
    o = env.reset()
    if reset_callback:
        reset_callback(env, agent, o)
    if render:
        # todo: debug
        env.mj_render()
        # env.render(**render_kwargs)
    while path_length < max_path_length:
        print('path_length:', path_length)
        raw_obs.append(o)
        # todo: debug

        # o_for_agent = torch.from_numpy(o).cuda().float().unsqueeze(0)

        o_torch = torch.from_numpy(np.array([o])).float().to(device)
        output = model(o_torch)
        observed_pred = likelihood(output)
        a = observed_pred.mean.data.cpu().numpy()

        if len(a) == 1:
            a = a[0]

        # # o_for_agent = o
        # # a = agent.get_action(o_for_agent, **get_action_kwargs)
        # a, *_ = agent(o_for_agent, **get_action_kwargs)
        # a = a.detach().cpu().numpy()
        # # a = agent.get_action(o_for_agent, **get_action_kwargs)[0][0]
        agent_info = None
        if full_o_postprocess_func:
            full_o_postprocess_func(env, agent, o)

        next_o, r, done, env_info = env.step(copy.deepcopy(a))
        if render:
            # todo: debug
            env.mj_render()

            # env.render(**render_kwargs)
        observations.append(o)
        rewards.append(r)
        terminal = False
        if done:
            # terminal=False if TimeLimit caused termination
            if not env_info.pop('TimeLimit.truncated', False):
                terminal = True
        terminals.append(terminal)
        dones.append(done)
        actions.append(a)
        next_observations.append(next_o)
        raw_next_obs.append(next_o)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if done:
            break
        o = next_o
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    next_observations = np.array(next_observations)
    if return_dict_obs:
        observations = raw_obs
        next_observations = raw_next_obs
    rewards = np.array(rewards)
    if len(rewards.shape) == 1:
        rewards = rewards.reshape(-1, 1)
    return dict(
        observations=observations,
        actions=actions,
        rewards=rewards,
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        dones=np.array(dones).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        full_observations=raw_obs,
        full_next_observations=raw_next_obs,
    )


def simulate_policy(env, policy, T=100, H=200, gpu=True, render=False):
    if gpu:
        set_gpu_mode(True)
        # policy.cuda()
        policy.to(device)
        print('use GPU')
    # policy = MakeDeterministic(policy)
    episode = 0
    success_time = 0
    env.seed(1)
    for episode in tqdm(range(0, T)):
        print('episode:{}'.format(episode))
        path = rollout(
            env,
            policy,
            max_path_length=H,
            render=render,
        )
        if path['env_infos'][-1]['goal_achieved'] is True:
            success_time += 1
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        time.sleep(0.02)
    success_time /= T
    return success_time



env = gym.make('door-binary-v0')

obs_dim = env.observation_space.low.size
action_dim = env.action_space.low.size
data_set = '../d4rl_model/offpolicy_hand_data/door2_sparse.npy'
model_path = '../nppac/nppac/door/gp_door_multitask_1000.pt'

data = np.load(data_set, allow_pickle=True)
keep_num = 1000
use_ard = True
gp_type = 'multitask'
gp_rank = 1
kernel_type = 'matern12'
# Ablation to randomly filter the dataset, not active by default.
if keep_num < len(data):
    print(f'Keeping {keep_num} trajectories.')
    data = np.random.choice(data, keep_num, replace=False)

if type(data[0]['observations'][0]) is dict:
    # Convert to just the states
    for traj in data:
        traj['observations'] = [t['state_observation'] for t in traj['observations']]

train_x = torch.from_numpy(np.array([j for i in [traj['observations'] for traj in data] for j in i])).float().to(
    device)
train_y = torch.from_numpy(np.array([j for i in [traj['actions'] for traj in data] for j in i])).float().to(
    device)

print('Data Loaded!')

# Initialize likelihood and model
likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=action_dim).to(device)
likelihood.eval()
ard_num_dims = obs_dim if use_ard else None

model = MultitaskGPModel(train_x, train_y, likelihood, num_tasks=action_dim, rank=gp_rank,
                         ard_num_dims=ard_num_dims, kernel_type=kernel_type).to(device)

model_dict = torch.load(model_path, map_location=device)
model.load_state_dict(model_dict)
model.eval()


success_rate = simulate_policy(env, model, render=False, T=100)
print('success rate is :', success_rate)

What you have looks good, but it's missing

with torch.no_grad(), gpytorch.settings.fast_pred_var():

for the parts that evaluate the GP.
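For example, the prediction step inside rollout() would look roughly like this (untested sketch, same variable names as your script):

# Disable autograd and enable GPyTorch's fast (LOVE) predictive variances
# for the GP inference step.
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    o_torch = torch.from_numpy(np.array([o])).float().to(device)
    output = model(o_torch)
    observed_pred = likelihood(output)
    a = observed_pred.mean.cpu().numpy()

if len(a) == 1:
    a = a[0]

torch.no_grad() skips building the autograd graph, and fast_pred_var() switches GPyTorch to its fast predictive-variance path, which is typically where most of the per-step cost goes for an exact GP conditioned on this many training points.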

Thanks for your help! I added with torch.no_grad(), gpytorch.settings.fast_pred_var(): and evaluation is now much faster!