Unable to get the same results while formulating differently
rohitdavas opened this issue · 1 comment
rohitdavas commented
I have been trying to write my own version of the code for Figure 2.2, but I am unable to get the expected kind of graph.
Of course, your code works fine. I was using your code as a reference and trying to formulate it as an environment-agent interaction, rather than framing both in one class as your code does.
Can you see what I should improve? I am not able to find the mistake.
```python
# graph support
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm


class Agent:
    '''agent for the bandit env

    the agent has the following responsibilities:
    - choosing the best action
    - updating the action value
    '''
    def __init__(self, epsilon=0, n_actions=10):
        self.epsilon = epsilon
        self.q = np.zeros(n_actions)
        self.q_steps = np.zeros_like(self.q)
        self.n_actions = n_actions

    def action(self):
        if np.random.randn() < self.epsilon:
            return np.random.choice(self.n_actions)
        else:
            q_best = np.max(self.q)
            return np.random.choice(np.where(self.q == q_best)[0])

    def _sampleAverage(self, action, reward):
        step_size = self.q_steps[action]
        self.q[action] += (reward - self.q[action]) / step_size

    def reset(self):
        self.q = np.zeros(self.n_actions)
        self.q_steps = np.zeros_like(self.q)

    def update(self, action, reward):
        self.q_steps[action] += 1
        self._sampleAverage(action, reward)


class Env:
    '''a k-arm bandit environment

    the environment has the following behaviour:
    - given an action, return a reward
    - there is no observation in a bandit env
    '''
    def __init__(self, n_arm=10):
        self.n = n_arm

    def _init_reward(self):
        # rewards are sampled from a normal distribution with
        # mean = q_star(At) and variance = 1,
        # hence the need to initialise self.q_star
        self.q_star = np.random.randn(self.n)

    def reward(self, action, var=1):
        return var * np.random.randn() + self.q_star[action]

    def reset(self):
        self._init_reward()
        self.best_action = np.argmax(self.q_star)

    def step(self, action):
        return self.reward(action)


# ------------------------------------------
# main function to simulate the behaviour
# ------------------------------------------
def single_run(bandit_env, my_agent, steps=1000):
    my_agent.reset()
    bandit_env.reset()
    reward_series = np.zeros(steps)
    for i in range(steps):
        action = my_agent.action()
        reward = bandit_env.step(action)
        reward_series[i] = reward
        my_agent.update(action, reward)
    return reward_series


def q2(runs=2000):
    epsilons = [0, 0.1, 0.01]
    plt.figure(figsize=(10, 20))
    plt.xlabel("steps")
    plt.ylabel("Average rewards")
    for e in epsilons:
        reward_series = []
        my_agent = Agent(epsilon=e, n_actions=10)
        bandit_env = Env(n_arm=10)
        for _ in tqdm(range(runs)):
            reward_series.append(single_run(bandit_env, my_agent, steps=1000))
        plt.plot(np.mean(reward_series, axis=0), label="epsilon " + str(e))
    plt.legend()
    plt.savefig("./images/q2.png")
    plt.close()


if __name__ == "__main__":
    q2()
```
Here is the graph I am getting:
rohitdavas commented
I figured it out, thanks. Closing it.
- The mistake was in the random action sampling: the exploration check should compare epsilon against a draw from a uniform distribution (`np.random.rand()`), not from a standard normal (`np.random.randn()`).
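For reference, a minimal, self-contained sketch of the fix; only the `action` method changes, and the trimmed `__init__` here just carries the attributes that `action` needs (the full class stays as posted above):

```python
import numpy as np

class Agent:
    def __init__(self, epsilon=0, n_actions=10):
        self.epsilon = epsilon
        self.q = np.zeros(n_actions)
        self.n_actions = n_actions

    def action(self):
        # explore with probability epsilon: compare epsilon against a
        # uniform draw in [0, 1) via np.random.rand(), not a standard
        # normal draw via np.random.randn()
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.n_actions)
        # exploit: pick among the greedy actions, breaking ties at random
        q_best = np.max(self.q)
        return np.random.choice(np.where(self.q == q_best)[0])
```

With `np.random.randn()` the comparison is against a standard normal sample, which falls below a small epsilon roughly half the time, so the agent explores on about 50% of steps for every epsilon and the three average-reward curves come out almost identical.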