"""Module containing parent class for policies.
"""
from __future__ import division
import numpy as np
import tools
class Policy:
"""Parent class for policies.
Inherit this class to make a policy. Action selection is based on
maintaining a ``pi`` array which holds action selection
probabilities.
Args:
action_space (numpy array of action): Numpy array containing
all actions available to any agent.
value_function (fun, optional): A function used by the Policy to
update values of pi. This is usually a value function
learned by a GVF.
action_equality (fun, optional): The function used to compare
two action objects to determine whether they are equivalent.
Returns True if the actions are equivalent and False
otherwise.
feature_indices (numpy array of bool, optional): Indices of the
feature vector corresponding to indices used by the
:py:obj:`value_function`.
Attributes:
action_space (numpy array of action): Numpy array containing
all actions available to any agent.
value_function (fun): A function used by the Policy to
update values of pi. This is usually a value function
learned by a GVF.
action_equality (fun): The function used to compare two
action objects to determine whether they are equivalent.
Returns True if the actions are equivalent and False
otherwise.
feature_indices (numpy array of bool): The indices of the
feature vector corresponding to the indices used by the
:py:obj:`value_function`.
        pi (numpy array of float): Numpy array of action selection
            probabilities, one for the action at the same index in
            ``action_space``.
last_index (int): The index of the last action chosen by the
policy.
"""
def __init__(self,
action_space,
feature_indices=None,
value_function=None,
action_equality=tools.equal_twists,
*args,
**kwargs):
        self.action_space = np.asarray(action_space)
        # Start from a uniform distribution over the action space.
        self.pi = np.ones(self.action_space.size) / self.action_space.size
self.value_function = value_function
self.action_equality = action_equality
self.feature_indices = feature_indices
self.last_index = 0
    def update(self, phi, observation, *args, **kwargs):
"""Updates the probilities of taking each action
This function should be replaced when creating a new policy. It
takes a state ``(phi, observation)`` and modifies the ``pi``
array accordingly.
Args:
phi (numpy array of bool): Binary feature vector.
observation (dictionary): User-defined dictionary containing
miscellaneous information about the state that should
not be included in the feature vector ``phi``.
*args: Ignored.
**kwargs: Ignored.
"""
        if self.value_function is not None:
            if self.feature_indices is not None:
                phi = phi[self.feature_indices]
            q_fun = np.vectorize(lambda a: self.value_function(phi, a))
            q_values = q_fun(self.action_space)
            self.pi = q_values / q_values.sum()
    def get_probability(self, action, choice=True, *args, **kwargs):
"""Get the probability of taking the provided action.
This function can usually be used without being overwritten.
        Raises an ``AssertionError`` if the provided action is not
        equivalent to any action in ``action_space`` according to
        ``action_equality``.
Args:
action (action): Find the probability of this action.
choice (bool): If set to true, updates ``last_index``.
*args: Ignored.
**kwargs: Ignored.
Returns:
Float from ``pi`` corresponding to ``action``.
"""
equal_action = lambda i: self.action_equality(action,
self.action_space[i])
indices = list(filter(equal_action, range(self.action_space.size)))
        assert len(indices) > 0, "Action not found in action_space."
index = indices[0]
if choice:
self.last_index = index
return self.pi[index]
    def choose_action(self, *args, **kwargs):
"""Updates ``last_index`` and chooses an action according to ``pi``.
Args:
*args: Ignored.
**kwargs: Ignored.
Returns:
Action at the sampled index.
"""
self.last_index = np.random.choice(self.action_space.size, p=self.pi)
return self.action_space[self.last_index]
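# --- Example subclass (illustrative sketch, not part of the original module) ---
# ``GreedyPolicy`` below is a hypothetical subclass showing the extension
# point described in the class docstring: override ``update`` so that it
# fills ``self.pi`` however the new policy requires. Here all of the
# probability mass is placed on the action with the highest value under the
# supplied ``value_function``.
class GreedyPolicy(Policy):
    def update(self, phi, observation, *args, **kwargs):
        if self.feature_indices is not None:
            phi = phi[self.feature_indices]
        q_values = np.array([self.value_function(phi, a)
                             for a in self.action_space])
        self.pi = np.zeros(self.action_space.size)
        self.pi[np.argmax(q_values)] = 1.0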
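# --- Example usage (illustrative sketch) ---
# The toy value function, integer action space, and equality test below are
# stand-ins chosen only for demonstration; a real agent would supply a value
# function learned by a GVF and an action comparison such as
# ``tools.equal_twists`` for Twist actions.
if __name__ == "__main__":
    def toy_value_function(phi, action):
        # Hypothetical state-action value: weight the number of active
        # features by the (integer) action.
        return float(action) * phi.sum()

    actions = np.array([1, 2, 3])
    policy = Policy(action_space=actions,
                    value_function=toy_value_function,
                    action_equality=lambda a, b: a == b)

    phi = np.array([True, False, True])
    policy.update(phi, observation={})
    print("pi:", policy.pi)                      # values normalised to sum to 1
    print("P(a=2):", policy.get_probability(2))  # probability of choosing 2
    print("chosen:", policy.choose_action())     # sampled according to pi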