"""Module containing parent class for policies.
"""
from __future__ import division
import numpy as np
import tools
class Policy:
"""Parent class for policies.
Inherit this class to make a policy. Action selection is based on
maintaining a ``pi`` array which holds action selection
probabilities.
Args:
action_space (numpy array of action): Numpy array containing
all actions available to any agent.
value_function (fun, optional): A function used by the Policy to
update values of pi. This is usually a value function
learned by a GVF.
action_equality (fun, optional): The function used to compare
two action objects to determine whether they are equivalent.
Returns True if the actions are equivalent and False
otherwise.
feature_indices (numpy array of bool, optional): Indices of the
feature vector corresponding to indices used by the
:py:obj:`value_function`.
Attributes:
action_space (numpy array of action): Numpy array containing
all actions available to any agent.
value_function (fun): A function used by the Policy to
update values of pi. This is usually a value function
learned by a GVF.
action_equality (fun): The function used to compare two
action objects to determine whether they are equivalent.
Returns True if the actions are equivalent and False
otherwise.
feature_indices (numpy array of bool): The indices of the
feature vector corresponding to the indices used by the
:py:obj:`value_function`.
        pi (numpy array of float): Numpy array of action selection
            probabilities, one for the action at the same index in
            ``action_space``.
last_index (int): The index of the last action chosen by the
policy.
"""
def __init__(self,
action_space,
feature_indices=None,
value_function=None,
action_equality=tools.equal_twists,
*args,
**kwargs):
        self.action_space = np.asarray(action_space)
        # Start from a uniform distribution over the action space.
        self.pi = np.ones(self.action_space.size) / self.action_space.size
self.value_function = value_function
self.action_equality = action_equality
self.feature_indices = feature_indices
self.last_index = 0
    def update(self, phi, observation, *args, **kwargs):
"""Updates the probilities of taking each action
This function should be replaced when creating a new policy. It
takes a state ``(phi, observation)`` and modifies the ``pi``
array accordingly.
Args:
phi (numpy array of bool): Binary feature vector.
observation (dictionary): User-defined dictionary containing
miscellaneous information about the state that should
not be included in the feature vector ``phi``.
*args: Ignored.
**kwargs: Ignored.
"""
        if self.value_function is not None:
            if self.feature_indices is not None:
                phi = phi[self.feature_indices]
            q_fun = np.vectorize(lambda a: self.value_function(phi, a))
            q_values = q_fun(self.action_space)
            self.pi = q_values / q_values.sum()
    def get_probability(self, action, choice=True, *args, **kwargs):
"""Get the probability of taking the provided action.
This function can usually be used without being overwritten.
        Raises an ``AssertionError`` if the provided action is not
        equivalent to any action in ``action_space`` according to
        ``action_equality``.
Args:
action (action): Find the probability of this action.
choice (bool): If set to true, updates ``last_index``.
*args: Ignored.
**kwargs: Ignored.
Returns:
Float from ``pi`` corresponding to ``action``.
"""
equal_action = lambda i: self.action_equality(action,
self.action_space[i])
indices = list(filter(equal_action, range(self.action_space.size)))
        assert len(indices) > 0, "Action not found in action_space."
index = indices[0]
if choice:
self.last_index = index
return self.pi[index]
    def choose_action(self, *args, **kwargs):
"""Updates ``last_index`` and chooses an action according to ``pi``.
Args:
*args: Ignored.
**kwargs: Ignored.
Returns:
Action at the sampled index.
"""
self.last_index = np.random.choice(self.action_space.size, p=self.pi)
return self.action_space[self.last_index]
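# --- Example subclass (illustrative sketch, not part of the original module) ---
# ``GreedyPolicy`` below is a hypothetical subclass showing the extension
# point described in the class docstring: override ``update`` so that it
# fills ``self.pi`` however the new policy requires. Here all of the
# probability mass is placed on the action with the highest value under the
# supplied ``value_function``.
class GreedyPolicy(Policy):
    def update(self, phi, observation, *args, **kwargs):
        if self.feature_indices is not None:
            phi = phi[self.feature_indices]
        q_values = np.array([self.value_function(phi, a)
                             for a in self.action_space])
        self.pi = np.zeros(self.action_space.size)
        self.pi[np.argmax(q_values)] = 1.0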
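# --- Example usage (illustrative sketch) ---
# The toy value function, integer action space, and equality test below are
# stand-ins chosen only for demonstration; a real agent would supply a value
# function learned by a GVF and an action comparison such as
# ``tools.equal_twists`` for Twist actions.
if __name__ == "__main__":
    def toy_value_function(phi, action):
        # Hypothetical state-action value: weight the number of active
        # features by the (integer) action.
        return float(action) * phi.sum()

    actions = np.array([1, 2, 3])
    policy = Policy(action_space=actions,
                    value_function=toy_value_function,
                    action_equality=lambda a, b: a == b)

    phi = np.array([True, False, True])
    policy.update(phi, observation={})
    print("pi:", policy.pi)                      # values normalised to sum to 1
    print("P(a=2):", policy.get_probability(2))  # probability of choosing 2
    print("chosen:", policy.choose_action())     # sampled according to pi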