# Source code for wis_to_gtd

import numpy as np


class WISTOGTD:
    """Implements WIS-TO-GTD(lambda) with linear function approximation.

    See https://armahmood.github.io/files/MS-WIS-O(n)-UAI-2015.pdf for more
    details.

    Args:
        num_features (int): Length of weight vectors.
        u (float): Initial value for the usage vector. Can be interpreted as
            inverse initial step size.
        eta (float): Recency-weighting factor. Can be interpreted as desired
            final step size.
        beta (float): Secondary learning rate.
        lmbda (float): Trace decay rate.
        **kwargs: Accepted and ignored.

    Attributes:
        theta: Primary weight vector.
        old_theta: Copy of ``theta`` as it was at the start of the most
            recent call to :meth:`update`.
        w: Secondary weight vector.
        e: Eligibility trace vector.
        e_grad: Trace used in the gradient-correction term of ``theta``.
        e_w: Trace used to update the secondary weights ``w``.
        u: Usage vector.
        v: Usage helper vector.
        beta: Secondary learning rate.
        eta: Recency-weighting factor.
        old_lmbda: Trace decay rate (stored under this name so it can later
            be made state-dependent).
        old_gamma: Discounting parameter from the previous timestep.
        old_rho: Importance sampling weight from previous timestep.
        delta: TD-error of previous timestep.
        tderr_elig: delta * e for RUPEE calculations.
    """

    def __init__(self,
                 num_features,
                 u,
                 eta,
                 beta,
                 lmbda,
                 **kwargs):
        self.e = np.zeros(num_features)
        self.theta = np.zeros(num_features)
        self.old_theta = np.zeros(num_features)
        self.u = np.ones(num_features) * u
        self.v = np.zeros(num_features)
        self.w = np.zeros(num_features)
        self.e_grad = np.zeros(num_features)
        self.e_w = np.zeros(num_features)

        # Kept as an assert (not a raise) so callers that rely on
        # AssertionError for bad hyper-parameters keep working.
        assert beta > 0 and eta > 0 and u > 0

        self.beta = beta
        self.eta = eta
        self.old_lmbda = lmbda
        self.old_gamma = 0
        self.delta = 0
        self.old_rho = 1
        self.tderr_elig = np.zeros(num_features)

    def update(self, phi, phi_prime, cumulant, gamma, rho, **kwargs):
        """Perform one learning step and return ``phi``.

        Args:
            phi: Feature vector of the current state (NumPy array with one
                entry per feature).
            phi_prime: Feature vector of the next state.
            cumulant: Scalar signal being predicted at this step.
            gamma: Discounting parameter for this step.
            rho: Importance sampling weight for this step.
            **kwargs: Accepted and ignored.

        Returns:
            ``phi`` unchanged, for compatibility with calculating RUPEE for
            control GVFs.
        """
        lmbda = self.old_lmbda  # replace this when lambda changes by state
        gam_lam = self.old_lmbda * self.old_gamma

        # Snapshot the weights *by value*. ``self.theta`` is modified in
        # place below, so keeping only a reference would make ``old_theta``
        # identical to ``theta`` and silently zero the correction term.
        prev_theta = self.theta.copy()

        # Usage vector and its helper.
        phi_sq = phi * phi
        k = 1.0 - self.eta * phi_sq
        self.u *= k
        self.u += rho * phi_sq + (rho - 1) * gam_lam * k * self.v

        self.v *= gam_lam * rho * k
        self.v += rho * phi_sq

        # Per-feature step size is 1/u; features whose usage is exactly zero
        # get a step size of zero instead of a division by zero.
        non_zero = self.u != 0
        alpha = np.zeros(self.u.size)
        alpha[non_zero] = 1.0 / self.u[non_zero]

        # NOTE(review): arithmetic of the trace update is kept exactly as in
        # the original implementation; confirm it against the referenced
        # paper.
        self.e = (rho * alpha * phi +
                  (gam_lam * rho * (self.e - rho * alpha * phi) *
                   np.dot(phi, self.e)))
        self.e_grad = rho * (gam_lam * self.e_grad + phi)
        self.e_w = (gam_lam * self.old_rho * self.e_w +
                    (self.beta *
                     (1 - gam_lam * self.old_rho * np.dot(phi, self.e_w)) *
                     phi))

        self.delta = (cumulant + gamma * np.dot(phi_prime, self.theta) -
                      np.dot(phi, self.theta))
        self.tderr_elig = self.delta * self.e

        # NOTE(review): the middle term multiplies element-wise, as in the
        # original code; confirm whether an inner product with ``phi`` is
        # intended.
        self.theta += (self.tderr_elig
                       + ((self.e - alpha * rho * phi) *
                          (self.theta - self.old_theta) * phi)
                       - (alpha * gamma * (1 - lmbda) *
                          np.dot(self.w, self.e_grad)) * phi_prime)
        self.w += (rho * self.delta * self.e_w
                   - self.beta * np.dot(self.w, phi) * phi)

        # Remember this step's quantities for the next call.
        self.old_gamma = gamma
        self.old_lmbda = lmbda
        self.old_rho = rho
        self.old_theta = prev_theta

        # for compatibility with calculating RUPEE for control gvfs
        return phi

    def predict(self, phi):
        """Return the current prediction: the dot product of phi and theta."""
        return np.dot(phi, self.theta)