Source code for wis_gtd

import numpy as np


[docs]class WISGTD: """Implements WIS-GTD(lambda) with linear function approximation. See https://armahmood.github.io/files/MS-WIS-O(n)-UAI-2015.pdf for more details. Args: num_features (int): Length of weight vectors. u (float): Initial value for the usage vector. Can be interpreted as inverse initial step size. eta (float): Recency-weighting factor. Can be interpreted as desired final step size. beta (float): Secondary learning rate. lmbda (float): Trace decay rate. Attributes: theta: Primary weight vector. w: Secondary weight vector. e: Eligibility trace vector. u: Usage vector. v: Usage helper vector. beta: Secondary learning rate. lmbda: Trace decay rate. old_gamma: Discounting parameter from the previous timestep. delta: TD-error of previous timestep. tderr_elig: delta * e for RUPEE calculations. """ def __init__(self, num_features, u, eta, beta, lmbda, **kwargs): self.e = np.zeros(num_features) self.theta = np.zeros(num_features) self.u = np.ones(num_features) * u self.v = np.zeros(num_features) self.w = np.zeros(num_features) assert beta > 0 and eta > 0 and u > 0 self.beta = beta self.eta = eta self.old_lmbda = lmbda self.old_gamma = 0 self.delta = 0 self.tderr_elig = np.zeros(num_features)
[docs] def update(self, phi, phi_prime, cumulant, gamma, rho, **kwargs): lmbda = self.old_lmbda # replace this when lambda changes by state gam_lam = self.old_lmbda * self.old_gamma phi_sq = phi * phi k = np.ones(phi.size) - self.eta * phi_sq self.u *= k self.u += rho * phi_sq + (rho - 1) * gam_lam * k * self.v self.v *= gam_lam * rho * k self.v += rho * phi_sq non_zero = self.u != 0 alpha = np.ones(self.u.size)[non_zero]/self.u[non_zero] alpha[~non_zero] = 0 self.delta = (cumulant + gamma * np.dot(phi_prime, self.theta) - np.dot(phi, self.theta)) self.e = rho * (gam_lam * self.e + phi) self.tderr_elig = self.delta * self.e self.theta += alpha * (self.tderr_elig - (gamma * (1 - lmbda) * np.dot(self.e, self.w) * phi_prime)) self.w += self.beta * (self.tderr_elig - np.dot(self.w, phi) * phi) self.old_gamma = gamma self.old_lmbda = lmbda # for compatibility with calculating RUPEE for control gvfs return phi
[docs] def predict(self, phi): return np.dot(phi, self.theta)