Source code for to_gtd

import numpy as np

import tools


class TOGTD:
    """Implements True Online GTD(lambda) with linear function approximation.

    Args:
        num_features (int): Length of weight vectors.
        alpha (float): Primary learning rate.
        beta (float): Secondary learning rate.
        lmbda (float): Trace decay rate.
        decay (bool, optional): Whether to decay alpha and beta.
        **kwargs: Accepted and ignored.

    Attributes:
        theta: Primary weight vector.
        old_theta: Copy of ``theta`` as it was before the most recent update.
        w: Secondary weight vector.
        e: Eligibility trace vector.
        e_grad: Gradient correction trace vector.
        e_w: Secondary eligibility trace vector.
        alpha: Primary learning rate schedule (``tools.decay`` or
            ``tools.constant``); ``update`` reads it through ``.next()``.
        beta: Secondary learning rate schedule, same protocol as ``alpha``.
        lmbda: Trace decay rate.
        old_gamma: Discounting parameter from the previous timestep.
        old_rho: Importance sampling ratio from the previous timestep.
        delta: TD-error of previous timestep.
        tderr_elig: delta * e for RUPEE calculations.
    """

    def __init__(self, num_features, alpha, beta, lmbda, decay=False,
                 **kwargs):
        self.theta = np.zeros(num_features)
        self.old_theta = np.zeros(num_features)
        self.w = np.zeros(num_features)
        self.e = np.zeros(num_features)
        self.e_grad = np.zeros(num_features)
        self.e_w = np.zeros(num_features)
        self.alpha = tools.decay(alpha) if decay else tools.constant(alpha)
        self.beta = tools.decay(beta) if decay else tools.constant(beta)
        self.lmbda = lmbda
        self.old_gamma = 0
        self.delta = 0
        self.old_rho = 1
        self.tderr_elig = np.zeros(num_features)

    def update(self, phi, phi_prime, cumulant, gamma, rho, **kwargs):
        """Perform one learning step from a single transition.

        Args:
            phi: Feature vector of the current state.
            phi_prime: Feature vector of the next state.
            cumulant: Signal accumulated by the prediction (the "reward").
            gamma: Discounting parameter applied to the next state's value.
            rho: Importance sampling ratio for this transition.
            **kwargs: Accepted and ignored.

        Returns:
            ``phi`` unchanged, for compatibility with calculating RUPEE for
            control gvfs.
        """
        alpha = self.alpha.next()
        beta = self.beta.next()

        # Snapshot the weights by value. ``theta`` is updated in place below,
        # so keeping only a reference would make ``old_theta`` alias ``theta``
        # and the (theta - old_theta) correction would always be zero.
        prev_theta = self.theta.copy()

        self.delta = (cumulant
                      + gamma * np.dot(phi_prime, self.theta)
                      - np.dot(phi, self.theta))

        # The traces decay with the discount stored on the previous call;
        # ``gamma`` is only used for the bootstrap target and the gradient
        # correction term.
        trace_decay = self.old_gamma * self.lmbda

        self.e = rho * (trace_decay * self.e
                        + alpha * (1 - rho * trace_decay
                                   * np.dot(phi, self.e)) * phi)
        self.e_grad = rho * (trace_decay * self.e_grad + phi)

        # Same shape as the ``e`` update: the parenthesised factor is a
        # scalar that multiplies ``phi``.
        w_decay = self.old_rho * trace_decay
        self.e_w = (w_decay * self.e_w
                    + beta * (1 - w_decay * np.dot(phi, self.e_w)) * phi)

        self.tderr_elig = self.delta * self.e

        # ``theta`` must be updated before ``w`` so that the gradient
        # correction uses the secondary weights from before this step.
        self.theta += (self.tderr_elig
                       + (self.e - alpha * rho * phi)
                       * np.dot(self.theta - self.old_theta, phi)
                       - alpha * gamma * (1 - self.lmbda)
                       * np.dot(self.w, self.e_grad) * phi_prime)
        self.w += (rho * self.delta * self.e_w
                   - beta * np.dot(phi, self.w) * phi)

        self.old_gamma = gamma
        self.old_rho = rho
        self.old_theta = prev_theta

        # for compatibility with calculating RUPEE for control gvfs
        return phi

    def predict(self, phi):
        """Return the current prediction ``dot(phi, theta)`` for ``phi``."""
        return np.dot(phi, self.theta)