# -*- coding: utf-8 -*-
# ELEKTRONN2 Toolkit
# Copyright (c) 2015 Marius Killinger
# All rights reserved
from __future__ import absolute_import, division, print_function
from builtins import filter, hex, input, int, map, next, oct, pow, range, super, zip
import logging
import numpy as np
import theano.tensor as T
from . import graphutils
from . import variables
logger = logging.getLogger('elektronn2log')
class Optimiser(object):
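    """
    Base class for all optimisers.

    Learning rate, weight decay and momentum are shared class attributes,
    so setting them (``setlr``, ``setwd``, ``setmom``) affects all
    optimiser instances.
    """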
global_lr = variables.VariableParam(value=1,
name='lr',
dtype=graphutils.floatX)
global_weight_decay = variables.VariableParam(value=0,
name='weight_decay',
dtype=graphutils.floatX)
global_mom = variables.VariableParam(value=0.9,
name='mom',
dtype=graphutils.floatX)
@classmethod
    def setlr(cls, val):
"""
Set learning rate (global to all optimisers)
"""
val = graphutils.as_floatX(val)
cls.global_lr.set_value(val)
@classmethod
    def setwd(cls, val):
"""
Set weight decay parameter (global to all optimisers)
"""
val = graphutils.as_floatX(val)
cls.global_weight_decay.set_value(val)
@classmethod
    def setmom(cls, val):
"""
Set momentum parameter (global to all optimisers)
"""
val = graphutils.as_floatX(val)
cls.global_mom.set_value(val)
def __init__(self, inputs, loss, grads, params, additional_outputs):
if additional_outputs is None:
additional_outputs = []
self.meta_params = dict(lr=self.global_lr,
mom=self.global_mom,
wd=self.global_weight_decay)
self.input = inputs
self.output = [loss,] + additional_outputs
self.loss = loss
self.params = params
self.grads = grads
self.step = None
self.last_exec_time = None
self.last_dir = []
# the higher the index the older the params
self.params_cycler = [self.alloc_shared_grads(name_suffix='_lp_%i'%i)
for i in range(3)]
    def alloc_shared_grads(self, name_suffix='_lg', init_val=0.0):
"""Returns new shared variables matching the shape of params/gradients"""
grads = []
for i, p in enumerate(self.params):
name = p.name+name_suffix
value = np.ones_like(p.get_value()) * graphutils.as_floatX(init_val)
g = variables.VariableParam(value=value, name=name)
grads.append(g)
return grads
    def clear_last_dir(self, last_dir=None):
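        """
        Reset the given shared state variables to zero
        (defaults to ``self.last_dir``).
        """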
if last_dir is None:
last_dir = self.last_dir
for d in last_dir:
d.set_value(np.zeros(d.get_value().shape, dtype=d.dtype))
    def get_rotational_updates(self):
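        """
        Returns updates that push the current parameter values through the
        queue of stored copies in ``params_cycler`` (higher index = older),
        so that ``repair_fuckup`` can roll back to a recent state.
        """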
updates = []
for x in zip(self.params, *self.params_cycler):
            new_param, param_queue = x[0], x[1:]
for i in range(len(param_queue)-1, 0, -1):
updates.append((param_queue[i], param_queue[i-1]))
updates.append((param_queue[0], new_param))
return updates
    def repair_fuckup(self):
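        """
        Roll the parameters back to the oldest stored copy and reset the
        accumulated optimiser state (e.g. after the loss has exploded).
        """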
self.clear_last_dir()
for p, p_old in zip(self.params, self.params_cycler[-1]):
p.set_value(p_old.get_value())
def __call__(self, *args):
"""
Perform an update step
[data (,labels etc...)] --> [loss (, add. outputs...)]
"""
ret = list(self.step(*args))
ret[0] = graphutils.as_floatX(ret[0]) # the scalar loss
self.last_exec_time = self.step.last_exec_time
return ret
###############################################################################
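# A minimal usage sketch (hypothetical names: assumes `inputs`, `loss`,
# `grads` and `params` were built as Theano graph objects elsewhere, e.g.
# by a model class):
#
#   opt = SGD(inputs, loss, grads, params, extra_updates=[])
#   Optimiser.setlr(0.01)    # learning rate is global to all optimisers
#   ret = opt(data, labels)  # one update step; ret[0] is the scalar loss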
class SGD(Optimiser):
def __init__(self, inputs, loss, grads, params, extra_updates,
additional_outputs=None):
super(SGD, self).__init__(inputs, loss, grads, params,
additional_outputs)
        self.last_dir = self.alloc_shared_grads()  # last direction of update
updates = []
for g, d, p in zip(self.grads, self.last_dir, self.params):
new_d = g + self.global_mom * d
if p.apply_reg:
if p.apply_reg > 1:
multiplier = graphutils.as_floatX(p.apply_reg)
new_p = p - self.global_lr * \
(new_d + self.global_weight_decay * p * multiplier)
else:
new_p = p - self.global_lr * \
(new_d + self.global_weight_decay * p)
else:
new_p = p - self.global_lr * new_d
updates.append((d, new_d))
updates.append((p, new_p))
updates.extend(extra_updates)
updates.extend(self.get_rotational_updates())
self.step = graphutils.make_func(self.input, self.output,
updates=updates, name='SGD step')
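    # Plain-math sketch of the step assembled above (d: stored direction,
    # g: gradient, p: parameter):
    #   d_new = g + mom * d
    #   p_new = p - lr * (d_new + wd * p)   # wd term only if p.apply_reg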
class AdaGrad(Optimiser):
    # Tries to make faster progress on parameters that usually receive small
    # gradients (but it ignores their actual direction: a parameter with many
    # small gradients in the same direction and one with many small gradients
    # in opposing directions both end up with a high effective LR!)
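    # Sketch of the per-parameter rule assembled below (h: accumulated
    # squared gradients, g: current gradient):
    #   h_new = h + g**2
    #   p_new = p - lr / sqrt(h_new) * (g + wd * p)   # wd only if p.apply_reg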
def __init__(self, inputs, loss, grads, params, extra_updates,
additional_outputs=None):
super(AdaGrad, self).__init__(inputs, loss, grads, params,
additional_outputs)
self._init_done = False
self.hs = self.alloc_shared_grads('_h', init_val=0.0)
updates = []
for g, h, p in zip(self.grads, self.hs, self.params):
new_h = h + T.square(g)
if p.apply_reg: # apply to W but not b
new_p = p - self.global_lr / T.sqrt(new_h) * \
(g + self.global_weight_decay * p)
else:
new_p = p - self.global_lr / T.sqrt(new_h) * g
updates.append((h, new_h))
updates.append((p, new_p))
updates.extend(extra_updates)
self.step = graphutils.make_func(self.input, self.output,
updates=updates, name='AdaGrad step')
        # Create init_func to seed h from one extra gradient evaluation, so
        # the very first step already divides by a sensible gradient scale
updates = []
for g, h in zip(self.grads, self.hs):
new_h = h + T.square(g)
updates.append((h, new_h))
self.init_func = graphutils.make_func(self.input, [], updates=updates,
name='AdaGrad initialiser')
def __call__(self, *args):
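        # Seed h on the first step (and again after repair_fuckup, which
        # resets _init_done)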
if not self._init_done:
self.init_func(*args)
self._init_done = True
return super(AdaGrad, self).__call__(*args)
    def repair_fuckup(self):
super(AdaGrad, self).repair_fuckup()
self.clear_last_dir(self.hs)
self._init_done = False
class AdaDelta(Optimiser):
    # Like AdaGrad, but accumulates squared gradients only over a decaying
    # window. The delta part is a diagonal Hessian approximation.
    # Claims to be robust against sudden large gradients: the denominator
    # then explodes, but that explosion persists for a while...
    # (and this argument holds for any method accumulating squared gradients)
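    # Sketch of the rule assembled below (s: decaying avg of squared
    # gradients, d: decaying avg of squared updates, mom acts as the decay
    # rate rho, eps = 1e-5):
    #   s_new = rho * s + (1 - rho) * g**2
    #   step  = g * sqrt(d + eps) / sqrt(s + eps)
    #   d_new = rho * d + (1 - rho) * step**2
    #   p_new = p - lr * (step + wd * p)   # wd term only if p.apply_reg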
def __init__(self, inputs, loss, grads, params, extra_updates,
additional_outputs=None):
super(AdaDelta, self).__init__(inputs, loss, grads, params,
additional_outputs)
self.squared_accum = self.alloc_shared_grads("_sq") # last directions update
self.delta_accum = self.alloc_shared_grads("_d") # last directions update
epsilon = 1e-5
updates = []
for g, s, d, p in zip(self.grads, self.squared_accum,
self.delta_accum, self.params):
new_s = self.global_mom * s + (1.0 - self.global_mom) * T.square(g)
direction = (g * T.sqrt(d + epsilon) / T.sqrt(s + epsilon))
new_d = self.global_mom * d + (1 - self.global_mom) * T.square(direction)
if p.apply_reg:
if p.apply_reg > 1:
multiplier = graphutils.as_floatX(p.apply_reg)
new_p = p - self.global_lr * \
(direction + self.global_weight_decay * p * multiplier)
else:
new_p = p - self.global_lr * \
(direction + self.global_weight_decay * p)
else:
new_p = p - self.global_lr * direction
updates.append((s, new_s))
updates.append((d, new_d))
updates.append((p, new_p))
updates.extend(extra_updates)
updates.extend(self.get_rotational_updates())
self.step = graphutils.make_func(self.input, self.output,
updates=updates, name='AdaDelta step')
    def repair_fuckup(self):
super(AdaDelta, self).repair_fuckup()
self.clear_last_dir(self.squared_accum)
self.clear_last_dir(self.delta_accum)
class Adam(Optimiser):
    # Like AdaGrad with a windowed squared_accum, plus momentum and a bias
    # correction for the initial phase (t).
    # The normalisation of Adam and AdaGrad (and RMSProp) does not damp but
    # rather exaggerates sudden steep gradients (their squared_accum is small
    # while the current gradient is large).
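    # Sketch of the step assembled below (m: first moment, s: second moment,
    # global_mom takes the role of beta1, eps = 1e-5):
    #   m_new  = beta1 * m + (1 - beta1) * g
    #   s_new  = beta2 * s + (1 - beta2) * g**2
    #   factor = sqrt(1 - beta2**t) / (1 - beta1**t)   # bias correction
    #   p_new  = p - lr * (factor * m_new / sqrt(s_new + eps) + wd * p)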
def __init__(self, inputs, loss, grads, params, extra_updates,
additional_outputs=None):
super(Adam, self).__init__(inputs, loss, grads, params,
additional_outputs)
self.squared_accum = self.alloc_shared_grads("_sq") # last directions update
self.momentum = self.alloc_shared_grads("_m") # last directions update
epsilon = 1e-5
# self.beta1 = variables.VariableParam(value=0.9, name='beta1',
# dtype=graphutils.floatX)
self.beta2 = variables.VariableParam(value=0.999, name='beta2',
dtype=graphutils.floatX)
#self.meta_params['beta1'] = self.beta1
self.meta_params['beta2'] = self.beta2
        t_old = variables.VariableParam(value=0.0, name='t',
                                        dtype=graphutils.floatX)
updates = []
t = 1 + t_old
updates.append((t_old, t))
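        # Standard Adam bias correction, compensating for m and s being
        # initialised at zero (global_mom takes the role of beta1)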
factor = T.sqrt(1-self.beta2**t)/(1-self.global_mom**t)
for g, s, m, p in zip(self.grads, self.squared_accum,
self.momentum, self.params):
new_m = self.global_mom * m + (1.0 - self.global_mom) * g
new_s = self.beta2 * s + (1.0 - self.beta2) * T.square(g)
direction = factor * new_m / T.sqrt(new_s + epsilon)
if p.apply_reg:
if p.apply_reg > 1:
multiplier = graphutils.as_floatX(p.apply_reg)
new_p = p - self.global_lr * \
(direction + self.global_weight_decay * p * multiplier)
else:
new_p = p - self.global_lr * \
(direction + self.global_weight_decay * p)
else:
new_p = p - self.global_lr * direction
updates.append((s, new_s))
updates.append((m, new_m))
updates.append((p, new_p))
updates.extend(extra_updates)
updates.extend(self.get_rotational_updates())
self.step = graphutils.make_func(self.input, self.output,
updates=updates, name='Adam step')
    def repair_fuckup(self):
super(Adam, self).repair_fuckup()
self.clear_last_dir(self.squared_accum)
self.clear_last_dir(self.momentum)
class CG(Optimiser):
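    # Conjugate-gradient optimiser: placeholder, not implemented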
pass