# -*- coding: utf-8 -*-
# ELEKTRONN2 Toolkit
# Copyright (c) 2015 Marius Killinger
# All rights reserved
from __future__ import absolute_import, division, print_function
from builtins import filter, hex, input, int, map, next, oct, pow, range, super, zip
import logging
import numpy as np
import theano.tensor as T
from . import graphutils
from . import variables
logger = logging.getLogger('elektronn2log')
class Optimiser(object):
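    """
    Base class for all optimisers.

    Learning rate, weight decay and momentum are shared class attributes,
    so setting them (``setlr``, ``setwd``, ``setmom``) affects all
    optimiser instances.
    """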
global_lr = variables.VariableParam(value=1,
name='lr',
dtype=graphutils.floatX)
global_weight_decay = variables.VariableParam(value=0,
name='weight_decay',
dtype=graphutils.floatX)
global_mom = variables.VariableParam(value=0.9,
name='mom',
dtype=graphutils.floatX)
@classmethod
    def setlr(cls, val):
"""
Set learning rate (global to all optimisers)
"""
val = graphutils.as_floatX(val)
cls.global_lr.set_value(val)
@classmethod
    def setwd(cls, val):
"""
Set weight decay parameter (global to all optimisers)
"""
val = graphutils.as_floatX(val)
cls.global_weight_decay.set_value(val)
@classmethod
    def setmom(cls, val):
"""
Set momentum parameter (global to all optimisers)
"""
val = graphutils.as_floatX(val)
cls.global_mom.set_value(val)
def __init__(self, inputs, loss, grads, params, additional_outputs):
if additional_outputs is None:
additional_outputs = []
self.meta_params = dict(lr=self.global_lr,
mom=self.global_mom,
wd=self.global_weight_decay)
self.input = inputs
self.output = [loss,] + additional_outputs
self.loss = loss
self.params = params
self.grads = grads
self.step = None
self.last_exec_time = None
self.last_dir = []
# the higher the index the older the params
self.params_cycler = [self.alloc_shared_grads(name_suffix='_lp_%i'%i)
for i in range(3)]
    def alloc_shared_grads(self, name_suffix='_lg', init_val=0.0):
"""Returns new shared variables matching the shape of params/gradients"""
grads = []
for i, p in enumerate(self.params):
name = p.name+name_suffix
value = np.ones_like(p.get_value()) * graphutils.as_floatX(init_val)
g = variables.VariableParam(value=value, name=name)
grads.append(g)
return grads
    def clear_last_dir(self, last_dir=None):
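        """
        Reset the given shared state variables to zero
        (defaults to ``self.last_dir``).
        """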
if last_dir is None:
last_dir = self.last_dir
for d in last_dir:
d.set_value(np.zeros(d.get_value().shape, dtype=d.dtype))
    def get_rotational_updates(self):
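        """
        Returns updates that push the current parameter values through the
        queue of stored copies in ``params_cycler`` (higher index = older),
        so that ``repair_fuckup`` can roll back to a recent state.
        """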
updates = []
for x in zip(self.params, *self.params_cycler):
            new_param, param_queue = x[0], x[1:]
for i in range(len(param_queue)-1, 0, -1):
updates.append((param_queue[i], param_queue[i-1]))
updates.append((param_queue[0], new_param))
return updates
    def repair_fuckup(self):
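        """
        Roll the parameters back to the oldest stored copy and reset the
        accumulated optimiser state (e.g. after the loss has exploded).
        """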
self.clear_last_dir()
for p, p_old in zip(self.params, self.params_cycler[-1]):
p.set_value(p_old.get_value())
def __call__(self, *args):
"""
Perform an update step
[data (,labels etc...)] --> [loss (, add. outputs...)]
"""
ret = list(self.step(*args))
ret[0] = graphutils.as_floatX(ret[0]) # the scalar loss
self.last_exec_time = self.step.last_exec_time
return ret
###############################################################################
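# A minimal usage sketch (hypothetical names: assumes `inputs`, `loss`,
# `grads` and `params` were built as Theano graph objects elsewhere, e.g.
# by a model class):
#
#   opt = SGD(inputs, loss, grads, params, extra_updates=[])
#   Optimiser.setlr(0.01)    # learning rate is global to all optimisers
#   ret = opt(data, labels)  # one update step; ret[0] is the scalar loss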
class SGD(Optimiser):
def __init__(self, inputs, loss, grads, params, extra_updates,
additional_outputs=None):
super(SGD, self).__init__(inputs, loss, grads, params,
additional_outputs)
        self.last_dir = self.alloc_shared_grads()  # last direction of update
updates = []
for g, d, p in zip(self.grads, self.last_dir, self.params):
new_d = g + self.global_mom * d
if p.apply_reg:
if p.apply_reg > 1:
multiplier = graphutils.as_floatX(p.apply_reg)
new_p = p - self.global_lr * \
(new_d + self.global_weight_decay * p * multiplier)
else:
new_p = p - self.global_lr * \
(new_d + self.global_weight_decay * p)
else:
new_p = p - self.global_lr * new_d
updates.append((d, new_d))
updates.append((p, new_p))
updates.extend(extra_updates)
updates.extend(self.get_rotational_updates())
self.step = graphutils.make_func(self.input, self.output,
updates=updates, name='SGD step')
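    # Plain-math sketch of the step assembled above (d: stored direction,
    # g: gradient, p: parameter):
    #   d_new = g + mom * d
    #   p_new = p - lr * (d_new + wd * p)   # wd term only if p.apply_reg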
class AdaGrad(Optimiser):
    # Tries to make faster progress on parameters that usually receive small
    # gradients (but it ignores their actual direction: a parameter with many
    # small gradients in the same direction and one with many small gradients
    # in opposing directions both end up with a high effective LR!)
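    # Sketch of the per-parameter rule assembled below (h: accumulated
    # squared gradients, g: current gradient):
    #   h_new = h + g**2
    #   p_new = p - lr / sqrt(h_new) * (g + wd * p)   # wd only if p.apply_reg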
def __init__(self, inputs, loss, grads, params, extra_updates,
additional_outputs=None):
super(AdaGrad, self).__init__(inputs, loss, grads, params,
additional_outputs)
self._init_done = False
self.hs = self.alloc_shared_grads('_h', init_val=0.0)
updates = []
for g, h, p in zip(self.grads, self.hs, self.params):
new_h = h + T.square(g)
if p.apply_reg: # apply to W but not b
new_p = p - self.global_lr / T.sqrt(new_h) * \
(g + self.global_weight_decay * p)
else:
new_p = p - self.global_lr / T.sqrt(new_h) * g
updates.append((h, new_h))
updates.append((p, new_p))
updates.extend(extra_updates)
self.step = graphutils.make_func(self.input, self.output,
updates=updates, name='AdaGrad step')
        # Create init_func to seed h from one extra gradient evaluation, so
        # the very first step already divides by a sensible gradient scale
updates = []
for g, h in zip(self.grads, self.hs):
new_h = h + T.square(g)
updates.append((h, new_h))
self.init_func = graphutils.make_func(self.input, [], updates=updates,
name='AdaGrad initialiser')
def __call__(self, *args):
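        # Seed h on the first step (and again after repair_fuckup, which
        # resets _init_done)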
if not self._init_done:
self.init_func(*args)
self._init_done = True
return super(AdaGrad, self).__call__(*args)
    def repair_fuckup(self):
super(AdaGrad, self).repair_fuckup()
self.clear_last_dir(self.hs)
self._init_done = False
class AdaDelta(Optimiser):
    # Like AdaGrad, but accumulates squared gradients only over a decaying
    # window. The delta part is a diagonal Hessian approximation.
    # Claims to be robust against sudden large gradients: the denominator
    # then explodes, but that explosion persists for a while...
    # (and this argument holds for any method accumulating squared gradients)
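    # Sketch of the rule assembled below (s: decaying avg of squared
    # gradients, d: decaying avg of squared updates, mom acts as the decay
    # rate rho, eps = 1e-5):
    #   s_new = rho * s + (1 - rho) * g**2
    #   step  = g * sqrt(d + eps) / sqrt(s + eps)
    #   d_new = rho * d + (1 - rho) * step**2
    #   p_new = p - lr * (step + wd * p)   # wd term only if p.apply_reg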
def __init__(self, inputs, loss, grads, params, extra_updates,
additional_outputs=None):
super(AdaDelta, self).__init__(inputs, loss, grads, params,
additional_outputs)
self.squared_accum = self.alloc_shared_grads("_sq") # last directions update
self.delta_accum = self.alloc_shared_grads("_d") # last directions update
epsilon = 1e-5
updates = []
for g, s, d, p in zip(self.grads, self.squared_accum,
self.delta_accum, self.params):
new_s = self.global_mom * s + (1.0 - self.global_mom) * T.square(g)
direction = (g * T.sqrt(d + epsilon) / T.sqrt(s + epsilon))
new_d = self.global_mom * d + (1 - self.global_mom) * T.square(direction)
if p.apply_reg:
if p.apply_reg > 1:
multiplier = graphutils.as_floatX(p.apply_reg)
new_p = p - self.global_lr * \
(direction + self.global_weight_decay * p * multiplier)
else:
new_p = p - self.global_lr * \
(direction + self.global_weight_decay * p)
else:
new_p = p - self.global_lr * direction
updates.append((s, new_s))
updates.append((d, new_d))
updates.append((p, new_p))
updates.extend(extra_updates)
updates.extend(self.get_rotational_updates())
self.step = graphutils.make_func(self.input, self.output,
updates=updates, name='AdaDelta step')
    def repair_fuckup(self):
super(AdaDelta, self).repair_fuckup()
self.clear_last_dir(self.squared_accum)
self.clear_last_dir(self.delta_accum)
class Adam(Optimiser):
    # Like AdaGrad with a windowed squared_accum, plus momentum and a bias
    # correction for the initial phase (t).
    # The normalisation of Adam and AdaGrad (and RMSProp) does not damp but
    # rather exaggerates sudden steep gradients (their squared_accum is small
    # while the current gradient is large).
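    # Sketch of the step assembled below (m: first moment, s: second moment,
    # global_mom takes the role of beta1, eps = 1e-5):
    #   m_new  = beta1 * m + (1 - beta1) * g
    #   s_new  = beta2 * s + (1 - beta2) * g**2
    #   factor = sqrt(1 - beta2**t) / (1 - beta1**t)   # bias correction
    #   p_new  = p - lr * (factor * m_new / sqrt(s_new + eps) + wd * p)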
def __init__(self, inputs, loss, grads, params, extra_updates,
additional_outputs=None):
super(Adam, self).__init__(inputs, loss, grads, params,
additional_outputs)
self.squared_accum = self.alloc_shared_grads("_sq") # last directions update
self.momentum = self.alloc_shared_grads("_m") # last directions update
epsilon = 1e-5
# self.beta1 = variables.VariableParam(value=0.9, name='beta1',
# dtype=graphutils.floatX)
self.beta2 = variables.VariableParam(value=0.999, name='beta2',
dtype=graphutils.floatX)
#self.meta_params['beta1'] = self.beta1
self.meta_params['beta2'] = self.beta2
        t_old = variables.VariableParam(value=0.0, name='t',
                                        dtype=graphutils.floatX)
updates = []
t = 1 + t_old
updates.append((t_old, t))
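        # Standard Adam bias correction, compensating for m and s being
        # initialised at zero (global_mom takes the role of beta1)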
factor = T.sqrt(1-self.beta2**t)/(1-self.global_mom**t)
for g, s, m, p in zip(self.grads, self.squared_accum,
self.momentum, self.params):
new_m = self.global_mom * m + (1.0 - self.global_mom) * g
new_s = self.beta2 * s + (1.0 - self.beta2) * T.square(g)
direction = factor * new_m / T.sqrt(new_s + epsilon)
if p.apply_reg:
if p.apply_reg > 1:
multiplier = graphutils.as_floatX(p.apply_reg)
new_p = p - self.global_lr * \
(direction + self.global_weight_decay * p * multiplier)
else:
new_p = p - self.global_lr * \
(direction + self.global_weight_decay * p)
else:
new_p = p - self.global_lr * direction
updates.append((s, new_s))
updates.append((m, new_m))
updates.append((p, new_p))
updates.extend(extra_updates)
updates.extend(self.get_rotational_updates())
self.step = graphutils.make_func(self.input, self.output,
updates=updates, name='Adam step')
    def repair_fuckup(self):
super(Adam, self).repair_fuckup()
self.clear_last_dir(self.squared_accum)
self.clear_last_dir(self.momentum)
class CG(Optimiser):
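    # Conjugate-gradient optimiser: placeholder, not implemented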
pass