Source code for elektronn2.neuromancer.neural

# -*- coding: utf-8 -*-
# ELEKTRONN2 Toolkit
# Copyright (c) 2015 Marius Killinger and Philipp J. Schubert
# All rights reserved

from __future__ import absolute_import, division, print_function
from builtins import filter, hex, input, int, map, next, oct, pow, range, super, zip

import logging
import time
from functools import reduce

import numpy as np
import theano
import theano.tensor as T

from ..config import config
from . import computations
from .variables import VariableWeight, ConstantParam, VariableParam
from .graphutils import floatX, TaggedShape, as_floatX
from .node_basic import Node, Concat

logger = logging.getLogger('elektronn2log')

__all__ = ['Perceptron', 'Conv', 'UpConv', 'Crop', 'LSTM',
           'FragmentsToDense', 'Pool', 'Dot', 'FaithlessMerge',
           'GRU', 'LRN', 'ImageAlign', 'UpConvMerge']

################################################################################

### TODO: Adapt the gradnet handling from the Conv layer in the other layers? Maybe not now...

class NeuralLayer(Node):
    """
    Dummy class to add parameter initialisation methods for neural layers.
    """
    def _register_param(self, param, shape, name, init_kwargs=None,
                       apply_train=False, apply_reg=False):
        """
        Create a parameter, set it as an attribute and add it to self.params
        if it is not shared from another layer.

        Parameters
        ----------
        param: None or np.ndarray or T.Variable or list
            Possible forms of ``param``:
            * Passing ``None`` creates a new parameter with default
              initialisation.
            * Passing a np.ndarray creates a new parameter initialised with
              the values of the array.
            * A shared parameter is created by passing a T.Variable as
              ``param``.
            * A constant parameter is created by passing [np.ndarray, 'const']
              as ``param``.
              This parameter cannot be changed (no ``set_value``) but makes
              the compiled function faster.
        shape: tuple
            Shape of the new parameter (VariableWeight).
        name: str
            Parameter name.
        init_kwargs
            kwargs for utils.initialisation.
        apply_train: bool
            Train flag of the new parameter (VariableWeight).
        apply_reg: bool
            Regularisation flag of the new parameter (VariableWeight).
        """
        add_to_params = True
        if self.name=='':
            p_name = '<%s%s>'%(name, tuple(shape))
        else:
            p_name = '<%s_%s%s>'%(self.name, name, tuple(shape))
        # create a new trainable parameter by initialisation
        if param is None:
            p = VariableWeight(shape=shape,
                               init_kwargs=init_kwargs,
                               name=p_name,
                               apply_train=apply_train,
                               apply_reg=apply_reg,)

        # create new trainable from values
        elif isinstance(param, np.ndarray):
            if param.shape!=tuple(shape):
                if not (param.ndim==0 and shape==(1,)):
                    raise ValueError("Shape mismatch. Required %s, given %s"\
                                 %(shape, param.shape))
            p = VariableWeight(value=param,
                               name=p_name,
                               apply_train=apply_train,
                               apply_reg=apply_reg,
                               dtype=floatX)

        # share a variable from elsewhere, not trainable
        elif isinstance(param, T.Variable): # (elektronn2.tensor.variables are T.Variable)
            try:
                sh = param.get_value().shape
                if sh!=tuple(shape):
                    raise ValueError("Shape mismatch. Required %s, given %s" \
                                     % (shape, param.shape))
            except AttributeError:
                logger.warning("Could not check correct shape of given weight %s, "
                               "make sure it has shape %s" %(param, shape))
            p = param
            add_to_params = False

        # create a constant (or explicitly trainable) parameter from values
        elif isinstance(param, (list, tuple)):
            fail = not isinstance(param[0], np.ndarray)
            if not fail:
                if param[0].shape!=tuple(shape):
                    raise ValueError("Shape mismatch. Required %s, given %s"\
                                     %(shape, param[0].shape))
                if param[1] == 'const':
                    value = as_floatX(param[0])
                    p = ConstantParam(value, p_name)
                elif param[1] == 'trainable':
                    value = as_floatX(param[0])
                    p = VariableWeight(value=value,
                                       name=p_name,
                                       apply_train=True,
                                       apply_reg=apply_reg)
                else:
                    fail = True

            if fail:
                raise ValueError("If a parameter is passed as a list, the "
                                 "first entry must contain the parameter "
                                 "value (np.ndarray) and the second entry "
                                 "must be either 'const' or 'trainable' "
                                 "to indicate whether this param is "
                                 "trainable. Got [%s, %s]" \
                                 %(type(param[0]), param[1]))
        else:
            raise ValueError("Parameter %s must be either <np.ndarray>, "
                             "<theano.TensorVariable>, a tuple or None"
                             "(to create new param)" %(name,))

        setattr(self, name, p)
        if add_to_params:
            self.params[name] = p
        else:
            logger.debug("Sharing theano variable %s. This parameter is not added to self.params" %(p,))


    def _setup_params(self, w_sh, w, b, gamma, mean, std, dropout_rate,
                      pool_shape=None, gradnet_rate=None):
        """
        Register each parameter, choose appropriate initialisation.
        """
        # Dot/Conv/Bias Parameters #############################################
        self.w = None

        # TODO: Pass w_init mode from layer to setup_params
        if config.use_ortho_init or isinstance(self, GRU) or isinstance(self, LSTM):
            w_init = dict(scale='glorot', mode='ortho', pool=pool_shape,
                          spatial_axes=self.spatial_axes)
        else:
            w_init = dict(scale='glorot', mode='normal', pool=pool_shape,
                          spatial_axes=self.spatial_axes)

        self._register_param(w, w_sh, 'w', init_kwargs=w_init,
                             apply_train=True, apply_reg=True)

        activation_func = self.activation_func
        n_f = self.n_f
        self.b = None
        if isinstance(self, GRU):
            b_sh=(3 * n_f, )
        elif isinstance(self, LSTM):
            b_sh = (4 * n_f, )
        else:
            b_sh=(n_f,)
        if activation_func=='relu' or activation_func.startswith("maxout"):
            norm = 1.0
            if len(w_sh) > 2:
                fov = 1
                for i in self.spatial_axes:
                    fov = fov * w_sh[i]
                norm = fov

            b_init=dict(scale=1.0/norm, mode='const')

        elif activation_func=='sigmoid':
            b_init=dict(scale=0.5, mode='const')
        elif activation_func=='prelu':
            norm = 1.0
            if len(w_sh) > 2:
                fov = 1
                for i in self.spatial_axes:
                    fov = fov * w_sh[i]
                norm = fov

            b_init=dict(scale=1.0/norm, mode='prelu')
            if isinstance(self, GRU):
                 b_sh=(3 * n_f, 2)
            elif isinstance(self, LSTM):
                b_sh = (4 * n_f, 2)
            else:
                b_sh=(n_f, 2)
        else: # all other activations
            b_init=dict(scale=1e-6, mode='fix-uni')
        self._register_param(b, b_sh, 'b', init_kwargs=b_init,
                            apply_train=True, apply_reg=False)
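        # Worked example (illustrative): for a 2D Conv with
        # w_sh = (n_f, n_in, 3, 3) and activation_func='relu', the spatial
        # filter size is fov = 3*3 = 9, so each bias entry is initialised to
        # the constant 1.0/9; for a Perceptron (len(w_sh) == 2) norm stays 1.0.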

        # Batch Normalisation ##################################################
        batch_normalisation = self.batch_normalisation
        if batch_normalisation in ['train', 'fadeout']:
            # mean and std are created as TensorVariables in _calc_output
            self.gamma = None
            sh = (n_f,)
            g_init =dict(scale=1.0, mode='const')
            self._register_param(gamma, sh, 'gamma', init_kwargs=g_init,
                                apply_train=True, apply_reg=3.0) ###TODO maybe even stronger reg for this?
            if mean is not None or std is not None:
                raise ValueError("Cannot pass mean and std for training, they"
                                 "are computed in the theano graph.")

            # create mean and std for training to accumulate running avgs
            self.mean = None
            m_init =dict(scale=0.0, mode='const')
            self._register_param(None, sh, 'mean', init_kwargs=m_init)

            self.std = None
            s_init =dict(scale=1.0, mode='const')
            self._register_param(None, sh, 'std', init_kwargs=s_init)

        elif batch_normalisation=='predict':
            sh = (n_f,)
            self.gamma = None
            g_init =dict(scale=1.0, mode='const')
            self._register_param(gamma, sh, 'gamma', init_kwargs=g_init)

            self.mean = None
            m_init =dict(scale=0.0, mode='const')
            self._register_param(mean, sh, 'mean', init_kwargs=m_init)

            self.std = None
            s_init =dict(scale=1.0, mode='const')
            self._register_param(std, sh, 'std', init_kwargs=s_init)
        else:
            if batch_normalisation is not False:
                raise ValueError("Unknown value %s for batchnormalisation" %batch_normalisation)

        # Dropout ##############################################################
        self.dropout_rate = None
        if dropout_rate:
            value = as_floatX(dropout_rate)
            self._register_param(value, (1,), 'dropout_rate')


        # GradNet ##############################################################
        self.gradnet_rate = None
        if gradnet_rate:
            value = as_floatX(gradnet_rate)
            self._register_param(value, (1,), 'gradnet_rate')

###############################################################################

[docs]class Perceptron(NeuralLayer): """ Perceptron Layer. Parameters ---------- parent: Node or list of Node The input node(s). n_f: int Number of filters (nodes) in layer. activation_func: str Activation function name. flatten: bool batch_normalisation: str or None Batch normalisation mode. Can be False (inactive), "train" or "fadeout". dropout_rate: float Dropout rate (probability that a node drops out in a training step). name: str Perceptron name. print_repr: bool Whether to print the node representation upon initialisation. w: np.ndarray or T.TensorVariable Weight matrix. If this is a np.ndarray, its values are used to initialise a shared variable for this layer. If it is a T.TensorVariable, it is directly used (weight sharing with the layer which this variable comes from). b: np.ndarray or T.TensorVariable Bias vector. If this is a np.ndarray, its values are used to initialise a shared variable for this layer. If it is a T.TensorVariable, it is directly used (weight sharing with the layer which this variable comes from). gamma (For batch normalisation) Initializes gamma parameter. mean (For batch normalisation) Initializes mean parameter. std (For batch normalisation) Initializes std parameter. gradnet_mode """ # TODO: Write docs on batch normalisation modes. # TODO: gradnet_mode seems to be unused. Should it be removed? def __init__(self, parent, n_f, activation_func='relu', flatten=False, batch_normalisation=False, dropout_rate=0, name="dot", print_repr=True, w=None, b=None, gamma=None, mean=None, std=None, gradnet_mode=None): super(Perceptron, self).__init__(parent, name, print_repr) self.n_f = n_f self.activation_func = activation_func self.batch_normalisation = batch_normalisation self.gradnet_mode = gradnet_mode self.axis = parent.shape.tag2index('f') #retrieve feature shape's index self.flatten = flatten self.spatial_axes = parent.shape.spatial_axes if flatten: n_in = parent.shape.stripbatch_prod else: n_in = parent.shape['f'] #retrieve feature shape w_sh = (n_in, n_f) self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate) def _make_output(self): """ Computation of Theano output. 
""" if self.flatten: if self.axis is not 1: raise NotImplementedError("Cannot flatten tensor for " "PerceptronLayer when batchsize is " "not on first axis") input_tensor = self.parent.output.flatten(2) pattern = ['x', 0] else: input_tensor = self.parent.output pattern = ['x' for i in input_tensor.shape] pattern[self.axis] = 0 activation_func = self.activation_func if activation_func.startswith("maxout"): r=int(activation_func.split(" ")[1]) assert r>=2 self.n_f /= r if activation_func=='prelu': b = self.b[:,0].dimshuffle(pattern) b1 = self.b[:,1].dimshuffle(pattern) else: b = self.b.dimshuffle(pattern) b1 = None lin_output = computations.dot(input_tensor, self.w, self.axis) if self.batch_normalisation in ['train', 'fadeout']: mean = computations.apply_except_axis( lin_output,self.axis, T.mean).dimshuffle(pattern) std = computations.apply_except_axis( lin_output,self.axis, T.std).dimshuffle(pattern) + 1e-6 gamma = self.gamma.dimshuffle(pattern) if self.batch_normalisation=='fadeout': logger.warning("Batch Normalisation mode 'fadeout' does not " "work for less than 50%%...") mean = self.gradnet_rate * mean std = self.gradnet_rate * std + (1-self.gradnet_rate) * 1.0 gamma = self.gradnet_rate * gamma self.mean.updates = (self.mean, 0.9995 * self.mean + 0.0005 * T.extra_ops.squeeze(mean)) self.std.updates = (self.std, 0.9995 * self.std + 0.0005 * T.extra_ops.squeeze(std)) elif self.batch_normalisation=='predict': mean = self.mean.dimshuffle(pattern) std = self.std.dimshuffle(pattern) gamma = self.gamma.dimshuffle(pattern) else: mean = 0 std = 1 gamma = 1 lin_output = (gamma / std) * lin_output + b - (gamma * mean / std) lin_output = computations.apply_activation(lin_output, activation_func, b1) if self.dropout_rate: rng = T.shared_randomstreams.RandomStreams(int(time.time())) p = 1 - self.dropout_rate dropout_gate = rng.binomial(size=(self.n_f,), n=1, p=p, dtype=theano.config.floatX) dropout_gate *= 1.0 / p lin_output = lin_output * dropout_gate.dimshuffle(pattern) self.output = lin_output def _calc_shape(self): """ Calculate shape from parent shape and n_f and set it as self.shape. """ sh = self.parent.shape if self.flatten: self.shape = TaggedShape((sh['b'], self.n_f), 'b,f') else: self.shape = sh.updateshape('f', self.n_f) def _calc_comp_cost(self): """ Calculate abstract computational cost from parent shape and n_f and set it as self.computational_cost. """ n = self.parent.shape.stripnone_prod self.computational_cost = n * self.n_f
    def make_dual(self, parent, share_w=False, **kwargs):
        """
        Create the inverse of this ``Perceptron``.

        Most options are the same as for the layer itself.
        If ``kwargs`` are not specified, the values of the primal layer are
        re-used and new parameters are created.

        Parameters
        ----------
        parent: Node
            The input node.
        share_w: bool
            If the weights (``w``) should be shared from the primal layer.
        kwargs: dict
            kwargs that are passed through to the constructor of the inverted
            Perceptron (see signature of ``Perceptron``). ``n_f`` is copied
            from the existing node on which ``make_dual`` is called.
            Every other parameter can be changed from the original
            ``Perceptron``'s defaults by specifying it in ``kwargs``.

        Returns
        -------
        Perceptron
            The inverted perceptron layer.
        """
        if self.flatten:
            raise NotImplementedError("Cannot make dual Layer for flattened "
                                      "Perceptron Layer.")

        dropout_rate = 0.0 if not self.dropout_rate else self.dropout_rate.get_value()
        defaults = dict(activation_func=self.activation_func,
                        batch_normalisation=self.batch_normalisation,
                        dropout_rate=dropout_rate,
                        name=self.name+'.T',
                        print_repr=self._print_repr,
                        w=None, b=None, gamma=None, mean=None, std=None)
        defaults.update(kwargs)
        kwargs = defaults
        if share_w:
            if kwargs['w'] is not None:
                logger.debug("Ignoring passed w because w is shared from primal Layer.")
            kwargs['w'] = self.w.T

        n_f = self.parent.shape['f'] # This is the output of the dual Layer
        if self.n_f != parent.shape['f']: # input of dual Layer
            raise ValueError("Cannot make dual Layer of:\n"
                             "%s \n"
                             "with input: %s! \n"
                             "The output shape of the input for the dual Layer "
                             "must match the input shape of the primal Layer."\
                             %(self, parent))

        return Perceptron(parent, n_f, **kwargs)
def __repr__(self): s = super(NeuralLayer, self).__repr__() s += "\n" s += " n_f=%i, " %(self.n_f,) s += "act='%s', " %(self.activation_func,) if self.flatten: s += "input was flattened, " if self.dropout_rate: s += "Dropout rate=%.1f, "%(self.dropout_rate.get_value()) if self.batch_normalisation: s += "BN in '%s' mode "%(self.batch_normalisation,) return s
Dot = Perceptron

###############################################################################
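# Illustrative usage sketch for ``Perceptron`` / ``Dot`` (not part of the
# original module). It assumes an input Node with tagged shape 'b,f', e.g.
# created via ``neuromancer.Input`` (assumed helper, as used in the ELEKTRONN2
# examples):
#
#   >>> inp = neuromancer.Input((1, 20), 'b,f', name='raw')
#   >>> hid = Perceptron(inp, n_f=50, activation_func='relu')
#   >>> rec = hid.make_dual(hid, share_w=True)  # 50 -> 20 features, tied weights w.T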
[docs]class Conv(Perceptron): """ Convolutional layer with subsequent pooling. Examples -------- Examples for constructing convolutional neural networks can be found in examples/3d_cnn.py and examples/numa_mnist.py. Parameters ---------- parent: Node The input node. n_f: int Number of features. filter_shape: tuple Shape of the convolution filter kernels. pool_shape: tuple Size/shape of pooling after the convolution. conv_mode: str Possible values: * "valid": only apply filter to complete patches of the image. Generates output of shape: image_shape -filter_shape + 1. * "full" zero-pads image to multiple of filter shape to generate output of shape: image_shape + filter_shape - 1. activation_func: str Activation function name. mfp: bool Whether to apply Max-Fragment-Pooling in this Layer. batch_normalisation: str or None Batch normalisation mode. Can be False (inactive), "train" or "fadeout". dropout_rate: float Dropout rate (probability that a node drops out in a training step). name: str Layer name. print_repr: bool Whether to print the node representation upon initialisation. w: np.ndarray or T.TensorVariable Weight matrix. If this is a np.ndarray, its values are used to initialise a shared variable for this layer. If it is a T.TensorVariable, it is directly used (weight sharing with the layer which this variable comes from). b: np.ndarray or T.TensorVariable Bias vector. If this is a np.ndarray, its values are used to initialise a shared variable for this layer. If it is a T.TensorVariable, it is directly used (weight sharing with the layer which this variable comes from). gamma (For batch normalisation) Initializes gamma parameter. mean (For batch normalisation) Initializes mean parameter. std (For batch normalisation) Initializes std parameter. gradnet_mode """ def __init__(self, parent, n_f, filter_shape, pool_shape, conv_mode='valid', activation_func='relu', mfp=False, batch_normalisation=False, dropout_rate=0, name="conv", print_repr=True, w=None, b=None, gamma=None, mean=None, std=None, gradnet_mode=None): super(Perceptron, self).__init__(parent, name, print_repr) self.n_f = n_f self.filter_shape = filter_shape self.pool_shape = pool_shape self.conv_mode = conv_mode self.activation_func = activation_func self.batch_normalisation = batch_normalisation self.gradnet_mode = gradnet_mode self.mfp = mfp self.strides = parent.shape.strides self.mfp_offsets = parent.shape.mfp_offsets self.axis = parent.shape.tag2index('f') #retrieve feature shape's index self.axis_order = None self.spatial_axes = self.parent.shape.spatial_axes conv_dim = len(self.spatial_axes) x_dim = len(self.parent.shape) if len(self.spatial_axes)!=len(filter_shape) or \ len(filter_shape)!=len(pool_shape): raise ValueError("The filter_shape dimensionality (%i), the number " "of spatial dimensions in the input (%i) and " "the dimensionality of pool_shape (%i) differ! 
" "Use filtersize 1 on axes which should not be " "convolved."\ %(len(filter_shape), conv_dim, len(pool_shape))) n_in = parent.shape['f'] #retrieve feature shape fail = False if conv_dim==1: if x_dim!=3 or self.spatial_axes!=[2]: fail = True w_sh = [n_f, n_in] + list(filter_shape) elif conv_dim==2: if x_dim!=4 or self.spatial_axes!=[2,3]: fail = True w_sh = [n_f, n_in] + list(filter_shape) elif conv_dim==3: if x_dim!=5: fail = True if self.spatial_axes==[2,3,4]: self.axis_order = 'dnn' w_sh = [n_f, n_in] + list(filter_shape) elif self.spatial_axes==[1,3,4]: self.axis_order = 'theano' w_sh = [n_f, filter_shape[0], n_in] + list(filter_shape[1:]) else: fail = True if fail: raise NotImplementedError("Cannot convolve non-standard shapes / axis orders," "implement reshaping before conv" "and re-reshaping afer!") self.conv_dim = conv_dim self.w_sh = w_sh gradnet_rate = 1.0 if gradnet_mode else None self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate, pool_shape, gradnet_rate) def _make_output(self): """ Computation of Theano output. """ input_tensor = self.parent.output input_shape = list(self.parent.shape) pattern = ['x' for i in input_tensor.shape] pattern[self.axis] = 0 activation_func = self.activation_func if activation_func.startswith("maxout"): r=int(activation_func.split(" ")[1]) assert r>=2 self.filter_shape /= r if activation_func=='prelu': b = self.b[:,0].dimshuffle(pattern) b1 = self.b[:,1].dimshuffle(pattern) else: b = self.b.dimshuffle(pattern) b1 = None lin_output = computations.conv(input_tensor, self.w, self.axis_order, border_mode=self.conv_mode, x_shape=input_shape, w_shape=self.w_sh) if self.mfp: if self.input_nodes[0].shape['b']!=1: raise ValueError("For MFP the batchsize of the raw image input must be 1") lin_output, offsets_new, strides_new = computations.fragmentpool(lin_output, self.pool_shape, self.mfp_offsets, self.strides, self.spatial_axes) self.mfp_offsets = offsets_new self.strides = strides_new else: lin_output = computations.pooling(lin_output, self.pool_shape, self.spatial_axes) self.strides = np.multiply(self.pool_shape, self.strides) if self.batch_normalisation in ['train', 'fadeout']: mean = computations.apply_except_axis( lin_output,self.axis, T.mean).dimshuffle(pattern) std = computations.apply_except_axis( lin_output,self.axis, T.std).dimshuffle(pattern) + 1e-6 gamma = self.gamma.dimshuffle(pattern) if self.batch_normalisation=='fadeout': logger.warning("Batch Normalisation mode 'fadeout' does not " "work for less than 50%%...") mean = self.gradnet_rate * mean std = self.gradnet_rate * std + (1-self.gradnet_rate) * 1.0 gamma = self.gradnet_rate * gamma self.mean.updates = (self.mean, 0.9995 * self.mean + 0.0005 * T.extra_ops.squeeze(mean)) self.std.updates = (self.std, 0.9995 * self.std + 0.0005 * T.extra_ops.squeeze(std)) elif self.batch_normalisation=='predict': mean = self.mean.dimshuffle(pattern) std = self.std.dimshuffle(pattern) gamma = self.gamma.dimshuffle(pattern) else: mean = 0 std = 1 gamma = 1 lin_output = (gamma / std) * lin_output + b - (gamma * mean / std) lin_output = computations.apply_activation(lin_output, activation_func, b1) if self.dropout_rate: rng = T.shared_randomstreams.RandomStreams(int(time.time())) p = 1 - self.dropout_rate dropout_gate = rng.binomial(size=lin_output.shape, n=1, p=p, dtype=theano.config.floatX) dropout_gate *= 1.0 / p lin_output *= dropout_gate #.dimshuffle(('x', 0)) self.output = lin_output def _calc_shape(self): """ Calculate and set self.shape. 
""" sh = self.parent.shape for j,(i,f,p) in enumerate(zip(self.spatial_axes, self.filter_shape, self.pool_shape)): if self.conv_mode=='valid': k = 1 - f elif self.conv_mode=='full': k = f - 1 elif self.conv_mode=='same': k = 0 s = (sh[i] + k)//p if self.mfp: if (sh[i] + k - p + 1)%p!=0: raise ValueError("Cannot pool spatial axis '%s' of length %i " "by factor %i, after convolving with" "kernel of size %i and using MFP."\ %(sh.tags[i], sh[i], p, f)) else: if (sh[i] + k)%p!=0: raise ValueError("Cannot pool spatial axis '%s' of length %i " "by factor %i, after convolving with" "kernel of size %i."\ %(sh.tags[i], sh[i], p, f)) sh = sh.updateshape(i, s) if sh.fov[j]>0: fov = sh.fov[j] + (f+p-2) * sh.strides[j] else: fov = -1 sh = sh.updatefov(j, fov) if self.mfp: sh = sh.updatemfp_offsets(self.mfp_offsets) sh = sh.updateshape('b', np.prod(self.pool_shape), mode='mult') sh = sh.updatestrides(self.strides) sh = sh.updateshape('f', self.n_f) self.shape = sh def _calc_comp_cost(self): """ Calculate and set self.computational_cost. """ sh = self.parent.shape n_position = 1 for i,f,p in zip(self.spatial_axes, self.filter_shape, self.pool_shape): s = 1 - f if self.conv_mode=='valid' else f -1 n_position *= sh[i] + s b = 1 if sh['b'] is None else sh['b'] self.computational_cost = np.product(self.w_sh) * n_position * b
    def make_dual(self, parent, share_w=False, mfp=False, **kwargs):
        """
        Create the inverse (``UpConv``) of this ``Conv`` node.

        Most options are the same as for the layer itself.
        If ``kwargs`` are not specified, the values of the primal layer are
        re-used and new parameters are created.

        Parameters
        ----------
        parent: Node
            The input node.
        share_w: bool
            If the weights (``w``) should be shared from the primal layer.
        mfp: bool
            If max-fragment-pooling is used.
        kwargs: dict
            kwargs that are passed through to the new ``UpConv`` node
            (see signature of ``UpConv``). ``n_f`` and ``pool_shape`` are
            copied from the existing node on which ``make_dual`` is called.
            Every other parameter can be changed from the original ``Conv``'s
            defaults by specifying it in ``kwargs``.

        Returns
        -------
        UpConv
            The inverted conv layer (as an ``UpConv`` node).
        """
        if mfp:
            parent = FragmentsToDense(parent, print_repr=False)

        dropout_rate = 0.0 if not self.dropout_rate else self.dropout_rate.get_value()
        defaults = dict(conv_mode='valid',
                        activation_func=self.activation_func,
                        batch_normalisation=self.batch_normalisation,
                        dropout_rate=dropout_rate,
                        name=self.name+'.T',
                        print_repr=self._print_repr,
                        w=None, b=None, gamma=None, mean=None, std=None)
        defaults.update(kwargs)
        kwargs = defaults
        if share_w:
            if kwargs['w'] is not None:
                logger.debug("Ignoring passed w because w is shared from primal Layer.")
            w = self.w
            # Exchange n_in and n_f
            swap = (0,2) if (self.conv_dim==3 and self.axis_order=='theano') else (0,1)
            w = T.swapaxes(w, *swap)
            kwargs['w'] = w

        n_f = self.parent.shape['f'] # This is the output of the dual Layer
        if self.w_sh[0] != parent.shape['f']: # input of dual Layer
            raise ValueError("Cannot make dual Layer of:\n"
                             "%s \n"
                             "with input: %s! \n"
                             "The output shape of the input for the dual Layer "
                             "must match the input shape of the primal Layer."\
                             %(self, parent))

        return UpConv(parent, n_f, self.pool_shape, **kwargs)
def __repr__(self): s = super(NeuralLayer, self).__repr__() s += "\n" s += " n_f=%i, " %(self.n_f,) s += "%id conv, kernel=%s, pool=%s, "\ %(self.conv_dim, self.filter_shape, self.pool_shape) s += "act='%s', " %(self.activation_func,) if self.dropout_rate: s += "Dropout rate=%.1f, "%(self.dropout_rate.get_value()) if self.batch_normalisation: s += "BN in '%s' mode "%(self.batch_normalisation,) if self.mfp: s += "MFP active, " return s
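# Illustrative usage sketch for ``Conv`` (not part of the original module),
# assuming a 2D image input Node with tagged shape 'b,f,x,y' created via the
# assumed ``neuromancer.Input`` helper:
#
#   >>> img = neuromancer.Input((1, 1, 100, 100), 'b,f,x,y', name='img')
#   >>> c1 = Conv(img, n_f=16, filter_shape=(5, 5), pool_shape=(2, 2))  # 'valid' conv + 2x2 max-pool -> 48x48
#   >>> c2 = Conv(c1, n_f=32, filter_shape=(3, 3), pool_shape=(2, 2))   # -> 23x23
#   >>> up = c2.make_dual(c2)  # decoder-style UpConv: 32 -> 16 features, 2x upsampling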
###############################################################################
[docs]class FragmentsToDense(Node): def __init__(self, parent, name="to_dense", print_repr=True): super(FragmentsToDense, self).__init__(parent, name, print_repr) def _make_output(self): """ Computation of Theano output. """ fragments = self.parent.output sh = self.parent.shape if sh['b']!=len(sh.mfp_offsets) or sh['b']!=np.prod(sh.strides): raise ValueError("Need %i fragments on the batch axis. " "Is MFP active at all?" %np.prod(sh.strides)) self.output = computations.fragments2dense(fragments, sh.mfp_offsets, sh.strides, sh.spatial_axes) def _calc_shape(self): """ Calculate and set self.shape. """ sh = self.parent.shape for ax, st in zip(sh.spatial_axes, sh.strides): sh = sh.updateshape(ax, st, mode='mult') sh = sh.updateshape('b', 1) new_strides = np.ones(len(sh.spatial_axes), np.int) new_offsets = np.zeros((1,len(sh.spatial_axes)), np.int) self.shape = TaggedShape(sh.shape, sh.tags, new_strides, new_offsets, sh.fov) def _calc_comp_cost(self): """ Calculate and set self.computational_cost. For this Node type this is hard-coded to 0. """ self.computational_cost = 0
###############################################################################
###############################################################################
[docs]class UpConv(Conv): """ Upconvolution layer. E.g. pooling + upconv with p=3: x x x x x x x x x before pooling (not in this layer) \|/ \|/ \|/ pooling (not in this layer) x x x input to this layer 0 0 x 0 0 x 0 0 x 0 0 unpooling + padding (done in this layer) /|\ /|\ /|\ conv on unpooled (done in this layer) y y y y y y y y y result of this layer Parameters ---------- parent: Node The input node. n_f: int Number of filters (nodes) in layer. pool_shape: tuple Size/shape of pooling. activation_func: str Activation function name. identity_init: bool Initialise weights to result in pixel repetition upsampling batch_normalisation: str or None Batch normalisation mode. Can be False (inactive), "train" or "fadeout". dropout_rate: float Dropout rate (probability that a node drops out in a training step). name: str Layer name. print_repr: bool Whether to print the node representation upon initialisation. w: np.ndarray or T.TensorVariable Weight matrix. If this is a np.ndarray, its values are used to initialise a shared variable for this layer. If it is a T.TensorVariable, it is directly used (weight sharing with the layer which this variable comes from). b: np.ndarray or T.TensorVariable Bias vector. If this is a np.ndarray, its values are used to initialise a shared variable for this layer. If it is a T.TensorVariable, it is directly used (weight sharing with the layer which this variable comes from). gamma (For batch normalisation) Initializes gamma parameter. mean (For batch normalisation) Initializes mean parameter. std (For batch normalisation) Initializes std parameter. gradnet_mode """ # TODO: The explanation at the top of the docstring is interpreted by sphinx and looks weird (see http://elektronn2.readthedocs.io/en/latest/source/elektronn2.neuromancer.html#elektronn2.neuromancer.neural.UpConv). def __init__(self, parent, n_f, pool_shape, activation_func='relu', identity_init=True, batch_normalisation=False, dropout_rate=0, name="upconv", print_repr=True, w=None, b=None, gamma=None, mean=None, std=None, gradnet_mode=None): filter_shape = pool_shape super(UpConv, self).__init__(parent, n_f, filter_shape, pool_shape, 'valid', activation_func, mfp=False, batch_normalisation=batch_normalisation, dropout_rate=dropout_rate, name=name, print_repr=print_repr, w=w, b=b, gamma=gamma, mean=mean, std=std, gradnet_mode=gradnet_mode) if identity_init: try: w_val = self.w.get_value() * 0.1 s = np.minimum(w_val.shape[0], w_val.shape[1]) s = np.arange(s) w_val[s,s] = 1.0 self.w.set_value(w_val) self.b.set_value(self.b.get_value()*0.0) except: logger.warn("identity_init failed") def _make_output(self): """ Computation of Theano output. 
""" input_tensor = self.parent.output input_shape = list(self.parent.shape) pattern = ['x' for i in input_tensor.shape] pattern[self.axis] = 0 activation_func = self.activation_func if activation_func.startswith("maxout"): r=int(activation_func.split(" ")[1]) assert r>=2 self.filter_shape /= r if activation_func=='prelu': b = self.b[:,0].dimshuffle(pattern) b1 = self.b[:,1].dimshuffle(pattern) else: b = self.b.dimshuffle(pattern) b1 = None spax = self.spatial_axes pool = np.array(self.pool_shape) input_shape_up = np.array(input_shape) if len(spax)==3 and not computations.dnn_avail: unpooled = computations.unpooling(input_tensor, self.pool_shape, self.spatial_axes) self._debug_outputs.append(unpooled) input_shape_up[spax] = input_shape_up[spax] * pool + pool - 1 input_shape_up = list(input_shape_up) lin_output = computations.conv(unpooled, self.w, self.axis_order, border_mode=self.conv_mode, x_shape=input_shape_up, w_shape=self.w_sh) else: input_shape_up[spax] = input_shape_up[spax] * pool input_shape_up = list(input_shape_up) w = T.swapaxes(self.w, 0, 1) w_sh = list(self.w_sh) w_sh[0], w_sh[1] = w_sh[1], w_sh[0] lin_output = computations.upconv(input_tensor, w, self.pool_shape, x_shape=input_shape_up, w_shape=w_sh, axis_order='dnn') if self.batch_normalisation in ['train', 'fadeout']: mean = computations.apply_except_axis( lin_output,self.axis, T.mean).dimshuffle(pattern) std = computations.apply_except_axis( lin_output,self.axis, T.std).dimshuffle(pattern) + 1e-6 gamma = self.gamma.dimshuffle(pattern) if self.batch_normalisation=='fadeout': logger.warning("Batch Normalisation mode 'fadeout' does not " "work for less than 50%%...") mean = self.gradnet_rate * mean std = self.gradnet_rate * std + (1-self.gradnet_rate) * 1.0 gamma = self.gradnet_rate * gamma self.mean.updates = (self.mean, 0.9995 * self.mean + 0.0005 * T.extra_ops.squeeze(mean)) self.std.updates = (self.std, 0.9995 * self.std + 0.0005 * T.extra_ops.squeeze(std)) elif self.batch_normalisation=='predict': mean = self.mean.dimshuffle(pattern) std = self.std.dimshuffle(pattern) gamma = self.gamma.dimshuffle(pattern) else: mean = 0 std = 1 gamma = 1 lin_output = (gamma / std) * lin_output + b - (gamma * mean / std) lin_output = computations.apply_activation(lin_output, activation_func, b1) if self.dropout_rate: rng = T.shared_randomstreams.RandomStreams(int(time.time())) p = 1 - self.dropout_rate dropout_gate = rng.binomial(size=lin_output.shape, n=1, p=p, dtype=theano.config.floatX) dropout_gate *= 1.0 / p lin_output *= dropout_gate #.dimshuffle(('x', 0)) self.output = lin_output def _calc_shape(self): """ Calculate and set self.shape. """ self.strides = np.divide(self.strides,self.pool_shape) sh = self.parent.shape for j,(i,f,p) in enumerate(zip(self.spatial_axes, self.filter_shape, self.pool_shape)): s = 1 - f if self.conv_mode=='valid' else f -1 s = (sh[i] * p) + p - 1 + s # unpool with margin then apply conv sh = sh.updateshape(i, s) # Unpooling creates asymmetric FOV (left/right is different for # some neurons), therefore we flag the FOV as exceptional with '-1' sh = sh.updatefov(j, -1) sh = sh.updateshape('f', self.n_f) sh = sh.updatestrides(self.strides) self.shape = sh def _calc_comp_cost(self): """ Calculate and set self.computational_cost. 
""" sh = self.parent.shape n_position = 1 for i,f,p in zip(self.spatial_axes, self.filter_shape, self.pool_shape): s = 1 - f if self.conv_mode=='valid' else f -1 n_position *= (sh[i] * p) + s b = 1 if sh['b'] is None else sh['b'] self.computational_cost = np.product(self.w_sh) * n_position * b def __repr__(self): s = super(NeuralLayer, self).__repr__() s += "\n" s += " n_f=%i, " %(self.n_f,) s += "%id upconv, kernel=%s, pool=%s, "\ %(self.conv_dim, self.filter_shape, self.pool_shape) s += "act='%s', " %(self.activation_func,) if self.dropout_rate: s += "Dropout rate=%.1f, "%(self.dropout_rate.get_value()) if self.batch_normalisation: s += "BN in '%s' mode "%(self.batch_normalisation,) return s
    def make_dual(self, *args, **kwargs):
        raise NotImplementedError("Use Conv instead?")
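# Illustrative usage sketch for ``UpConv`` (not part of the original module):
# with identity_init=True the layer initially approximates pixel-repetition
# upsampling by ``pool_shape``. ``c2`` refers to the hypothetical Conv Node
# from the sketch above:
#
#   >>> up = UpConv(c2, n_f=16, pool_shape=(2, 2), activation_func='relu')
#   >>> up.shape.spatial_shape  # roughly doubled spatial extent of c2's output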
[docs]class Crop(Node): """ This node type crops the output of its parent. Parameters ---------- parent: Node The input node whose output should be cropped. crop: tuple or list of ints Crop each spatial axis from either side by this number. name: str Node name. print_repr: bool Whether to print the node representation upon initialisation. """ # TODO: Write an example def __init__(self, parent, crop, name="crop", print_repr=False): super(Crop, self).__init__(parent, name, print_repr) self.crop=crop def _make_output(self): """ Computation of Theano output. """ # It is assumed that all other dimensions are matching cropper = [] k = 0 for i,s in enumerate(self.parent.shape): if i in self.parent.shape.spatial_axes: off = self.crop[k] cropper.append(slice(off, s-off)) k += 1 else: cropper.append(slice(None)) cropper = tuple(cropper) self.output = self.parent.output[cropper] def _calc_shape(self): """ Calculate and set self.shape. """ sh = self.parent.shape.copy() k = 0 for i,s in enumerate(self.parent.shape): if i in self.parent.shape.spatial_axes: off = self.crop[k] sh = sh.updateshape(i,s-2*off) k += 1 self.shape = sh def _calc_comp_cost(self): """ Calculate and set self.computational_cost. For this Node type this is hard-coded to 0. """ self.computational_cost = 0
# TODO: Maybe write a complete example config that demonstrates its usage. --> axon/mkilling/investigation/MA-TEX/CNN-Timings/DS-3-2-unet2d.py
[docs]def ImageAlign(hi_res, lo_res, hig_res_n_f, activation_func='relu', identity_init=True, batch_normalisation=False, dropout_rate=0, name="upconv", print_repr=True, w=None, b=None, gamma=None, mean=None, std=None, gradnet_mode=None): """ Try to automatically align and concatenate a high-res and a low-res convolution output of two branches of a CNN by applying UpConv and Crop to make their shapes and strides compatible. UpConv is used if the low-res Node's strides are at least twice as large as the strides of the high-res Node in any dimension. This function can be used to simplify creation of e.g. architectures similar to U-Net (see https://arxiv.org/abs/1505.04597). If a ValueError that the shapes cannot be aligned is thrown, you can try changing the filter shapes and pooling factors of the (grand-)parent Nodes or add/remove Convolutions and Crops in the preceding branches until the error disappears (of course you should try to keep those changes as minimal as possible). (This function is an alias for UpConvMerge.) Parameters ---------- hi_res: Node Parent Node with high resolution output. lo_res: Node Parent Node with low resolution output. hig_res_n_f: int Number of filters for the aligning UpConv. activation_func: str (passed to new UpConv if required). identity_init: bool (passed to new UpConv if required). batch_normalisation: bool (passed to new UpConv if required). dropout_rate: float (passed to new UpConv if required). name: str Name of the intermediate UpConv node if required. print_repr: bool Whether to print the node representation upon initialisation. w (passed to new UpConv if required). b (passed to new UpConv if required). gamma (passed to new UpConv if required). mean (passed to new UpConv if required). std (passed to new UpConv if required). gradnet_mode (passed to new UpConv if required). Returns ------- Concat Concat Node that merges the aligned high-res and low-res outputs. """ ###TODO exchange UpConv and Crop to save computation in some cases sh_hi = hi_res.shape sh_lo = lo_res.shape assert len(sh_hi)==len(sh_lo) assert sh_hi.spatial_axes == sh_lo.spatial_axes unpool = sh_lo.strides // sh_hi.strides if np.any(unpool>1): lo_res = UpConv(lo_res, hig_res_n_f, unpool, activation_func=activation_func, identity_init=identity_init, batch_normalisation=batch_normalisation, dropout_rate=dropout_rate, name=name, print_repr=print_repr, w=w, b=b, gamma=gamma, mean=mean, std=std, gradnet_mode=gradnet_mode) # No both have same stride # Shapes may have changed sh_hi = hi_res.shape.spatial_shape sh_lo = lo_res.shape.spatial_shape crop_lo = [] crop_hi = [] for i in range(len(sh_hi)): diff = sh_hi[i] - sh_lo[i] # different in orignal space if diff % 2!=0: raise ValueError("hi_res and lo_res maps cannot" "be aligned with shapes:\n%s\n%s" % (sh_hi,sh_lo)) if diff > 0: crop_hi.append(diff // 2 ) crop_lo.append(0) else: crop_lo.append(-diff // 2) crop_hi.append(0) if np.any(crop_lo): lo_res = Crop(lo_res, crop_lo, print_repr=True) if np.any(crop_hi): hi_res = Crop(hi_res, crop_hi, print_repr=True) out = Concat((lo_res, hi_res), axis='f', name='merge', print_repr=True) return out
UpConvMerge = ImageAlign
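# Illustrative usage sketch for ``ImageAlign`` / ``UpConvMerge`` (not part of
# the original module), merging a downsampled branch back into a high-res
# branch, U-Net style. ``img`` is the hypothetical 100x100 input Node from the
# Conv sketch above:
#
#   >>> d0 = Conv(img, 16, (3, 3), (1, 1))   # high-res branch, 98x98, stride 1
#   >>> d1 = Conv(d0, 32, (3, 3), (2, 2))    # downsampled branch, 48x48, stride 2
#   >>> d2 = Conv(d1, 32, (3, 3), (1, 1))    # 46x46, stride 2
#   >>> merged = UpConvMerge(d0, d2, 32)     # UpConv d2 to 92x92, crop d0 by 3 per side, Concat on 'f'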
[docs]class Pool(Node): """ Pooling layer. Reduces the count of training parameters by reducing the spatial size of its input by the factors given in ``pool_shape``. Pooling modes other than max-pooling can only be selected if cuDNN is available. Parameters ---------- parent: Node The input node. pool_shape: tuple Tuple of pooling factors (per dimension) by which the input is downsampled. stride: tuple Stride sizes (per dimension). mfp: bool If max-fragment-pooling should be used. mode: str (only if cuDNN is available) Mode can be any of the modes supported by Theano's dnn_pool(): ('max', 'average_inc_pad', 'average_exc_pad', 'sum'). name: str Name of the pooling layer. print_repr: bool Whether to print the node representation upon initialisation. """ def __init__(self, parent, pool_shape, stride=None, mfp=False, mode='max', name="pool", print_repr=True): super(Pool, self).__init__(parent, name, print_repr) if mfp and stride is not None: raise ValueError("Cannot use custom stride and MFP together") if stride is None: stride = pool_shape self.pool_shape = pool_shape self.pool_stride = stride self.mfp = mfp self.mode = mode self.strides = parent.shape.strides self.mfp_offsets = parent.shape.mfp_offsets self.axis = parent.shape.tag2index('f') #retrieve feature shape's index self.axis_order = None spatial_axes = self.parent.shape.spatial_axes conv_dim = len(pool_shape) x_dim = len(self.parent.shape) n_in = parent.shape['f'] #retrieve feature shape fail = False if conv_dim==1: if x_dim!=3 or spatial_axes!=[2]: fail = True elif conv_dim==2: if x_dim!=4 or spatial_axes!=[2,3]: fail = True elif conv_dim==3: if x_dim!=5: fail = True if spatial_axes==[2,3,4]: self.axis_order = 'dnn' elif spatial_axes==[1,3,4]: self.axis_order = 'theano' else: fail = True if fail: raise NotImplementedError("Cannot convolve non-standard shapes / axis orders," "implement reshaping before conv" "and re-reshaping afer!") self.spatial_axes = spatial_axes self.conv_dim = conv_dim def _make_output(self): """ Computation of Theano output. """ input_tensor = self.parent.output pattern = ['x' for i in input_tensor.shape] pattern[self.axis] = 0 if self.mfp: assert self.pool_stride == self.pool_shape if self.input_nodes[0].shape['b']!=1: raise ValueError("For MFP the batchsize of the raw image input must be 1") lin_output, offsets_new, strides_new = computations.fragmentpool(input_tensor, self.pool_shape, self.mfp_offsets, self.strides, self.spatial_axes, mode=self.mode) self.mfp_offsets = offsets_new self.strides = strides_new else: lin_output = computations.pooling(input_tensor,self.pool_shape, self.spatial_axes, stride=self.pool_stride, mode=self.mode) self.strides = np.multiply(self.pool_stride, self.strides) self.output = lin_output def _calc_shape(self): """ Calculate and set self.shape. 
""" sh = self.parent.shape for j,(i,p,st) in enumerate(zip(self.spatial_axes , self.pool_shape, self.pool_stride)): tmp = sh[i] - p + st - 1 s = tmp//st + 1 if self.mfp: raise NotImplementedError("Check this first before use") if (tmp - p + 1)%st!=0: raise ValueError("Cannot donwsample spatial axis '%s' of length %i " "by factor %i with pool %i, and using MFP."\ %(sh.tags[i], sh[i], st, p)) else: if (tmp+1)%st!=0: raise ValueError("Cannot donwsample spatial axis '%s' of length %i " "by factor %i with pool %i."\ %(sh.tags[i], sh[i], st, p )) sh = sh.updateshape(i, s) if sh.fov[j]>0: fov = sh.fov[j] + (p-1) * sh.strides[j] else: fov = -1 sh = sh.updatefov(j, fov) if self.mfp: sh = sh.updatemfp_offsets(self.mfp_offsets) sh = sh.updateshape('b', np.prod(self.pool_shape), mode='mult') sh = sh.updatestrides(self.strides) self.shape = sh
[docs]class FaithlessMerge(Node): """ FaithlessMerge node. Parameters ---------- hard_features: Node easy_features: Node axis failing_prob: float The higher the more often merge is unreliable hardeasy_ratio: float The higher the more often the harder features fail instead of the easy ones name: str Name of the pooling layer. print_repr: bool Whether to print the node representation upon initialisation. """ def __init__(self, hard_features, easy_features, axis='f', failing_prob=0.5, hardeasy_ratio=0.8, name="faithless_merge", print_repr=True): parent_nodes = (hard_features, easy_features) super(FaithlessMerge, self).__init__(parent_nodes, name, print_repr) if isinstance(axis, str): self.axis = parent_nodes[0].shape.tag2index(axis) else: self.axis = axis failing_prob = VariableParam(value=failing_prob, name="failing_prob", dtype=floatX, apply_train=False) hardeasy_ratio = VariableParam(value=hardeasy_ratio, name="hardeasy_ratio", dtype=floatX, apply_train=False) self.params['failing_prob'] = failing_prob self.params['hardeasy_ratio'] = hardeasy_ratio self.failing_prob = failing_prob self.hardeasy_ratio = hardeasy_ratio def _make_output(self): """ Computation of Theano output. """ # It is assumed that all other dimensions are matching rng = T.shared_randomstreams.RandomStreams(int(time.time())) size = [1,] * self.parent[0].output.ndim axes = list(range(self.parent[0].output.ndim)) not_failing = rng.binomial(size=size, n=1, p=self.failing_prob, dtype=theano.config.floatX) not_failing = T.addbroadcast(not_failing, *axes) hard_fails = rng.binomial(size=size, n=1, p=1-self.hardeasy_ratio, dtype=theano.config.floatX) hard_fails = T.addbroadcast(hard_fails, *axes) hard = self.parent[0].output * (1 - hard_fails * not_failing) easy = self.parent[1].output * (1 - (1 - hard_fails) * not_failing) self.output = T.concatenate([hard, easy], axis=self.axis) def _calc_shape(self): """ Calculate and set self.shape. """ joint_axis_size = reduce(lambda x, y: x + y.shape[self.axis], self.parent, 0) # assuming all other dimensions are equal sh = self.parent[0].shape.updateshape(self.axis, joint_axis_size) self.shape = sh def _calc_comp_cost(self): """ Calculate and set self.computational_cost. For this Node type this is hard-coded to 0. """ self.computational_cost = 0
[docs]class GRU(NeuralLayer): """ Gated Recurrent Unit Layer. Parameters ---------- parent: Node The input node. memory_state: Node Memory node. n_f: int Number of features. activation_func: str Activation function name. flatten: bool (Unsupported). batch_normalisation: str or None Batch normalisation mode. Can be False (inactive), "train" or "fadeout". dropout_rate: float Dropout rate (probability that a node drops out in a training step). name: str Layer name. print_repr: bool Whether to print the node representation upon initialisation. w: np.ndarray or T.TensorVariable (Unsupported). Weight matrix. If this is a np.ndarray, its values are used to initialise a shared variable for this layer. If it is a T.TensorVariable, it is directly used (weight sharing with the layer which this variable comes from). b: np.ndarray or T.TensorVariable (Unsupported). Bias vector. If this is a np.ndarray, its values are used to initialise a shared variable for this layer. If it is a T.TensorVariable, it is directly used (weight sharing with the layer which this variable comes from). gamma (For batch normalisation) Initializes gamma parameter. mean (For batch normalisation) Initializes mean parameter. std (For batch normalisation) Initializes std parameter. gradnet_mode """ def __init__(self, parent, memory_state, n_f, activation_func='tanh', flatten=False, batch_normalisation=False, dropout_rate=0, name="gru", print_repr=True, w=None, b=None, gamma=None, mean=None, std=None, gradnet_mode=None): parent_nodes = (parent, memory_state) super(GRU, self).__init__(parent_nodes, name, print_repr) self.n_f = n_f self.n_f_memory = memory_state.shape['f'] self.activation_func = activation_func self.batch_normalisation = batch_normalisation self.gradnet_mode = gradnet_mode self.axis = parent.shape.tag2index('f') #retrieve feature shape's index self.spatial_axes = parent.shape.spatial_axes self.flatten = flatten if flatten: raise NotImplementedError("Flatten is not yet supported for GRU.") n_in = parent.shape.stripbatch_prod else: n_in = parent.shape['f'] if self.n_f_memory != n_f: raise ValueError("n_f_memory != n_f not possible") if parent.shape.hastag('r'): raise ValueError("Input must not have 'r' axis") n_comb = self.n_f_memory + n_in if w != None or b != None: raise NotImplementedError("Initial weights are not yet supported for GRU.") w_sh = (n_comb, 3*n_f) # [h_t-1, x] x [W_z/x, W_r/x, W_h/x] self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate) def _make_output(self): """ Computation of Theano output. 
""" parent = self.parent[0].output memory = self.parent[1].output pattern = ['x' for i in parent.shape] pattern[self.axis] = 0 broad_caster_shape = list(parent.shape) broad_caster_shape[self.axis] = self.n_f_memory broad_caster = T.ones(broad_caster_shape, dtype=memory.dtype) memory = memory * broad_caster input_tensor = T.concatenate([memory, parent] , axis=self.axis) activation_func = self.activation_func if activation_func.startswith("maxout"): r=int(activation_func.split(" ")[1]) assert r>=2 self.n_f /= r if activation_func=='prelu': b = self.b[:-self.n_f,0].dimshuffle(pattern) b_h = self.b[-self.n_f:,0].dimshuffle(pattern) b1 = self.b[:-self.n_f,1].dimshuffle(pattern) b1_h = self.b[-self.n_f:,1].dimshuffle(pattern) else: b = self.b[:-self.n_f].dimshuffle(pattern) b_h = self.b[-self.n_f:].dimshuffle(pattern) b1 = None b1_h = None lin_output = computations.dot(input_tensor, self.w[:, :-self.n_f], self.axis) if self.batch_normalisation in ['train', 'fadeout']: raise NotImplementedError("Batch normalisation not yet supported for GRU.") mean = computations.apply_except_axis( lin_output,self.axis, T.mean).dimshuffle(pattern) std = computations.apply_except_axis( lin_output,self.axis, T.std).dimshuffle(pattern) + 1e-6 gamma = self.gamma.dimshuffle(pattern) if self.batch_normalisation=='fadeout': logger.warning("Batch Normalisation mode 'fadeout' does not " "work for less than 50%%...") mean = self.gradnet_rate * mean std = self.gradnet_rate * std + (1-self.gradnet_rate) * 1.0 gamma = self.gradnet_rate * gamma self.mean.updates = (self.mean, 0.9995 * self.mean + 0.0005 * T.extra_ops.squeeze(mean)) self.std.updates = (self.std, 0.9995 * self.std + 0.0005 * T.extra_ops.squeeze(std)) elif self.batch_normalisation=='predict': raise NotImplementedError("Batch normalisation not yet supported for GRU.") mean = self.mean.dimshuffle(pattern) std = self.std.dimshuffle(pattern) gamma = self.gamma.dimshuffle(pattern) else: mean = 0 std = 1 gamma = 1 lin_output = (gamma / std) * lin_output + b - (gamma * mean / std) act = computations.apply_activation(lin_output, 'sig', b1) slice_obj = [slice(None) for i in range(act.ndim)] slice_obj[self.axis] = slice(0, self.n_f) z = act[slice_obj] slice_obj[self.axis] = slice(self.n_f, None) r = act[slice_obj] gated_input = T.concatenate([r*memory, parent], axis=self.axis) h_tilde = computations.dot(gated_input, self.w[:, -self.n_f:], self.axis) h_tilde = (gamma / std) * h_tilde + b_h - (gamma * mean / std) h_tilde = computations.apply_activation(h_tilde, activation_func, b1_h) act = (1 - z) * memory + z * h_tilde self._debug_outputs = [memory, act, z, r,] if self.dropout_rate: raise NotImplementedError("Dropout not yet supported for GRU.") rng = T.shared_randomstreams.RandomStreams(int(time.time())) p = 1 - self.dropout_rate dropout_gate = rng.binomial(size=(self.n_f,), n=1, p=p, dtype=theano.config.floatX) dropout_gate *= 1.0 / p act = act * dropout_gate.dimshuffle(('x', 0)) self.output = act def _calc_shape(self): """ Calculate and set self.shape. """ sh = self.parent[0].shape if self.flatten: self.shape = TaggedShape((sh['b'], self.n_f), 'b,f') else: self.shape = sh.updateshape('f', self.n_f) def _calc_comp_cost(self): """ Calculate and set self.computational_cost. """ n = self.parent[0].shape.stripnone_prod self.computational_cost = 3 * n * self.n_f
[docs]class LSTM(NeuralLayer): """ Long short term memory layer. Using an implementation without peepholes in f, i, o, i.e. weights cell state is not taken into account for weights. See http://colah.github.io/posts/2015-08-Understanding-LSTMs/. Parameters ---------- parent: Node The input node. memory_states: Node Concatenated (initial) feed-back and cell state (one Node!). n_f: int Number of features. activation_func: str Activation function name. flatten batch_normalisation: str or None Batch normalisation mode. Can be False (inactive), "train" or "fadeout". dropout_rate: float Dropout rate (probability that a node drops out in a training step). name: str Layer name. print_repr: bool Whether to print the node representation upon initialisation. w: np.ndarray or T.TensorVariable Weight matrix. If this is a np.ndarray, its values are used to initialise a shared variable for this layer. If it is a T.TensorVariable, it is directly used (weight sharing with the layer which this variable comes from). b: np.ndarray or T.TensorVariable Bias vector. If this is a np.ndarray, its values are used to initialise a shared variable for this layer. If it is a T.TensorVariable, it is directly used (weight sharing with the layer which this variable comes from). gamma (For batch normalisation) Initializes gamma parameter. mean (For batch normalisation) Initializes mean parameter. std (For batch normalisation) Initializes std parameter. gradnet_mode """ def __init__(self, parent, memory_states, n_f, activation_func='tanh', flatten=False, batch_normalisation=False, dropout_rate=0, name="lstm", print_repr=True, w=None, b=None, gamma=None, mean=None, std=None, gradnet_mode=None): parent_nodes = (parent, memory_states) super(LSTM, self).__init__(parent_nodes, name, print_repr) self.n_f = n_f self.n_f_memory = memory_states.shape['f'] self.activation_func = activation_func self.batch_normalisation = batch_normalisation self.gradnet_mode = gradnet_mode self.axis = parent.shape.tag2index('f') #retrieve feature shape's index self.spatial_axes = parent.shape.spatial_axes self.flatten = flatten if flatten: raise NotImplementedError("Flatten is not yet supported for LSTM.") else: n_in = parent.shape['f'] n_comb = n_f + n_in if w != None or b != None: raise NotImplementedError("Initial weights are not yet supported for LSTM.") if self.n_f_memory != 2*n_f: raise ValueError("n_f of memory_states must be 2*n_f!") if parent.shape.hastag('r'): raise ValueError("Input must not have 'r' axis") w_sh = (n_comb, 4*n_f) # f, i, o, C self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate) def _make_output(self): """ Computation of Theano output. 
""" parent = self.parent[0].output memory = self.parent[1].output broad_caster_shape = list(parent.shape) broad_caster_shape[self.axis] = self.n_f_memory broad_caster = T.ones(broad_caster_shape, dtype=memory.dtype) memory = memory * broad_caster slice_obj = [slice(None) for i in range(len(self.parent[1].shape))] slice_obj[self.parent[1].shape.tag2index('f')] = slice(self.n_f) feed_back = memory[slice_obj] slice_obj[self.parent[1].shape.tag2index('f')] = slice(self.n_f, None) cell_state = memory[slice_obj] input_tensor = T.concatenate([feed_back, parent] , axis=self.axis) #h, x pattern = ['x' for i in input_tensor.shape] pattern[self.axis] = 0 activation_func = self.activation_func if activation_func.startswith("maxout"): r=int(activation_func.split(" ")[1]) assert r>=2 self.n_f /= r if activation_func=='prelu': b = self.b[:, 0].dimshuffle(pattern) b1 = self.b[:, 1] b1_f = b1[:self.n_f].dimshuffle(pattern) b1_i = b1[self.n_f:2*self.n_f].dimshuffle(pattern) b1_o = b1[2*self.n_f:3*self.n_f].dimshuffle(pattern) b1_c = b1[3*self.n_f:].dimshuffle(pattern) else: b = self.b.dimshuffle(pattern) b1_f = None b1_i = None b1_o = None b1_c = None lin_output = computations.dot(input_tensor, self.w, self.axis) if self.batch_normalisation in ['train', 'fadeout']: raise NotImplementedError("Batch normalisation not yet supported for LSTM.") mean = computations.apply_except_axis( lin_output,self.axis, T.mean).dimshuffle(pattern) std = computations.apply_except_axis( lin_output,self.axis, T.std).dimshuffle(pattern) + 1e-6 gamma = self.gamma.dimshuffle(pattern) if self.batch_normalisation=='fadeout': logger.warning("Batch Normalisation mode 'fadeout' does not " "work for less than 50%%...") mean = self.gradnet_rate * mean std = self.gradnet_rate * std + (1-self.gradnet_rate) * 1.0 gamma = self.gradnet_rate * gamma self.mean.updates = (self.mean, 0.9995 * self.mean + 0.0005 * T.extra_ops.squeeze(mean)) self.std.updates = (self.std, 0.9995 * self.std + 0.0005 * T.extra_ops.squeeze(std)) elif self.batch_normalisation=='predict': raise NotImplementedError("Batch normalisation not yet supported for LSTM.") mean = self.mean.dimshuffle(pattern) std = self.std.dimshuffle(pattern) gamma = self.gamma.dimshuffle(pattern) else: mean = 0 std = 1 gamma = 1 lin_output = (gamma / std) * lin_output + b - (gamma * mean / std) slice_obj = [slice(None) for i in range(lin_output.ndim)] slice_obj[self.axis] = slice(self.n_f) f = computations.apply_activation(lin_output[slice_obj], 'sig', b1_f) slice_obj[self.axis] = slice(self.n_f, 2*self.n_f) i = computations.apply_activation(lin_output[slice_obj], 'sig', b1_i) slice_obj[self.axis] = slice(2*self.n_f, 3*self.n_f) o = computations.apply_activation(lin_output[slice_obj], 'sig', b1_o) slice_obj[self.axis] = slice(3*self.n_f, 4*self.n_f) c_tilde = computations.apply_activation(lin_output[slice_obj], activation_func, b1_c) cell_out = f * cell_state + i * c_tilde lin_output = o * computations.apply_activation(cell_out, activation_func, None) if self.dropout_rate: raise NotImplementedError("Dropout not yet supported for LSTM.") rng = T.shared_randomstreams.RandomStreams(int(time.time())) p = 1 - self.dropout_rate dropout_gate = rng.binomial(size=(self.n_f,), n=1, p=p, dtype=theano.config.floatX) dropout_gate *= 1.0 / p lin_output = lin_output * dropout_gate.dimshuffle(('x', 0)) self.output = T.concatenate([lin_output, cell_out], axis=self.axis) def _calc_shape(self): """ Calculate and set self.shape. 
""" sh = self.parent[0].shape if self.flatten: self.shape = TaggedShape((sh['b'], 2*self.n_f), 'b,f') else: self.shape = sh.updateshape('f',2* self.n_f) def _calc_comp_cost(self): """ Calculate and set self.computational_cost. """ n = self.parent[0].shape.stripnone_prod self.computational_cost = 4 * n * self.n_f
[docs]class LRN(Node): """ LRN (Local Response Normalization) layer. Parameters ---------- parent: Node The input node. filter_shape: tuple mode: str Can be "spatial" or "channel". alpha: float k: float beta: float name: str Node name. print_repr: bool Whether to print the node representation upon initialisation. """ def __init__(self, parent, filter_shape, mode='spatial', alpha=0.0001, k=1, beta=0.75, name="LRN", print_repr=True): super(LRN, self).__init__(parent, name, print_repr) self.mode = mode self.filter_shape = filter_shape self.axis = parent.shape.tag2index('f') # retrieve feature shape's index if mode=='spatial': self.axis_order = None self.spatial_axes = self.parent.shape.spatial_axes conv_dim = len(self.spatial_axes) x_dim = len(self.parent.shape) if len(self.spatial_axes)!=len(filter_shape): raise ValueError("The filter_shape dimensionality (%i) and the number" "of spatial dimensions in the input (%i)differ!" "Use filtersize 1 on axes which should not be" "averaged."\ %(len(filter_shape), conv_dim, )) n_in = parent.shape['f'] #retrieve feature shape fail = False if conv_dim==1: if x_dim!=3 or self.spatial_axes!=[2]: fail = True w_sh = [n_in, n_in] + list(filter_shape) elif conv_dim==2: if x_dim!=4 or self.spatial_axes!=[2,3]: fail = True w_sh = [n_in, n_in] + list(filter_shape) elif conv_dim==3: if x_dim!=5: fail = True if self.spatial_axes==[2,3,4]: self.axis_order = 'dnn' w_sh = [n_in, n_in] + list(filter_shape) elif self.spatial_axes==[1,3,4]: self.axis_order = 'theano' w_sh = [n_in, filter_shape[0], n_in] + list(filter_shape[1:]) else: fail = True if fail: raise NotImplementedError("Cannot convolve non-standard shapes / axis orders," "implement reshaping before conv" "and re-reshaping afer!") self.conv_dim = conv_dim self.w_sh = w_sh value = np.zeros(w_sh, dtype=floatX) val = 1.0 / np.product(filter_shape) for i in range(n_in): value[i,i] = val self.average_filter = ConstantParam(value, '<%s_filter%s>'%(self.name, tuple(w_sh))) self.params['average_filter'] = self.average_filter elif mode=='channel': assert isinstance(filter_shape, int) assert filter_shape%2==1 else: raise ValueError("Unknow mode %s"%mode) self.alpha = VariableParam(value=alpha, name="alpha", dtype=floatX, apply_train=False) self.beta = VariableParam(value=beta,name="beta", dtype=floatX, apply_train=False) self.k = VariableParam(value=k,name="k", dtype=floatX, apply_train=False) self.params['alpha'] = self.alpha self.params['beta'] = self.beta self.params['k'] = self.k def _make_output(self): """ Computation of Theano output. 
""" input_tensor = self.parent.output input_shape = list(self.parent.shape) if self.mode=='spatial': mean_square = computations.conv(T.square(input_tensor), self.average_filter, self.axis_order, border_mode='same', x_shape=input_shape, w_shape=self.w_sh) else: n_f = input_shape[self.axis] in_square = T.square(input_tensor) half_n = self.filter_shape // 2 new_sh = list(input_tensor.shape) new_sh[self.axis] += 2 * half_n in_square_ext = T.zeros(new_sh, floatX) slicer = [slice(None)] * input_tensor.ndim slicer[self.axis] = slice(half_n,half_n+n_f) in_square_ext = T.set_subtensor(in_square_ext[slicer], in_square) # pad left slicer[self.axis] = slice(0, half_n) pad_slicer = [slice(None)] * input_tensor.ndim pad_slicer[self.axis] = slice(0, 1) in_square_ext = T.set_subtensor(in_square_ext[slicer], in_square[pad_slicer]) # pad right slicer[self.axis] = slice(half_n+n_f, 2*half_n+n_f) pad_slicer[self.axis] = slice(n_f-1,n_f) in_square_ext = T.set_subtensor(in_square_ext[slicer], in_square[pad_slicer]) mean_square = 0 for i in range(self.filter_shape): slicer[self.axis] = slice(i,i+n_f) mean_square += in_square_ext[slicer] mean_square /= self.filter_shape divisor = T.power(self.k + self.alpha * mean_square, self.beta) self.output = input_tensor / divisor self._debug_outputs = [mean_square, divisor]