# -*- coding: utf-8 -*-
# ELEKTRONN2 Toolkit
# Copyright (c) 2015 Marius Killinger and Philipp J. Schubert
# All rights reserved
from __future__ import absolute_import, division, print_function
from builtins import filter, hex, input, int, map, next, oct, pow, range, super, zip
import logging
import time
from functools import reduce
import numpy as np
import theano
import theano.tensor as T
from ..config import config
from . import computations
from .variables import VariableWeight, ConstantParam, VariableParam
from .graphutils import floatX, TaggedShape, as_floatX
from .node_basic import Node, Concat
# Module-level logger, obtained under the name 'elektronn2log'.
logger = logging.getLogger('elektronn2log')
# Names exported by a star-import of this module.
__all__ = ['Perceptron', 'Conv', 'UpConv', 'Crop', 'LSTM',
'FragmentsToDense', 'Pool', 'Dot', 'FaithlessMerge',
'GRU', 'LRN', 'ImageAlign', 'UpConvMerge']
################################################################################
### TODO: Adapt the gradnet handling of the Conv layer for the other layers? Maybe not now...
class NeuralLayer(Node):
    """
    Dummy class to add parameter initialisation methods for neural layers.

    ``_register_param`` reads ``self.name`` and writes to ``self.params``;
    ``_setup_params`` additionally reads ``self.spatial_axes``,
    ``self.activation_func``, ``self.n_f`` and ``self.batch_normalisation``.
    Subclasses must set these attributes before calling ``_setup_params``.
    """

    def _register_param(self, param, shape, name, init_kwargs=None,
                        apply_train=False, apply_reg=False):
        """
        Create parameter, set parameter as attribute and add to self.params if
        not shared from another Layer.

        Parameters
        ----------
        param: None or np.ndarray or T.Variable or list
            Possible forms of ``param``:

            * Passing ``None`` creates new parameter with default
              initialisation.
            * Passing a np.ndarray creates new parameter with the values
              of the array as initialisation. A 0-dimensional array is
              accepted for a parameter of shape ``(1,)``.
            * A shared parameter is created by passing a T.Variable as
              ``param``. It is set as attribute but not added to
              ``self.params``.
            * A constant parameter is created by passing [np.ndarray, 'const']
              as ``param``.
              This parameter cannot be changed (no set_value) but makes the
              compiled function faster.
            * Passing [np.ndarray, 'trainable'] creates a new parameter from
              the array that is always flagged as trainable.
        shape: tuple
            Shape of the new the parameter (VariableWeight).
        name: str
            Parameter name.
        init_kwargs
            kwargs for utils.initialisation.
        apply_train: bool
            Train flag of the new parameter (VariableWeight).
        apply_reg: bool
            Regularisation flag of the new parameter (VariableWeight).

        Raises
        ------
        ValueError
            If the shape of a given value does not match ``shape`` or if
            ``param`` has an unsupported form.
        """
        add_to_params = True
        # ``shape`` may arrive as a list; compare against a tuple throughout.
        shape_t = tuple(shape)
        if self.name == '':
            p_name = '<%s%s>' % (name, shape_t)
        else:
            p_name = '<%s_%s%s>' % (self.name, name, shape_t)

        if param is None:
            # Create new trainable by initialisation
            p = VariableWeight(shape=shape,
                               init_kwargs=init_kwargs,
                               name=p_name,
                               apply_train=apply_train,
                               apply_reg=apply_reg)
        elif isinstance(param, np.ndarray):
            # Create new trainable from values
            scalar_for_len1 = param.ndim == 0 and shape_t == (1,)
            if param.shape != shape_t and not scalar_for_len1:
                raise ValueError("Shape mismatch. Required %s, given %s"
                                 % (shape, param.shape))
            p = VariableWeight(value=param,
                               name=p_name,
                               apply_train=apply_train,
                               apply_reg=apply_reg,
                               dtype=floatX)
        elif isinstance(param, T.Variable):
            # Share a variable from elsewhere, not trainable
            # (elektronn2.tensor.variables are T.Variable)
            try:
                given_shape = param.get_value().shape
            except AttributeError:
                # Symbolic expressions have no concrete value to inspect.
                logger.warning("Could not check correct shape of given weight "
                               "%s, make sure it has shape %s", param, shape)
            else:
                if given_shape != shape_t:
                    raise ValueError("Shape mismatch. Required %s, given %s"
                                     % (shape, given_shape))
            p = param
            add_to_params = False
        elif isinstance(param, (list, tuple)):
            # Create constant variable or explicitly trainable variable
            value = param[0] if len(param) > 0 else None
            mode = param[1] if len(param) > 1 else None
            # Validate the form first, so that a non-array first entry gives
            # the descriptive error instead of an AttributeError on ``.shape``.
            if not isinstance(value, np.ndarray) \
                    or mode not in ('const', 'trainable'):
                raise ValueError("If a parameter is passed as a list, the "
                                 "first entry must contain the parameter "
                                 "value (np.ndarray) and the second entry "
                                 "must be either 'const' or 'trainable' "
                                 "to indicate whether this param is "
                                 "trainable. Got [%s, %s]"
                                 % (type(value), mode))
            if value.shape != shape_t:
                raise ValueError("Shape mismatch. Required %s, given %s"
                                 % (shape, value.shape))
            value = as_floatX(value)
            if mode == 'const':
                p = ConstantParam(value, p_name)
            else:
                p = VariableWeight(value=value,
                                   name=p_name,
                                   apply_train=True,
                                   apply_reg=apply_reg)
        else:
            raise ValueError("Parameter %s must be either <np.ndarray>, "
                             "<theano.TensorVariable>, a tuple or None "
                             "(to create new param)" % (name,))

        setattr(self, name, p)
        if add_to_params:
            self.params[name] = p
        else:
            logger.debug("Sharing theano variable %s. This parameter is not "
                         "added to self.params", p)

    def _bias_norm(self, w_sh):
        """
        Return the normalisation for the bias scale: the product of the
        kernel extents on the spatial axes if ``w_sh`` has more than two
        entries, else 1.0.
        """
        if len(w_sh) <= 2:
            return 1.0
        fov = 1
        for i in self.spatial_axes:
            fov = fov * w_sh[i]
        return fov

    def _setup_params(self, w_sh, w, b, gamma, mean, std, dropout_rate,
                      pool_shape=None, gradnet_rate=None):
        """
        Register each parameter, choose appropriate initialisation.

        Parameters
        ----------
        w_sh: tuple or list
            Shape of the weight parameter ``w``.
        w, b, gamma, mean, std
            Values for the respective parameters in any of the forms accepted
            by ``_register_param`` (``None`` creates a new parameter).
        dropout_rate: float
            If truthy, registered as parameter ``dropout_rate`` of shape (1,).
        pool_shape: tuple or None
            Passed on as ``pool`` to the weight initialisation kwargs.
        gradnet_rate: float or None
            If truthy, registered as parameter ``gradnet_rate`` of shape (1,).

        Raises
        ------
        ValueError
            If ``mean`` or ``std`` are passed in a training batch
            normalisation mode, or if the batch normalisation mode is unknown.
        """
        # Dot/Conv/Bias Parameters ############################################
        self.w = None
        # TODO: Pass w_init mode from layer to setup_params
        if config.use_ortho_init or isinstance(self, GRU) \
                or isinstance(self, LSTM):
            w_mode = 'ortho'
        else:
            w_mode = 'normal'
        w_init = dict(scale='glorot', mode=w_mode, pool=pool_shape,
                      spatial_axes=self.spatial_axes)
        self._register_param(w, w_sh, 'w', init_kwargs=w_init,
                             apply_train=True, apply_reg=True)

        activation_func = self.activation_func
        n_f = self.n_f
        self.b = None
        # Recurrent layers keep one bias vector per gate.
        if isinstance(self, GRU):
            n_bias = 3 * n_f
        elif isinstance(self, LSTM):
            n_bias = 4 * n_f
        else:
            n_bias = n_f
        b_sh = (n_bias,)

        if activation_func == 'relu' or activation_func.startswith("maxout"):
            b_init = dict(scale=1.0 / self._bias_norm(w_sh), mode='const')
        elif activation_func == 'sigmoid':
            b_init = dict(scale=0.5, mode='const')
        elif activation_func == 'prelu':
            b_init = dict(scale=1.0 / self._bias_norm(w_sh), mode='prelu')
            # prelu stores a second value per unit next to the bias
            b_sh = (n_bias, 2)
        else:  # all other activations
            b_init = dict(scale=1e-6, mode='fix-uni')
        self._register_param(b, b_sh, 'b', init_kwargs=b_init,
                             apply_train=True, apply_reg=False)

        # Batch Normalisation #################################################
        batch_normalisation = self.batch_normalisation
        if batch_normalisation in ['train', 'fadeout']:
            # mean and std are created as TensorVariables in _calc_output
            sh = (n_f,)
            self.gamma = None
            g_init = dict(scale=1.0, mode='const')
            ###TODO maybe even stronger reg for this?
            self._register_param(gamma, sh, 'gamma', init_kwargs=g_init,
                                 apply_train=True, apply_reg=3.0)
            if mean is not None or std is not None:
                raise ValueError("Cannot pass mean and std for training, they "
                                 "are computed in the theano graph.")
            # create mean and std for training to accumulate running avgs
            self.mean = None
            m_init = dict(scale=0.0, mode='const')
            self._register_param(None, sh, 'mean', init_kwargs=m_init)
            self.std = None
            s_init = dict(scale=1.0, mode='const')
            self._register_param(None, sh, 'std', init_kwargs=s_init)
        elif batch_normalisation == 'predict':
            sh = (n_f,)
            self.gamma = None
            g_init = dict(scale=1.0, mode='const')
            self._register_param(gamma, sh, 'gamma', init_kwargs=g_init)
            self.mean = None
            m_init = dict(scale=0.0, mode='const')
            self._register_param(mean, sh, 'mean', init_kwargs=m_init)
            self.std = None
            s_init = dict(scale=1.0, mode='const')
            self._register_param(std, sh, 'std', init_kwargs=s_init)
        elif batch_normalisation is not False:
            raise ValueError("Unknown value %s for batchnormalisation"
                             % batch_normalisation)

        # Dropout #############################################################
        self.dropout_rate = None
        if dropout_rate:
            value = as_floatX(dropout_rate)
            self._register_param(value, (1,), 'dropout_rate')

        # GradNet #############################################################
        self.gradnet_rate = None
        if gradnet_rate:
            value = as_floatX(gradnet_rate)
            self._register_param(value, (1,), 'gradnet_rate')
###############################################################################
class Perceptron(NeuralLayer):
    """
    Perceptron Layer.

    Parameters
    ----------
    parent: Node or list of Node
        The input node(s).
    n_f: int
        Number of filters (nodes) in layer.
    activation_func: str
        Activation function name.
    flatten: bool
        If True, the parent output is flattened to 2 dimensions before the
        dot product (requires the feature axis to be axis 1).
    batch_normalisation: str or None
        Batch normalisation mode.
        Can be False (inactive), "train", "fadeout" or "predict".
    dropout_rate: float
        Dropout rate (probability that a node drops out in a training step).
    name: str
        Perceptron name.
    print_repr: bool
        Whether to print the node representation upon initialisation.
    w: np.ndarray or T.TensorVariable
        Weight matrix.
        If this is a np.ndarray, its values are used to initialise a
        shared variable for this layer.
        If it is a T.TensorVariable, it is directly used (weight sharing
        with the layer which this variable comes from).
    b: np.ndarray or T.TensorVariable
        Bias vector.
        If this is a np.ndarray, its values are used to initialise a
        shared variable for this layer.
        If it is a T.TensorVariable, it is directly used (weight sharing
        with the layer which this variable comes from).
    gamma
        (For batch normalisation) Initializes gamma parameter.
    mean
        (For batch normalisation) Initializes mean parameter.
    std
        (For batch normalisation) Initializes std parameter.
    gradnet_mode
        Stored as attribute; not otherwise used by this layer.
    """  # TODO: Write docs on batch normalisation modes.
    # TODO: gradnet_mode seems to be unused. Should it be removed?

    def __init__(self, parent, n_f, activation_func='relu',
                 flatten=False, batch_normalisation=False, dropout_rate=0,
                 name="dot", print_repr=True, w=None, b=None, gamma=None,
                 mean=None, std=None, gradnet_mode=None):
        super(Perceptron, self).__init__(parent, name, print_repr)
        self.n_f = n_f
        self.activation_func = activation_func
        self.batch_normalisation = batch_normalisation
        self.gradnet_mode = gradnet_mode
        self.axis = parent.shape.tag2index('f')  # index of the feature axis
        self.flatten = flatten
        self.spatial_axes = parent.shape.spatial_axes

        if flatten:
            n_in = parent.shape.stripbatch_prod
        else:
            n_in = parent.shape['f']  # number of input features
        w_sh = (n_in, n_f)
        self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate)

    def _make_output(self):
        """
        Computation of Theano output.

        Applies dot product, optional batch normalisation, bias, activation
        and optional dropout, and stores the result in ``self.output``.
        For "maxout" activations ``self.n_f`` is divided by the maxout
        factor as a side effect.
        """
        if self.flatten:
            # ``!=`` instead of ``is not``: identity of ints is unreliable
            if self.axis != 1:
                raise NotImplementedError("Cannot flatten tensor for "
                                          "PerceptronLayer when batchsize is "
                                          "not on first axis")
            input_tensor = self.parent.output.flatten(2)
            pattern = ['x', 0]
        else:
            input_tensor = self.parent.output
            pattern = ['x'] * input_tensor.ndim
            pattern[self.axis] = 0

        activation_func = self.activation_func
        if activation_func.startswith("maxout"):
            r = int(activation_func.split(" ")[1])
            assert r >= 2
            # Floor division keeps n_f an integer (true division is active
            # through ``from __future__ import division``).
            self.n_f //= r

        if activation_func == 'prelu':
            b = self.b[:, 0].dimshuffle(pattern)
            b1 = self.b[:, 1].dimshuffle(pattern)
        else:
            b = self.b.dimshuffle(pattern)
            b1 = None

        lin_output = computations.dot(input_tensor, self.w, self.axis)

        if self.batch_normalisation in ['train', 'fadeout']:
            mean = computations.apply_except_axis(
                lin_output, self.axis, T.mean).dimshuffle(pattern)
            std = computations.apply_except_axis(
                lin_output, self.axis, T.std).dimshuffle(pattern) + 1e-6
            gamma = self.gamma.dimshuffle(pattern)
            if self.batch_normalisation == 'fadeout':
                # This layer's constructor never registers a gradnet_rate,
                # so fail with a clear message instead of ``None * tensor``.
                if self.gradnet_rate is None:
                    raise ValueError("Batch normalisation mode 'fadeout' "
                                     "requires a gradnet_rate parameter, "
                                     "but none is registered for this layer.")
                logger.warning("Batch Normalisation mode 'fadeout' does not "
                               "work for less than 50%...")
                mean = self.gradnet_rate * mean
                std = self.gradnet_rate * std + (1 - self.gradnet_rate) * 1.0
                gamma = self.gradnet_rate * gamma

            # Running averages of the batch statistics
            self.mean.updates = (self.mean,
                                 0.9995 * self.mean
                                 + 0.0005 * T.extra_ops.squeeze(mean))
            self.std.updates = (self.std,
                                0.9995 * self.std
                                + 0.0005 * T.extra_ops.squeeze(std))
        elif self.batch_normalisation == 'predict':
            mean = self.mean.dimshuffle(pattern)
            std = self.std.dimshuffle(pattern)
            gamma = self.gamma.dimshuffle(pattern)
        else:
            mean = 0
            std = 1
            gamma = 1

        lin_output = (gamma / std) * lin_output + b - (gamma * mean / std)
        lin_output = computations.apply_activation(lin_output,
                                                   activation_func, b1)

        if self.dropout_rate:
            rng = T.shared_randomstreams.RandomStreams(int(time.time()))
            p = 1 - self.dropout_rate
            dropout_gate = rng.binomial(size=(self.n_f,), n=1, p=p,
                                        dtype=theano.config.floatX)
            dropout_gate *= 1.0 / p  # rescale to keep the expected value
            lin_output = lin_output * dropout_gate.dimshuffle(pattern)

        self.output = lin_output

    def _calc_shape(self):
        """
        Calculate shape from parent shape and n_f and set it as self.shape.
        """
        sh = self.parent.shape
        if self.flatten:
            self.shape = TaggedShape((sh['b'], self.n_f), 'b,f')
        else:
            self.shape = sh.updateshape('f', self.n_f)

    def _calc_comp_cost(self):
        """
        Calculate abstract computational cost from parent shape and n_f and
        set it as self.computational_cost.
        """
        n = self.parent.shape.stripnone_prod
        self.computational_cost = n * self.n_f

    def make_dual(self, parent, share_w=False, **kwargs):
        """
        Create the inverse of this ``Perceptron``.

        Most options are the same as for the layer itself.
        If ``kwargs`` are not specified, the values of the primal
        layers are re-used and new parameters are created.

        Parameters
        ----------
        parent: Node
            The input node.
        share_w: bool
            If the weights (``w``) should be shared from the primal layer.
        kwargs: dict
            kwargs that are passed through to the constructor of the inverted
            Perceptron (see signature of ``Perceptron``).
            ``n_f`` is copied from the existing node on
            which ``make_dual`` is called.
            Every other parameter can be changed from the original
            ``Perceptron``'s defaults by specifying it in ``kwargs``.

        Returns
        -------
        Perceptron
            The inverted perceptron layer.

        Raises
        ------
        NotImplementedError
            If this layer flattens its input.
        ValueError
            If the feature count of ``parent`` differs from ``self.n_f``.
        """
        if self.flatten:
            raise NotImplementedError("Cannot make dual Layer for flattened "
                                      "Perceptron Layer.")

        if self.dropout_rate:
            dropout_rate = self.dropout_rate.get_value()
        else:
            dropout_rate = 0.0
        defaults = dict(activation_func=self.activation_func,
                        batch_normalisation=self.batch_normalisation,
                        dropout_rate=dropout_rate,
                        name=self.name + '.T',
                        print_repr=self._print_repr,
                        w=None, b=None, gamma=None, mean=None, std=None)
        defaults.update(kwargs)
        kwargs = defaults

        if share_w:
            if kwargs['w'] is not None:
                logger.debug("Ignoring passed w because w is shared from "
                             "primal Layer.")
            kwargs['w'] = self.w.T

        n_f = self.parent.shape['f']  # This is the output of the dual Layer
        # Compare by value: ``is not`` on ints only works by accident for
        # small numbers.
        if self.n_f != parent.shape['f']:  # input of dual Layer
            raise ValueError("Cannot make dual Layer of:\n"
                             "%s \n"
                             "with input: %s! \n"
                             "The output shape of the input for the dual Layer "
                             "must match the input shape of the primal Layer."
                             % (self, parent))

        return Perceptron(parent, n_f, **kwargs)

    def __repr__(self):
        s = super(NeuralLayer, self).__repr__()
        s += "\n"
        s += " n_f=%i, " % (self.n_f,)
        s += "act='%s', " % (self.activation_func,)
        if self.flatten:
            s += "input was flattened, "
        if self.dropout_rate:
            s += "Dropout rate=%.1f, " % (self.dropout_rate.get_value())
        if self.batch_normalisation:
            s += "BN in '%s' mode " % (self.batch_normalisation,)
        return s


Dot = Perceptron
###############################################################################
class Conv(Perceptron):
    """
    Convolutional layer with subsequent pooling.

    Examples
    --------
    Examples for constructing convolutional neural networks can be found
    in examples/3d_cnn.py and examples/numa_mnist.py.

    Parameters
    ----------
    parent: Node
        The input node.
    n_f: int
        Number of features.
    filter_shape: tuple
        Shape of the convolution filter kernels.
    pool_shape: tuple
        Size/shape of pooling after the convolution.
    conv_mode: str
        Possible values:

        * "valid": only apply filter to complete patches of the image.
          Generates output of shape: image_shape -filter_shape + 1.
        * "full" zero-pads image to multiple of filter shape to generate
          output of shape: image_shape + filter_shape - 1.
        * "same": the shape calculation keeps the image shape unchanged.
    activation_func: str
        Activation function name.
    mfp: bool
        Whether to apply Max-Fragment-Pooling in this Layer.
    batch_normalisation: str or None
        Batch normalisation mode.
        Can be False (inactive), "train", "fadeout" or "predict".
    dropout_rate: float
        Dropout rate (probability that a node drops out in a training step).
    name: str
        Layer name.
    print_repr: bool
        Whether to print the node representation upon initialisation.
    w: np.ndarray or T.TensorVariable
        Weight matrix.
        If this is a np.ndarray, its values are used to initialise a
        shared variable for this layer.
        If it is a T.TensorVariable, it is directly used (weight sharing
        with the layer which this variable comes from).
    b: np.ndarray or T.TensorVariable
        Bias vector.
        If this is a np.ndarray, its values are used to initialise a
        shared variable for this layer.
        If it is a T.TensorVariable, it is directly used (weight sharing
        with the layer which this variable comes from).
    gamma
        (For batch normalisation) Initializes gamma parameter.
    mean
        (For batch normalisation) Initializes mean parameter.
    std
        (For batch normalisation) Initializes std parameter.
    gradnet_mode
        If truthy, a ``gradnet_rate`` parameter initialised with 1.0 is
        registered for this layer.
    """

    def __init__(self, parent, n_f, filter_shape, pool_shape,
                 conv_mode='valid', activation_func='relu',
                 mfp=False, batch_normalisation=False, dropout_rate=0,
                 name="conv", print_repr=True, w=None, b=None, gamma=None,
                 mean=None, std=None, gradnet_mode=None):
        # Deliberately skips Perceptron.__init__ (different parameter setup)
        super(Perceptron, self).__init__(parent, name, print_repr)
        self.n_f = n_f
        self.filter_shape = filter_shape
        self.pool_shape = pool_shape
        self.conv_mode = conv_mode
        self.activation_func = activation_func
        self.batch_normalisation = batch_normalisation
        self.gradnet_mode = gradnet_mode
        self.mfp = mfp
        self.strides = parent.shape.strides
        self.mfp_offsets = parent.shape.mfp_offsets
        self.axis = parent.shape.tag2index('f')  # index of the feature axis
        self.axis_order = None
        self.spatial_axes = self.parent.shape.spatial_axes

        conv_dim = len(self.spatial_axes)
        x_dim = len(self.parent.shape)
        if len(self.spatial_axes) != len(filter_shape) or \
                len(filter_shape) != len(pool_shape):
            raise ValueError("The filter_shape dimensionality (%i), the number "
                             "of spatial dimensions in the input (%i) and "
                             "the dimensionality of pool_shape (%i) differ! "
                             "Use filtersize 1 on axes which should not be "
                             "convolved."
                             % (len(filter_shape), conv_dim, len(pool_shape)))

        n_in = parent.shape['f']  # number of input features
        fail = False
        w_sh = None
        if conv_dim == 1:
            if x_dim != 3 or self.spatial_axes != [2]:
                fail = True
            w_sh = [n_f, n_in] + list(filter_shape)
        elif conv_dim == 2:
            if x_dim != 4 or self.spatial_axes != [2, 3]:
                fail = True
            w_sh = [n_f, n_in] + list(filter_shape)
        elif conv_dim == 3:
            if x_dim != 5:
                fail = True
            if self.spatial_axes == [2, 3, 4]:
                self.axis_order = 'dnn'
                w_sh = [n_f, n_in] + list(filter_shape)
            elif self.spatial_axes == [1, 3, 4]:
                self.axis_order = 'theano'
                w_sh = [n_f, filter_shape[0], n_in] + list(filter_shape[1:])
            else:
                fail = True
        else:
            # No weight layout is defined for other dimensionalities
            fail = True

        if fail:
            raise NotImplementedError("Cannot convolve non-standard shapes / "
                                      "axis orders, implement reshaping "
                                      "before conv and re-reshaping after!")

        self.conv_dim = conv_dim
        self.w_sh = w_sh
        gradnet_rate = 1.0 if gradnet_mode else None
        self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate,
                           pool_shape, gradnet_rate)

    def _border_offset(self, f):
        """
        Return by how much convolving with a kernel of size ``f`` changes the
        length of an axis in the current ``conv_mode``.

        Raises ``ValueError`` for an unknown ``conv_mode``.
        """
        if self.conv_mode == 'valid':
            return 1 - f
        if self.conv_mode == 'full':
            return f - 1
        if self.conv_mode == 'same':
            return 0
        raise ValueError("Unknown conv_mode '%s', must be 'valid', 'full' "
                         "or 'same'." % (self.conv_mode,))

    def _make_output(self):
        """
        Computation of Theano output.

        Applies convolution, pooling (or max-fragment-pooling), optional
        batch normalisation, bias, activation and optional dropout, and
        stores the result in ``self.output``. Updates ``self.strides`` (and
        ``self.mfp_offsets`` if MFP is active) as a side effect.
        """
        input_tensor = self.parent.output
        input_shape = list(self.parent.shape)
        pattern = ['x'] * input_tensor.ndim
        pattern[self.axis] = 0

        activation_func = self.activation_func
        if activation_func.startswith("maxout"):
            r = int(activation_func.split(" ")[1])
            assert r >= 2
            # Maxout reduces the feature count, as in ``Perceptron``. The
            # former ``self.filter_shape /= r`` raised a TypeError for tuples.
            self.n_f //= r

        if activation_func == 'prelu':
            b = self.b[:, 0].dimshuffle(pattern)
            b1 = self.b[:, 1].dimshuffle(pattern)
        else:
            b = self.b.dimshuffle(pattern)
            b1 = None

        lin_output = computations.conv(input_tensor, self.w, self.axis_order,
                                       border_mode=self.conv_mode,
                                       x_shape=input_shape,
                                       w_shape=self.w_sh)

        if self.mfp:
            if self.input_nodes[0].shape['b'] != 1:
                raise ValueError("For MFP the batchsize of the raw image "
                                 "input must be 1")
            lin_output, offsets_new, strides_new = computations.fragmentpool(
                lin_output, self.pool_shape, self.mfp_offsets,
                self.strides, self.spatial_axes)
            self.mfp_offsets = offsets_new
            self.strides = strides_new
        else:
            lin_output = computations.pooling(lin_output, self.pool_shape,
                                              self.spatial_axes)
            self.strides = np.multiply(self.pool_shape, self.strides)

        if self.batch_normalisation in ['train', 'fadeout']:
            mean = computations.apply_except_axis(
                lin_output, self.axis, T.mean).dimshuffle(pattern)
            std = computations.apply_except_axis(
                lin_output, self.axis, T.std).dimshuffle(pattern) + 1e-6
            gamma = self.gamma.dimshuffle(pattern)
            if self.batch_normalisation == 'fadeout':
                logger.warning("Batch Normalisation mode 'fadeout' does not "
                               "work for less than 50%...")
                mean = self.gradnet_rate * mean
                std = self.gradnet_rate * std + (1 - self.gradnet_rate) * 1.0
                gamma = self.gradnet_rate * gamma

            # Running averages of the batch statistics
            self.mean.updates = (self.mean,
                                 0.9995 * self.mean
                                 + 0.0005 * T.extra_ops.squeeze(mean))
            self.std.updates = (self.std,
                                0.9995 * self.std
                                + 0.0005 * T.extra_ops.squeeze(std))
        elif self.batch_normalisation == 'predict':
            mean = self.mean.dimshuffle(pattern)
            std = self.std.dimshuffle(pattern)
            gamma = self.gamma.dimshuffle(pattern)
        else:
            mean = 0
            std = 1
            gamma = 1

        lin_output = (gamma / std) * lin_output + b - (gamma * mean / std)
        lin_output = computations.apply_activation(lin_output,
                                                   activation_func, b1)

        if self.dropout_rate:
            rng = T.shared_randomstreams.RandomStreams(int(time.time()))
            p = 1 - self.dropout_rate
            dropout_gate = rng.binomial(size=lin_output.shape, n=1, p=p,
                                        dtype=theano.config.floatX)
            dropout_gate *= 1.0 / p  # rescale to keep the expected value
            lin_output *= dropout_gate

        self.output = lin_output

    def _calc_shape(self):
        """
        Calculate and set self.shape.

        Raises
        ------
        ValueError
            If ``conv_mode`` is unknown or a convolved axis cannot be pooled
            by the requested factor.
        """
        sh = self.parent.shape
        axes = zip(self.spatial_axes, self.filter_shape, self.pool_shape)
        for j, (i, f, p) in enumerate(axes):
            k = self._border_offset(f)
            s = (sh[i] + k) // p
            if self.mfp:
                if (sh[i] + k - p + 1) % p != 0:
                    raise ValueError("Cannot pool spatial axis '%s' of length "
                                     "%i by factor %i, after convolving with "
                                     "kernel of size %i and using MFP."
                                     % (sh.tags[i], sh[i], p, f))
            else:
                if (sh[i] + k) % p != 0:
                    raise ValueError("Cannot pool spatial axis '%s' of length "
                                     "%i by factor %i, after convolving with "
                                     "kernel of size %i."
                                     % (sh.tags[i], sh[i], p, f))

            sh = sh.updateshape(i, s)
            # A non-positive fov is passed on as -1
            if sh.fov[j] > 0:
                fov = sh.fov[j] + (f + p - 2) * sh.strides[j]
            else:
                fov = -1
            sh = sh.updatefov(j, fov)

        if self.mfp:
            sh = sh.updatemfp_offsets(self.mfp_offsets)
            sh = sh.updateshape('b', np.prod(self.pool_shape), mode='mult')

        sh = sh.updatestrides(self.strides)
        sh = sh.updateshape('f', self.n_f)
        self.shape = sh

    def _calc_comp_cost(self):
        """
        Calculate and set self.computational_cost.

        The cost is the number of weights times the number of output
        positions of the convolution times the batch size (1 if unknown).
        """
        sh = self.parent.shape
        n_position = 1
        for i, f in zip(self.spatial_axes, self.filter_shape):
            n_position *= sh[i] + self._border_offset(f)
        b = 1 if sh['b'] is None else sh['b']
        self.computational_cost = np.prod(self.w_sh) * n_position * b

    def make_dual(self, parent, share_w=False, mfp=False, **kwargs):
        """
        Create the inverse (``UpConv``) of this ``Conv`` node.

        Most options are the same as for the layer itself.
        If ``kwargs`` are not specified, the values of the primal
        layers are re-used and new parameters are created.

        Parameters
        ----------
        parent: Node
            The input node.
        share_w: bool
            If the weights (``w``) should be shared from the primal layer.
        mfp: bool
            If max-fragment-pooling is used.
        kwargs: dict
            kwargs that are passed through to the new ``UpConv`` node (see
            signature of ``UpConv``).
            ``n_f`` and ``pool_shape`` are copied from the existing node on
            which ``make_dual`` is called.
            Every other parameter can be changed from the original
            ``Conv``'s defaults by specifying it in ``kwargs``.
            A ``conv_mode`` entry is not forwarded, because ``UpConv``
            always convolves in 'valid' mode.

        Returns
        -------
        UpConv
            The inverted conv layer (as an ``UpConv`` node).

        Raises
        ------
        ValueError
            If the feature count of ``parent`` differs from the output
            feature count of this layer's weights.
        """
        if mfp:
            parent = FragmentsToDense(parent, print_repr=False)

        if self.dropout_rate:
            dropout_rate = self.dropout_rate.get_value()
        else:
            dropout_rate = 0.0
        defaults = dict(activation_func=self.activation_func,
                        batch_normalisation=self.batch_normalisation,
                        dropout_rate=dropout_rate,
                        name=self.name + '.T',
                        print_repr=self._print_repr,
                        w=None, b=None, gamma=None, mean=None, std=None)
        defaults.update(kwargs)
        kwargs = defaults

        # UpConv.__init__ has no ``conv_mode`` parameter; forwarding it would
        # raise a TypeError.
        conv_mode = kwargs.pop('conv_mode', 'valid')
        if conv_mode != 'valid':
            logger.warning("Ignoring conv_mode='%s': UpConv always uses "
                           "'valid' mode.", conv_mode)

        if share_w:
            if kwargs['w'] is not None:
                logger.debug("Ignoring passed w because w is shared from "
                             "primal Layer.")
            # Exchange n_in and n_f
            if self.conv_dim == 3 and self.axis_order == 'theano':
                swap = (0, 2)
            else:
                swap = (0, 1)
            kwargs['w'] = T.swapaxes(self.w, *swap)

        n_f = self.parent.shape['f']  # This is the output of the dual Layer
        # Compare by value: ``is not`` on ints only works by accident for
        # small numbers.
        if self.w_sh[0] != parent.shape['f']:  # input of dual Layer
            raise ValueError("Cannot make dual Layer of:\n"
                             "%s \n"
                             "with input: %s! \n"
                             "The output shape of the input for the dual Layer "
                             "must match the input shape of the primal Layer."
                             % (self, parent))

        return UpConv(parent, n_f, self.pool_shape, **kwargs)

    def __repr__(self):
        s = super(NeuralLayer, self).__repr__()
        s += "\n"
        s += " n_f=%i, " % (self.n_f,)
        s += "%id conv, kernel=%s, pool=%s, " \
             % (self.conv_dim, self.filter_shape, self.pool_shape)
        s += "act='%s', " % (self.activation_func,)
        if self.dropout_rate:
            s += "Dropout rate=%.1f, " % (self.dropout_rate.get_value())
        if self.batch_normalisation:
            s += "BN in '%s' mode " % (self.batch_normalisation,)
        if self.mfp:
            s += "MFP active, "
        return s
###############################################################################
class FragmentsToDense(Node):
    """
    Node that turns the fragments on the batch axis of its parent into a
    dense output via ``computations.fragments2dense``.

    The resulting shape has batch size 1, every spatial axis multiplied by
    the parent's stride on that axis, strides of 1 and zero MFP offsets.

    Parameters
    ----------
    parent: Node
        The input node (its batch axis must hold the fragments).
    name: str
        Node name.
    print_repr: bool
        Whether to print the node representation upon initialisation.
    """

    def __init__(self, parent, name="to_dense", print_repr=True):
        super(FragmentsToDense, self).__init__(parent, name, print_repr)

    def _make_output(self):
        """
        Computation of Theano output.

        Raises
        ------
        ValueError
            If the batch size of the parent does not equal both the number
            of MFP offsets and the product of the strides.
        """
        fragments = self.parent.output
        sh = self.parent.shape
        if sh['b'] != len(sh.mfp_offsets) or sh['b'] != np.prod(sh.strides):
            raise ValueError("Need %i fragments on the batch axis. "
                             "Is MFP active at all?" % np.prod(sh.strides))

        self.output = computations.fragments2dense(fragments, sh.mfp_offsets,
                                                   sh.strides,
                                                   sh.spatial_axes)

    def _calc_shape(self):
        """
        Calculate and set self.shape.
        """
        sh = self.parent.shape
        for ax, st in zip(sh.spatial_axes, sh.strides):
            sh = sh.updateshape(ax, st, mode='mult')

        sh = sh.updateshape('b', 1)
        n_spatial = len(sh.spatial_axes)
        # ``np.int`` was removed from NumPy; ``np.int_`` is the same dtype
        # that the old alias for the builtin ``int`` resolved to.
        new_strides = np.ones(n_spatial, np.int_)
        new_offsets = np.zeros((1, n_spatial), np.int_)
        self.shape = TaggedShape(sh.shape, sh.tags, new_strides,
                                 new_offsets, sh.fov)

    def _calc_comp_cost(self):
        """
        Calculate and set self.computational_cost.

        For this Node type this is hard-coded to 0.
        """
        self.computational_cost = 0
###############################################################################
###############################################################################
[docs]class UpConv(Conv):
"""
Upconvolution layer.
E.g. pooling + upconv with p=3:
x x x x x x x x x before pooling (not in this layer)
\|/ \|/ \|/ pooling (not in this layer)
x x x input to this layer
0 0 x 0 0 x 0 0 x 0 0 unpooling + padding (done in this layer)
/|\ /|\ /|\ conv on unpooled (done in this layer)
y y y y y y y y y result of this layer
Parameters
----------
parent: Node
The input node.
n_f: int
Number of filters (nodes) in layer.
pool_shape: tuple
Size/shape of pooling.
activation_func: str
Activation function name.
identity_init: bool
Initialise weights to result in pixel repetition upsampling
batch_normalisation: str or None
Batch normalisation mode.
Can be False (inactive), "train" or "fadeout".
dropout_rate: float
Dropout rate (probability that a node drops out in a training step).
name: str
Layer name.
print_repr: bool
Whether to print the node representation upon initialisation.
w: np.ndarray or T.TensorVariable
Weight matrix.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
b: np.ndarray or T.TensorVariable
Bias vector.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
gamma
(For batch normalisation) Initializes gamma parameter.
mean
(For batch normalisation) Initializes mean parameter.
std
(For batch normalisation) Initializes std parameter.
gradnet_mode
""" # TODO: The explanation at the top of the docstring is interpreted by sphinx and looks weird (see http://elektronn2.readthedocs.io/en/latest/source/elektronn2.neuromancer.html#elektronn2.neuromancer.neural.UpConv).
def __init__(self, parent, n_f, pool_shape, activation_func='relu',
identity_init=True, batch_normalisation=False, dropout_rate=0,
name="upconv", print_repr=True, w=None, b=None, gamma=None,
mean=None, std=None, gradnet_mode=None):
filter_shape = pool_shape
super(UpConv, self).__init__(parent, n_f, filter_shape, pool_shape,
'valid', activation_func, mfp=False,
batch_normalisation=batch_normalisation,
dropout_rate=dropout_rate, name=name,
print_repr=print_repr, w=w, b=b,
gamma=gamma, mean=mean, std=std,
gradnet_mode=gradnet_mode)
if identity_init:
try:
w_val = self.w.get_value() * 0.1
s = np.minimum(w_val.shape[0], w_val.shape[1])
s = np.arange(s)
w_val[s,s] = 1.0
self.w.set_value(w_val)
self.b.set_value(self.b.get_value()*0.0)
except:
logger.warn("identity_init failed")
def _make_output(self):
"""
Computation of Theano output.
"""
input_tensor = self.parent.output
input_shape = list(self.parent.shape)
pattern = ['x' for i in input_tensor.shape]
pattern[self.axis] = 0
activation_func = self.activation_func
if activation_func.startswith("maxout"):
r=int(activation_func.split(" ")[1])
assert r>=2
self.filter_shape /= r
if activation_func=='prelu':
b = self.b[:,0].dimshuffle(pattern)
b1 = self.b[:,1].dimshuffle(pattern)
else:
b = self.b.dimshuffle(pattern)
b1 = None
spax = self.spatial_axes
pool = np.array(self.pool_shape)
input_shape_up = np.array(input_shape)
if len(spax)==3 and not computations.dnn_avail:
unpooled = computations.unpooling(input_tensor, self.pool_shape, self.spatial_axes)
self._debug_outputs.append(unpooled)
input_shape_up[spax] = input_shape_up[spax] * pool + pool - 1
input_shape_up = list(input_shape_up)
lin_output = computations.conv(unpooled, self.w, self.axis_order,
border_mode=self.conv_mode,
x_shape=input_shape_up, w_shape=self.w_sh)
else:
input_shape_up[spax] = input_shape_up[spax] * pool
input_shape_up = list(input_shape_up)
w = T.swapaxes(self.w, 0, 1)
w_sh = list(self.w_sh)
w_sh[0], w_sh[1] = w_sh[1], w_sh[0]
lin_output = computations.upconv(input_tensor, w, self.pool_shape,
x_shape=input_shape_up,
w_shape=w_sh,
axis_order='dnn')
if self.batch_normalisation in ['train', 'fadeout']:
mean = computations.apply_except_axis(
lin_output,self.axis, T.mean).dimshuffle(pattern)
std = computations.apply_except_axis(
lin_output,self.axis, T.std).dimshuffle(pattern) + 1e-6
gamma = self.gamma.dimshuffle(pattern)
if self.batch_normalisation=='fadeout':
logger.warning("Batch Normalisation mode 'fadeout' does not "
"work for less than 50%%...")
mean = self.gradnet_rate * mean
std = self.gradnet_rate * std + (1-self.gradnet_rate) * 1.0
gamma = self.gradnet_rate * gamma
self.mean.updates = (self.mean,
0.9995 * self.mean + 0.0005 * T.extra_ops.squeeze(mean))
self.std.updates = (self.std,
0.9995 * self.std + 0.0005 * T.extra_ops.squeeze(std))
elif self.batch_normalisation=='predict':
mean = self.mean.dimshuffle(pattern)
std = self.std.dimshuffle(pattern)
gamma = self.gamma.dimshuffle(pattern)
else:
mean = 0
std = 1
gamma = 1
lin_output = (gamma / std) * lin_output + b - (gamma * mean / std)
lin_output = computations.apply_activation(lin_output, activation_func, b1)
if self.dropout_rate:
rng = T.shared_randomstreams.RandomStreams(int(time.time()))
p = 1 - self.dropout_rate
dropout_gate = rng.binomial(size=lin_output.shape, n=1, p=p,
dtype=theano.config.floatX)
dropout_gate *= 1.0 / p
lin_output *= dropout_gate #.dimshuffle(('x', 0))
self.output = lin_output
def _calc_shape(self):
"""
Calculate and set self.shape.
"""
self.strides = np.divide(self.strides,self.pool_shape)
sh = self.parent.shape
for j,(i,f,p) in enumerate(zip(self.spatial_axes, self.filter_shape, self.pool_shape)):
s = 1 - f if self.conv_mode=='valid' else f -1
s = (sh[i] * p) + p - 1 + s # unpool with margin then apply conv
sh = sh.updateshape(i, s)
# Unpooling creates asymmetric FOV (left/right is different for
# some neurons), therefore we flag the FOV as exceptional with '-1'
sh = sh.updatefov(j, -1)
sh = sh.updateshape('f', self.n_f)
sh = sh.updatestrides(self.strides)
self.shape = sh
def _calc_comp_cost(self):
"""
Calculate and set self.computational_cost.
"""
sh = self.parent.shape
n_position = 1
for i,f,p in zip(self.spatial_axes, self.filter_shape, self.pool_shape):
s = 1 - f if self.conv_mode=='valid' else f -1
n_position *= (sh[i] * p) + s
b = 1 if sh['b'] is None else sh['b']
self.computational_cost = np.product(self.w_sh) * n_position * b
def __repr__(self):
    """
    Return a two-line description of this layer.

    The first line is whatever ``super(NeuralLayer, self).__repr__()``
    returns; the second line lists the number of filters, the upconv
    dimensionality, kernel and pool shapes, the activation function and,
    if set, the dropout rate and batch normalisation mode.
    """
    pieces = [
        super(NeuralLayer, self).__repr__(),
        "\n",
        " n_f=%i, " % (self.n_f,),
        "%id upconv, kernel=%s, pool=%s, "
        % (self.conv_dim, self.filter_shape, self.pool_shape),
        "act='%s', " % (self.activation_func,),
    ]
    if self.dropout_rate:
        pieces.append("Dropout rate=%.1f, " % (self.dropout_rate.get_value()))
    if self.batch_normalisation:
        pieces.append("BN in '%s' mode " % (self.batch_normalisation,))
    return "".join(pieces)
def make_dual(self, *args, **kwargs):
    """
    Unsupported for this layer type.

    Raises
    ------
    NotImplementedError
        Always, regardless of the arguments passed.
    """
    hint = "Use Conv instead?"
    raise NotImplementedError(hint)
class Crop(Node):
    """
    Node that cuts a margin off every spatial axis of its parent's output.

    Parameters
    ----------
    parent: Node
        The input node whose output should be cropped.
    crop: tuple or list of ints
        Per spatial axis, the number of elements removed from *each* side
        (so an axis shrinks by twice this number).
    name: str
        Node name.
    print_repr: bool
        Whether to print the node representation upon initialisation.
    """  # TODO: Write an example

    def __init__(self, parent, crop, name="crop", print_repr=False):
        super(Crop, self).__init__(parent, name, print_repr)
        self.crop = crop

    def _make_output(self):
        """
        Build the Theano output by slicing the parent's output.
        """
        # It is assumed that all other dimensions are matching
        parent_shape = self.parent.shape
        index = []
        n_cropped = 0  # position in self.crop of the next spatial axis
        for axis, length in enumerate(parent_shape):
            if axis not in parent_shape.spatial_axes:
                # Non-spatial axes are passed through untouched.
                index.append(slice(None))
                continue
            margin = self.crop[n_cropped]
            n_cropped += 1
            index.append(slice(margin, length - margin))
        self.output = self.parent.output[tuple(index)]

    def _calc_shape(self):
        """
        Set self.shape to the parent shape with every spatial axis reduced
        by twice its crop value.
        """
        parent_shape = self.parent.shape
        new_shape = parent_shape.copy()
        n_cropped = 0
        for axis, length in enumerate(parent_shape):
            if axis not in parent_shape.spatial_axes:
                continue
            margin = self.crop[n_cropped]
            n_cropped += 1
            new_shape = new_shape.updateshape(axis, length - 2 * margin)
        self.shape = new_shape

    def _calc_comp_cost(self):
        """
        Set self.computational_cost.

        Cropping is treated as free, so the cost is fixed at 0.
        """
        self.computational_cost = 0
# TODO: Maybe write a complete example config that demonstrates its usage. --> axon/mkilling/investigation/MA-TEX/CNN-Timings/DS-3-2-unet2d.py
def ImageAlign(hi_res, lo_res, hig_res_n_f,
               activation_func='relu', identity_init=True,
               batch_normalisation=False, dropout_rate=0,
               name="upconv", print_repr=True, w=None, b=None, gamma=None,
               mean=None, std=None, gradnet_mode=None):
    """
    Try to automatically align and concatenate a high-res and a low-res
    convolution output of two branches of a CNN by applying UpConv and Crop to
    make their shapes and strides compatible.
    UpConv is used if the low-res Node's strides are at least twice as large
    as the strides of the high-res Node in any dimension.

    This function can be used to simplify creation of e.g. architectures
    similar to U-Net (see https://arxiv.org/abs/1505.04597).

    If a ValueError that the shapes cannot be aligned is thrown,
    you can try changing the filter shapes and pooling factors of the
    (grand-)parent Nodes or add/remove Convolutions and Crops in the preceding
    branches until the error disappears (of course you should try to keep
    those changes as minimal as possible).

    (``UpConvMerge`` is an alias for this function.)

    Parameters
    ----------
    hi_res: Node
        Parent Node with high resolution output.
    lo_res: Node
        Parent Node with low resolution output.
    hig_res_n_f: int
        Number of filters for the aligning UpConv.
    activation_func: str
        (passed to new UpConv if required).
    identity_init: bool
        (passed to new UpConv if required).
    batch_normalisation: bool
        (passed to new UpConv if required).
    dropout_rate: float
        (passed to new UpConv if required).
    name: str
        Name of the intermediate UpConv node if required.
    print_repr: bool
        Whether to print the node representation upon initialisation
        (passed to new UpConv if required; the Crop and Concat nodes created
        here always print their representation).
    w
        (passed to new UpConv if required).
    b
        (passed to new UpConv if required).
    gamma
        (passed to new UpConv if required).
    mean
        (passed to new UpConv if required).
    std
        (passed to new UpConv if required).
    gradnet_mode
        (passed to new UpConv if required).

    Returns
    -------
    Concat
        Concat Node that merges the aligned high-res and low-res outputs.

    Raises
    ------
    ValueError
        If the spatial sizes of both branches differ by an odd number on any
        axis, so that no symmetric crop can align them.
    """
    ###TODO exchange UpConv and Crop to save computation in some cases
    sh_hi = hi_res.shape
    sh_lo = lo_res.shape
    assert len(sh_hi) == len(sh_lo)
    assert sh_hi.spatial_axes == sh_lo.spatial_axes
    unpool = sh_lo.strides // sh_hi.strides
    if np.any(unpool > 1):
        lo_res = UpConv(lo_res, hig_res_n_f, unpool,
                        activation_func=activation_func,
                        identity_init=identity_init,
                        batch_normalisation=batch_normalisation,
                        dropout_rate=dropout_rate,
                        name=name, print_repr=print_repr, w=w, b=b,
                        gamma=gamma, mean=mean, std=std,
                        gradnet_mode=gradnet_mode)

    # Now both have the same stride.
    # Shapes may have changed, so they are read again.
    sh_hi = hi_res.shape.spatial_shape
    sh_lo = lo_res.shape.spatial_shape
    crop_lo = []
    crop_hi = []
    for size_hi, size_lo in zip(sh_hi, sh_lo):
        diff = size_hi - size_lo  # difference in original space
        if diff % 2 != 0:
            # The literals are joined with an explicit space; previously the
            # message read "cannotbe aligned".
            raise ValueError("hi_res and lo_res maps cannot "
                             "be aligned with shapes:\n%s\n%s" % (sh_hi, sh_lo))
        if diff > 0:
            # The high-res map is larger: crop it symmetrically.
            crop_hi.append(diff // 2)
            crop_lo.append(0)
        else:
            # The low-res map is larger (or equal): crop it instead.
            crop_lo.append(-diff // 2)
            crop_hi.append(0)

    if np.any(crop_lo):
        lo_res = Crop(lo_res, crop_lo, print_repr=True)
    if np.any(crop_hi):
        hi_res = Crop(hi_res, crop_hi, print_repr=True)

    out = Concat((lo_res, hi_res), axis='f', name='merge', print_repr=True)
    return out


UpConvMerge = ImageAlign
class Pool(Node):
    """
    Pooling layer.

    Reduces the spatial size of its input by the factors given in
    ``pool_shape``.

    Pooling modes other than max-pooling can only be selected if cuDNN is
    available.

    Parameters
    ----------
    parent: Node
        The input node.
    pool_shape: tuple
        Tuple of pooling factors (per dimension) by which the input
        is downsampled.
    stride: tuple
        Stride sizes (per dimension). Defaults to ``pool_shape``. Must not be
        given together with ``mfp=True``.
    mfp: bool
        If max-fragment-pooling should be used.
        Note that ``_calc_shape`` currently raises ``NotImplementedError``
        for MFP as soon as there is at least one spatial axis.
    mode: str
        (only if cuDNN is available)
        Mode can be any of the modes supported by Theano's dnn_pool():
        ('max', 'average_inc_pad', 'average_exc_pad', 'sum').
    name: str
        Name of the pooling layer.
    print_repr: bool
        Whether to print the node representation upon initialisation.

    Raises
    ------
    ValueError
        If a custom stride is combined with MFP.
    NotImplementedError
        If the parent's dimensionality / spatial axis order is not one of
        the supported layouts.
    """

    def __init__(self, parent, pool_shape, stride=None, mfp=False, mode='max',
                 name="pool", print_repr=True):
        super(Pool, self).__init__(parent, name, print_repr)

        if mfp and stride is not None:
            raise ValueError("Cannot use custom stride and MFP together")
        if stride is None:
            stride = pool_shape

        self.pool_shape = pool_shape
        self.pool_stride = stride
        self.mfp = mfp
        self.mode = mode
        self.strides = parent.shape.strides
        self.mfp_offsets = parent.shape.mfp_offsets
        self.axis = parent.shape.tag2index('f')  # retrieve feature shape's index
        self.axis_order = None

        spatial_axes = self.parent.shape.spatial_axes
        conv_dim = len(pool_shape)
        x_dim = len(self.parent.shape)

        # Only the standard layouts are supported: the number of input
        # dimensions and the positions of the spatial axes must match the
        # pooling dimensionality.
        fail = False
        if conv_dim == 1:
            if x_dim != 3 or spatial_axes != [2]:
                fail = True
        elif conv_dim == 2:
            if x_dim != 4 or spatial_axes != [2, 3]:
                fail = True
        elif conv_dim == 3:
            if x_dim != 5:
                fail = True
            if spatial_axes == [2, 3, 4]:
                self.axis_order = 'dnn'
            elif spatial_axes == [1, 3, 4]:
                self.axis_order = 'theano'
            else:
                fail = True

        if fail:
            raise NotImplementedError(
                "Cannot pool non-standard shapes / axis orders, "
                "implement reshaping before pooling "
                "and re-reshaping after!")

        self.spatial_axes = spatial_axes
        self.conv_dim = conv_dim

    def _make_output(self):
        """
        Computation of Theano output.

        Also updates ``self.strides`` (and ``self.mfp_offsets`` for MFP) to
        the values that are valid after pooling.
        """
        input_tensor = self.parent.output
        if self.mfp:
            assert self.pool_stride == self.pool_shape
            if self.input_nodes[0].shape['b'] != 1:
                raise ValueError("For MFP the batchsize of the raw image input must be 1")

            lin_output, offsets_new, strides_new = computations.fragmentpool(
                input_tensor,
                self.pool_shape,
                self.mfp_offsets,
                self.strides,
                self.spatial_axes,
                mode=self.mode)
            self.mfp_offsets = offsets_new
            self.strides = strides_new
        else:
            lin_output = computations.pooling(input_tensor, self.pool_shape,
                                              self.spatial_axes,
                                              stride=self.pool_stride,
                                              mode=self.mode)
            self.strides = np.multiply(self.pool_stride, self.strides)

        self.output = lin_output

    def _calc_shape(self):
        """
        Calculate and set self.shape.

        Raises
        ------
        NotImplementedError
            If MFP is enabled (shape calculation for MFP is not verified).
        ValueError
            If a spatial axis cannot be downsampled without remainder.
        """
        sh = self.parent.shape
        for j, (i, p, st) in enumerate(zip(self.spatial_axes,
                                           self.pool_shape,
                                           self.pool_stride)):
            tmp = sh[i] - p + st - 1
            s = tmp // st + 1
            if self.mfp:
                # The MFP size check that used to follow this raise was
                # unreachable and has been dropped.
                raise NotImplementedError("Check this first before use")
            if (tmp + 1) % st != 0:
                raise ValueError("Cannot downsample spatial axis '%s' of length %i "
                                 "by factor %i with pool %i."
                                 % (sh.tags[i], sh[i], st, p))

            sh = sh.updateshape(i, s)
            if sh.fov[j] > 0:
                fov = sh.fov[j] + (p - 1) * sh.strides[j]
            else:
                # A non-positive FOV marks an exceptional FOV; keep it flagged.
                fov = -1
            sh = sh.updatefov(j, fov)

        if self.mfp:
            sh = sh.updatemfp_offsets(self.mfp_offsets)
            sh = sh.updateshape('b', np.prod(self.pool_shape), mode='mult')

        sh = sh.updatestrides(self.strides)
        self.shape = sh
class FaithlessMerge(Node):
    """
    Concatenate two inputs while randomly silencing one of them.

    For every evaluation two binary decisions are drawn (one scalar each,
    broadcast over the whole tensor). With probability ``failing_prob`` the
    merge fails; in that case exactly one of the two inputs is multiplied
    by zero before concatenation, otherwise both are passed on unchanged.

    Parameters
    ----------
    hard_features: Node
        First input (placed first in the concatenation).
    easy_features: Node
        Second input.
    axis
        Axis along which both inputs are concatenated; either an axis tag
        (str) that is looked up in the shape of ``hard_features`` or an
        integer index.
    failing_prob: float
        Probability that the merge fails, i.e. that one input is zeroed.
    hardeasy_ratio: float
        If the merge fails, the hard features are the zeroed input with
        probability ``1 - hardeasy_ratio`` and the easy features otherwise.
        NOTE(review): the previous documentation stated that a higher value
        makes the *hard* features fail more often, which is the opposite of
        what the code does - confirm which one is intended.
    name: str
        Node name.
    print_repr: bool
        Whether to print the node representation upon initialisation.
    """

    def __init__(self, hard_features, easy_features, axis='f', failing_prob=0.5,
                 hardeasy_ratio=0.8, name="faithless_merge", print_repr=True):
        parent_nodes = (hard_features, easy_features)
        super(FaithlessMerge, self).__init__(parent_nodes, name, print_repr)

        # Axis tags are resolved against the first parent's shape.
        self.axis = (parent_nodes[0].shape.tag2index(axis)
                     if isinstance(axis, str) else axis)

        # Both probabilities are stored as non-trainable parameters.
        failing_param = VariableParam(value=failing_prob,
                                      name="failing_prob",
                                      dtype=floatX,
                                      apply_train=False)
        ratio_param = VariableParam(value=hardeasy_ratio,
                                    name="hardeasy_ratio",
                                    dtype=floatX,
                                    apply_train=False)
        self.params['failing_prob'] = failing_param
        self.params['hardeasy_ratio'] = ratio_param
        self.failing_prob = failing_param
        self.hardeasy_ratio = ratio_param

    def _make_output(self):
        """
        Build the Theano output (gated concatenation of both parents).
        """
        # It is assumed that all other dimensions are matching
        rng = T.shared_randomstreams.RandomStreams(int(time.time()))
        n_dims = self.parent[0].output.ndim
        unit_size = [1, ] * n_dims
        all_axes = list(range(n_dims))

        # 1 with probability failing_prob: the merge fails this time.
        merge_fails = rng.binomial(size=unit_size, n=1, p=self.failing_prob,
                                   dtype=theano.config.floatX)
        merge_fails = T.addbroadcast(merge_fails, *all_axes)

        # 1 with probability (1 - hardeasy_ratio): the hard input is the
        # one that is dropped if the merge fails.
        drop_hard = rng.binomial(size=unit_size, n=1, p=1 - self.hardeasy_ratio,
                                 dtype=theano.config.floatX)
        drop_hard = T.addbroadcast(drop_hard, *all_axes)

        hard_gate = 1 - drop_hard * merge_fails
        easy_gate = 1 - (1 - drop_hard) * merge_fails
        gated_hard = self.parent[0].output * hard_gate
        gated_easy = self.parent[1].output * easy_gate
        self.output = T.concatenate([gated_hard, gated_easy], axis=self.axis)

    def _calc_shape(self):
        """
        Set self.shape: the first parent's shape with the merge axis
        replaced by the summed sizes of all parents along that axis.
        """
        merged_size = 0
        for node in self.parent:
            merged_size = merged_size + node.shape[self.axis]
        # assuming all other dimensions are equal
        self.shape = self.parent[0].shape.updateshape(self.axis, merged_size)

    def _calc_comp_cost(self):
        """
        Set self.computational_cost.

        The merge is treated as free, so the cost is fixed at 0.
        """
        self.computational_cost = 0
class GRU(NeuralLayer):
    """
    Gated Recurrent Unit Layer.

    Parameters
    ----------
    parent: Node
        The input node. Must not have an 'r' axis.
    memory_state: Node
        Memory node. Its number of features must equal ``n_f``.
    n_f: int
        Number of features.
    activation_func: str
        Activation function name (used for the candidate state; the gates
        always use 'sig').
    flatten: bool
        (Unsupported). ``True`` raises ``NotImplementedError``.
    batch_normalisation: str or None
        (Unsupported). The modes "train", "fadeout" and "predict" raise
        ``NotImplementedError`` when the output is built; any other value
        leaves the pre-activations unnormalised.
    dropout_rate: float
        (Unsupported). A dropout rate that evaluates as true raises
        ``NotImplementedError`` when the output is built.
    name: str
        Layer name.
    print_repr: bool
        Whether to print the node representation upon initialisation.
    w: np.ndarray or T.TensorVariable
        (Unsupported). Anything but ``None`` raises ``NotImplementedError``.
    b: np.ndarray or T.TensorVariable
        (Unsupported). Anything but ``None`` raises ``NotImplementedError``.
    gamma
        (For batch normalisation) Initializes gamma parameter.
    mean
        (For batch normalisation) Initializes mean parameter.
    std
        (For batch normalisation) Initializes std parameter.
    gradnet_mode
        Stored as ``self.gradnet_mode``; not used by this class itself.
    """

    def __init__(self, parent, memory_state, n_f, activation_func='tanh',
                 flatten=False, batch_normalisation=False, dropout_rate=0,
                 name="gru", print_repr=True, w=None, b=None,
                 gamma=None, mean=None, std=None, gradnet_mode=None):
        parent_nodes = (parent, memory_state)
        super(GRU, self).__init__(parent_nodes, name, print_repr)

        self.n_f = n_f
        self.n_f_memory = memory_state.shape['f']
        self.activation_func = activation_func
        self.batch_normalisation = batch_normalisation
        self.gradnet_mode = gradnet_mode
        self.axis = parent.shape.tag2index('f')  # retrieve feature shape's index
        self.spatial_axes = parent.shape.spatial_axes
        self.flatten = flatten

        if flatten:
            raise NotImplementedError("Flatten is not yet supported for GRU.")
        n_in = parent.shape['f']

        if self.n_f_memory != n_f:
            raise ValueError("n_f_memory != n_f not possible")
        if parent.shape.hastag('r'):
            raise ValueError("Input must not have 'r' axis")

        n_comb = self.n_f_memory + n_in
        # Identity checks: "!= None" on a numpy array compares element-wise
        # and has no unambiguous truth value.
        if w is not None or b is not None:
            raise NotImplementedError("Initial weights are not yet supported for GRU.")

        w_sh = (n_comb, 3 * n_f)  # [h_t-1, x] x [W_z/x, W_r/x, W_h/x]
        self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate)

    def _make_output(self):
        """
        Computation of Theano output.

        Computes the update gate ``z`` and reset gate ``r`` from the
        concatenation [memory, input], the candidate state from
        [r * memory, input], and sets
        ``self.output = (1 - z) * memory + z * candidate``.

        Raises
        ------
        NotImplementedError
            If batch normalisation or dropout is requested.
        """
        parent = self.parent[0].output
        memory = self.parent[1].output

        # dimshuffle pattern that broadcasts a per-feature vector over all
        # other axes.
        pattern = ['x' for i in parent.shape]
        pattern[self.axis] = 0

        # Multiply the memory with ones of the input's shape so that it is
        # expanded to the same (non-feature) extent as the input.
        broad_caster_shape = list(parent.shape)
        broad_caster_shape[self.axis] = self.n_f_memory
        broad_caster = T.ones(broad_caster_shape, dtype=memory.dtype)
        memory = memory * broad_caster
        input_tensor = T.concatenate([memory, parent], axis=self.axis)

        activation_func = self.activation_func
        if activation_func.startswith("maxout"):
            r = int(activation_func.split(" ")[1])
            assert r >= 2
            # Floor division: with true division n_f would become a float
            # and could no longer be used as a slice bound below.
            self.n_f //= r

        # The last n_f bias entries belong to the candidate state, the rest
        # to the two gates.
        if activation_func == 'prelu':
            b = self.b[:-self.n_f, 0].dimshuffle(pattern)
            b_h = self.b[-self.n_f:, 0].dimshuffle(pattern)
            b1 = self.b[:-self.n_f, 1].dimshuffle(pattern)
            b1_h = self.b[-self.n_f:, 1].dimshuffle(pattern)
        else:
            b = self.b[:-self.n_f].dimshuffle(pattern)
            b_h = self.b[-self.n_f:].dimshuffle(pattern)
            b1 = None
            b1_h = None

        lin_output = computations.dot(input_tensor, self.w[:, :-self.n_f], self.axis)

        if self.batch_normalisation in ['train', 'fadeout', 'predict']:
            raise NotImplementedError("Batch normalisation not yet supported for GRU.")
        # Without batch normalisation the affine normalisation below is the
        # identity plus bias.
        mean = 0
        std = 1
        gamma = 1

        lin_output = (gamma / std) * lin_output + b - (gamma * mean / std)
        act = computations.apply_activation(lin_output, 'sig', b1)

        # Split the gate activations along the feature axis into z and r.
        slice_obj = [slice(None) for i in range(act.ndim)]
        slice_obj[self.axis] = slice(0, self.n_f)
        z = act[tuple(slice_obj)]
        slice_obj[self.axis] = slice(self.n_f, None)
        r = act[tuple(slice_obj)]

        gated_input = T.concatenate([r * memory, parent], axis=self.axis)
        h_tilde = computations.dot(gated_input, self.w[:, -self.n_f:], self.axis)
        h_tilde = (gamma / std) * h_tilde + b_h - (gamma * mean / std)
        h_tilde = computations.apply_activation(h_tilde, activation_func, b1_h)
        act = (1 - z) * memory + z * h_tilde
        self._debug_outputs = [memory, act, z, r, ]

        if self.dropout_rate:
            raise NotImplementedError("Dropout not yet supported for GRU.")

        self.output = act

    def _calc_shape(self):
        """
        Calculate and set self.shape.
        """
        sh = self.parent[0].shape
        if self.flatten:
            self.shape = TaggedShape((sh['b'], self.n_f), 'b,f')
        else:
            self.shape = sh.updateshape('f', self.n_f)

    def _calc_comp_cost(self):
        """
        Calculate and set self.computational_cost.
        """
        n = self.parent[0].shape.stripnone_prod
        self.computational_cost = 3 * n * self.n_f
class LSTM(NeuralLayer):
    """
    Long short term memory layer.

    Using an implementation without peepholes in f, i, o, i.e. the
    cell state is not taken into account for the gate weights. See
    http://colah.github.io/posts/2015-08-Understanding-LSTMs/.

    The output concatenates the new feed-back and the new cell state along
    the feature axis, so it has ``2 * n_f`` features.

    Parameters
    ----------
    parent: Node
        The input node. Must not have an 'r' axis.
    memory_states: Node
        Concatenated (initial) feed-back and cell state (one Node!).
        Must have ``2 * n_f`` features.
    n_f: int
        Number of features.
    activation_func: str
        Activation function name (used for the candidate and the cell
        output; the gates always use 'sig').
    flatten
        (Unsupported). ``True`` raises ``NotImplementedError``.
    batch_normalisation: str or None
        (Unsupported). The modes "train", "fadeout" and "predict" raise
        ``NotImplementedError`` when the output is built; any other value
        leaves the pre-activations unnormalised.
    dropout_rate: float
        (Unsupported). A dropout rate that evaluates as true raises
        ``NotImplementedError`` when the output is built.
    name: str
        Layer name.
    print_repr: bool
        Whether to print the node representation upon initialisation.
    w: np.ndarray or T.TensorVariable
        (Unsupported). Anything but ``None`` raises ``NotImplementedError``.
    b: np.ndarray or T.TensorVariable
        (Unsupported). Anything but ``None`` raises ``NotImplementedError``.
    gamma
        (For batch normalisation) Initializes gamma parameter.
    mean
        (For batch normalisation) Initializes mean parameter.
    std
        (For batch normalisation) Initializes std parameter.
    gradnet_mode
        Stored as ``self.gradnet_mode``; not used by this class itself.
    """

    def __init__(self, parent, memory_states, n_f, activation_func='tanh',
                 flatten=False, batch_normalisation=False, dropout_rate=0,
                 name="lstm", print_repr=True, w=None, b=None,
                 gamma=None, mean=None, std=None, gradnet_mode=None):
        parent_nodes = (parent, memory_states)
        super(LSTM, self).__init__(parent_nodes, name, print_repr)

        self.n_f = n_f
        self.n_f_memory = memory_states.shape['f']
        self.activation_func = activation_func
        self.batch_normalisation = batch_normalisation
        self.gradnet_mode = gradnet_mode
        self.axis = parent.shape.tag2index('f')  # retrieve feature shape's index
        self.spatial_axes = parent.shape.spatial_axes
        self.flatten = flatten

        if flatten:
            raise NotImplementedError("Flatten is not yet supported for LSTM.")
        n_in = parent.shape['f']

        n_comb = n_f + n_in
        # Identity checks: "!= None" on a numpy array compares element-wise
        # and has no unambiguous truth value.
        if w is not None or b is not None:
            raise NotImplementedError("Initial weights are not yet supported for LSTM.")
        if self.n_f_memory != 2 * n_f:
            raise ValueError("n_f of memory_states must be 2*n_f!")
        if parent.shape.hastag('r'):
            raise ValueError("Input must not have 'r' axis")

        w_sh = (n_comb, 4 * n_f)  # f, i, o, C
        self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate)

    def _make_output(self):
        """
        Computation of Theano output.

        Splits the memory into feed-back and cell state, computes the
        forget, input and output gates and the candidate from
        [feed-back, input], and sets ``self.output`` to the concatenation
        of the new feed-back and the new cell state.

        Raises
        ------
        NotImplementedError
            If batch normalisation or dropout is requested.
        """
        parent = self.parent[0].output
        memory = self.parent[1].output

        # Multiply the memory with ones of the input's shape so that it is
        # expanded to the same (non-feature) extent as the input.
        broad_caster_shape = list(parent.shape)
        broad_caster_shape[self.axis] = self.n_f_memory
        broad_caster = T.ones(broad_caster_shape, dtype=memory.dtype)
        memory = memory * broad_caster

        # First n_f memory features: feed-back; remaining ones: cell state.
        memory_f_axis = self.parent[1].shape.tag2index('f')
        slice_obj = [slice(None) for i in range(len(self.parent[1].shape))]
        slice_obj[memory_f_axis] = slice(self.n_f)
        feed_back = memory[tuple(slice_obj)]
        slice_obj[memory_f_axis] = slice(self.n_f, None)
        cell_state = memory[tuple(slice_obj)]

        input_tensor = T.concatenate([feed_back, parent],
                                     axis=self.axis)  # h, x
        # dimshuffle pattern that broadcasts a per-feature vector over all
        # other axes.
        pattern = ['x' for i in input_tensor.shape]
        pattern[self.axis] = 0

        activation_func = self.activation_func
        if activation_func.startswith("maxout"):
            r = int(activation_func.split(" ")[1])
            assert r >= 2
            # Floor division: with true division n_f would become a float
            # and could no longer be used as a slice bound below.
            self.n_f //= r

        if activation_func == 'prelu':
            b = self.b[:, 0].dimshuffle(pattern)
            b1 = self.b[:, 1]
            b1_f = b1[:self.n_f].dimshuffle(pattern)
            b1_i = b1[self.n_f:2 * self.n_f].dimshuffle(pattern)
            b1_o = b1[2 * self.n_f:3 * self.n_f].dimshuffle(pattern)
            b1_c = b1[3 * self.n_f:].dimshuffle(pattern)
        else:
            b = self.b.dimshuffle(pattern)
            b1_f = None
            b1_i = None
            b1_o = None
            b1_c = None

        lin_output = computations.dot(input_tensor, self.w, self.axis)

        if self.batch_normalisation in ['train', 'fadeout', 'predict']:
            raise NotImplementedError("Batch normalisation not yet supported for LSTM.")
        # Without batch normalisation the affine normalisation below is the
        # identity plus bias.
        mean = 0
        std = 1
        gamma = 1

        lin_output = (gamma / std) * lin_output + b - (gamma * mean / std)

        # The pre-activations hold four blocks of n_f features: f, i, o, C.
        slice_obj = [slice(None) for i in range(lin_output.ndim)]
        slice_obj[self.axis] = slice(self.n_f)
        f = computations.apply_activation(lin_output[tuple(slice_obj)], 'sig', b1_f)
        slice_obj[self.axis] = slice(self.n_f, 2 * self.n_f)
        i = computations.apply_activation(lin_output[tuple(slice_obj)], 'sig', b1_i)
        slice_obj[self.axis] = slice(2 * self.n_f, 3 * self.n_f)
        o = computations.apply_activation(lin_output[tuple(slice_obj)], 'sig', b1_o)
        slice_obj[self.axis] = slice(3 * self.n_f, 4 * self.n_f)
        c_tilde = computations.apply_activation(lin_output[tuple(slice_obj)],
                                                activation_func, b1_c)

        cell_out = f * cell_state + i * c_tilde
        lin_output = o * computations.apply_activation(cell_out, activation_func, None)

        if self.dropout_rate:
            raise NotImplementedError("Dropout not yet supported for LSTM.")

        self.output = T.concatenate([lin_output, cell_out], axis=self.axis)

    def _calc_shape(self):
        """
        Calculate and set self.shape.
        """
        sh = self.parent[0].shape
        if self.flatten:
            self.shape = TaggedShape((sh['b'], 2 * self.n_f), 'b,f')
        else:
            self.shape = sh.updateshape('f', 2 * self.n_f)

    def _calc_comp_cost(self):
        """
        Calculate and set self.computational_cost.
        """
        n = self.parent[0].shape.stripnone_prod
        self.computational_cost = 4 * n * self.n_f
[docs]class LRN(Node):
"""
LRN (Local Response Normalization) layer.
Parameters
----------
parent: Node
The input node.
filter_shape: tuple
mode: str
Can be "spatial" or "channel".
alpha: float
k: float
beta: float
name: str
Node name.
print_repr: bool
Whether to print the node representation upon initialisation.
"""
def __init__(self, parent, filter_shape, mode='spatial', alpha=0.0001,
             k=1, beta=0.75, name="LRN", print_repr=True):
    """
    Set up the normalisation node.

    In 'spatial' mode a constant averaging filter is created that maps
    every feature to itself with weight ``1 / prod(filter_shape)``.
    In 'channel' mode ``filter_shape`` must be an odd int.

    Raises
    ------
    ValueError
        If ``filter_shape`` does not have one entry per spatial axis
        ('spatial' mode) or if ``mode`` is unknown.
    NotImplementedError
        If the parent's dimensionality / spatial axis order is not one of
        the supported layouts.
    """
    super(LRN, self).__init__(parent, name, print_repr)
    self.mode = mode
    self.filter_shape = filter_shape
    self.axis = parent.shape.tag2index('f')  # retrieve feature shape's index

    if mode == 'spatial':
        self.axis_order = None
        self.spatial_axes = self.parent.shape.spatial_axes
        conv_dim = len(self.spatial_axes)
        x_dim = len(self.parent.shape)
        if len(self.spatial_axes) != len(filter_shape):
            raise ValueError("The filter_shape dimensionality (%i) and the number "
                             "of spatial dimensions in the input (%i) differ! "
                             "Use filtersize 1 on axes which should not be "
                             "averaged."
                             % (len(filter_shape), conv_dim, ))

        n_in = parent.shape['f']  # retrieve feature shape
        fail = False
        w_sh = None
        # Index of the input-feature axis within w_sh (output features are
        # always on axis 0).
        in_axis = 1
        if conv_dim == 1:
            if x_dim != 3 or self.spatial_axes != [2]:
                fail = True
            w_sh = [n_in, n_in] + list(filter_shape)
        elif conv_dim == 2:
            if x_dim != 4 or self.spatial_axes != [2, 3]:
                fail = True
            w_sh = [n_in, n_in] + list(filter_shape)
        elif conv_dim == 3:
            if x_dim != 5:
                fail = True
            if self.spatial_axes == [2, 3, 4]:
                self.axis_order = 'dnn'
                w_sh = [n_in, n_in] + list(filter_shape)
            elif self.spatial_axes == [1, 3, 4]:
                self.axis_order = 'theano'
                w_sh = [n_in, filter_shape[0], n_in] + list(filter_shape[1:])
                in_axis = 2
            else:
                fail = True
        else:
            # Without this branch w_sh stayed unbound and an
            # UnboundLocalError was raised further down.
            fail = True

        if fail:
            raise NotImplementedError("Cannot convolve non-standard shapes / axis orders, "
                                      "implement reshaping before conv "
                                      "and re-reshaping after!")

        self.conv_dim = conv_dim
        self.w_sh = w_sh
        value = np.zeros(w_sh, dtype=floatX)
        # np.prod instead of np.product (deprecated alias, removed in
        # NumPy 2.0).
        val = 1.0 / np.prod(filter_shape)
        for i in range(n_in):
            if in_axis == 2:
                # 'theano' order: axis 1 is the first filter axis, so the
                # feature-to-itself entries are at [i, :, i].
                value[i, :, i] = val
            else:
                value[i, i] = val
        self.average_filter = ConstantParam(value, '<%s_filter%s>' % (self.name, tuple(w_sh)))
        self.params['average_filter'] = self.average_filter

    elif mode == 'channel':
        assert isinstance(filter_shape, int)
        assert filter_shape % 2 == 1
    else:
        raise ValueError("Unknown mode %s" % mode)

    self.alpha = VariableParam(value=alpha, name="alpha",
                               dtype=floatX, apply_train=False)
    self.beta = VariableParam(value=beta, name="beta",
                              dtype=floatX, apply_train=False)
    self.k = VariableParam(value=k, name="k",
                           dtype=floatX, apply_train=False)
    self.params['alpha'] = self.alpha
    self.params['beta'] = self.beta
    self.params['k'] = self.k
def _make_output(self):
    """
    Build the Theano output: the input divided by
    ``(k + alpha * mean_square) ** beta``.

    ``mean_square`` is the local mean of the squared input, taken over a
    spatial neighbourhood ('spatial' mode, via the constant averaging
    filter) or over neighbouring features (any other mode).
    """
    input_tensor = self.parent.output
    input_shape = list(self.parent.shape)

    def along_axis(start, stop):
        # Index selecting [start:stop] on the feature axis, all else full.
        index = [slice(None)] * input_tensor.ndim
        index[self.axis] = slice(start, stop)
        return index

    if self.mode == 'spatial':
        mean_square = computations.conv(T.square(input_tensor), self.average_filter,
                                        self.axis_order, border_mode='same',
                                        x_shape=input_shape, w_shape=self.w_sh)
    else:
        n_f = input_shape[self.axis]
        in_square = T.square(input_tensor)
        half_n = self.filter_shape // 2

        # Squared input, extended by half_n entries on both ends of the
        # feature axis.
        padded_shape = list(input_tensor.shape)
        padded_shape[self.axis] += 2 * half_n
        in_square_ext = T.zeros(padded_shape, floatX)
        in_square_ext = T.set_subtensor(
            in_square_ext[along_axis(half_n, half_n + n_f)], in_square)
        # pad left with the first feature
        in_square_ext = T.set_subtensor(
            in_square_ext[along_axis(0, half_n)],
            in_square[along_axis(0, 1)])
        # pad right with the last feature
        in_square_ext = T.set_subtensor(
            in_square_ext[along_axis(half_n + n_f, 2 * half_n + n_f)],
            in_square[along_axis(n_f - 1, n_f)])

        # Sum of the shifted windows, then divide by the window length.
        mean_square = 0
        for offset in range(self.filter_shape):
            mean_square += in_square_ext[along_axis(offset, offset + n_f)]
        mean_square /= self.filter_shape

    divisor = T.power(self.k + self.alpha * mean_square, self.beta)
    self.output = input_tensor / divisor
    self._debug_outputs = [mean_square, divisor]