# Source code for tltorch._factorized_linear

"""Tensor Regression Layers
"""

# Author: Jean Kossaifi
# License: BSD 3 clause

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

import tensorly as tl
tl.set_backend('pytorch')
from tensorly import tenalg
from tensorly.random import random_tucker, random_cp, random_tt, random_tt_matrix
from tensorly.decomposition import parafac, tucker, tensor_train, tensor_train_matrix
from tensorly import random
from tensorly import testing
from tensorly import (validate_tt_rank, validate_cp_rank, 
                      validate_tucker_rank, validate_tt_matrix_rank)

from .base import TensorModule
from . import init


class BaseFactorizedLinear(TensorModule):
    """Base class for tensorized fully-connected layers.

    The weight matrix is reshaped ("tensorized") to a tensor of size
    `tensorized_shape`, which is then expressed in a low-rank tensor format
    by the concrete subclasses. At inference time, the full tensor is
    reconstructed and folded back into a matrix for a regular linear layer.

    Parameters
    ----------
    in_features : int
    out_features : int
    tensorized_shape : int tuple
    rank : int tuple or str
    bias : bool, default is True
    """
    def __init__(self, in_features, out_features, tensorized_shape, rank, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.tensorized_shape = tensorized_shape
        self.weight_shape = (out_features, in_features)
        self.rank = rank
        if not bias:
            self.register_parameter('bias', None)
        else:
            self.bias = nn.Parameter(torch.Tensor(out_features))

    @classmethod
    def from_linear(cls, linear, tensorized_shape, rank, bias=True):
        """Create a factorized layer from an existing ``torch.nn.Linear``.

        Parameters
        ----------
        linear : torch.nn.Linear
            layer to tensorize
        tensorized_shape : tuple
            shape to tensorize the weight matrix to.
            Must verify np.prod(tensorized_shape) == np.prod(linear.weight.shape)
        rank : {rank of the decomposition, 'same', float}
            if float, percentage of parameters of the original weights to use
            if 'same' use the same number of parameters
        bias : bool, default is True
        """
        out_features, in_features = linear.weight.shape
        factorized = cls(in_features, out_features,
                         tensorized_shape=tensorized_shape, rank=rank, bias=bias)
        factorized.init_from_tensor(linear.weight, linear.bias)
        return factorized

    def __repr__(self):
        header = f'Factorized {self.__class__.__name__} with {self.in_features} inputs and {self.out_features} outputs.'
        detail = f'  Weight ({self.out_features}, {self.in_features}) tensorized to {self.tensorized_shape} with rank {self.rank}.'
        return f'{header}\n{detail}'

    def __getattr__(self, name):
        """Hack for PyTorch to be able to use the full reconstructed weight in attention layers

        Simply defining a property `weight` is not sufficient as PyTorch will first look
        in self._parameters
        """
        if name != 'weight':
            return super().__getattr__(name)
        # 'weight' is not a stored parameter here: reconstruct it on the fly.
        return self.full_weight

    
class TuckerLinear(BaseFactorizedLinear):
    """Tensorized Fully-Connected Layers

    The weight matrix is tensorized to a tensor of size `tensorized_shape`.
    That tensor is expressed as a low-rank (Tucker) tensor.
    During inference, the full tensor is reconstructed, and unfolded back into a matrix,
    used for the forward pass in a regular linear layer.

    Parameters
    ----------
    in_features : int
    out_features : int
    tensorized_shape : int tuple
    rank : int tuple or str
    bias : bool, default is True

    See also
    --------
    TTLinear
    CPLinear
    """
    def __init__(self, in_features, out_features, tensorized_shape, rank, bias=True):
        super().__init__(in_features, out_features, tensorized_shape, rank, bias=bias)

        self.rank = validate_tucker_rank(tensorized_shape, rank=rank)
        # Tucker format: one core of shape `rank` plus one factor per mode.
        self.core = nn.Parameter(torch.Tensor(*self.rank))
        self.factors = nn.ParameterList(nn.Parameter(torch.Tensor(s, r))
                                        for (s, r) in zip(tensorized_shape, self.rank))
        # Keyword form for consistency with the other factorized linear layers.
        self.init_from_random(decompose_full_weight=False)

    def forward(self, input):
        """Inference using the tensorized and factorized weight matrix"""
        weight = tl.tucker_to_tensor(self._process_decomposition()).reshape(self.weight_shape)
        return F.linear(input, weight, self.bias)

    def init_from_random(self, decompose_full_weight=True):
        """Initializes the factorization randomly

        Parameters
        ----------
        decompose_full_weight : bool, default is True
            if True, a full weight is created and decomposed to initialize the factors
            otherwise, the factors of the decomposition are directly initialized
        """
        if decompose_full_weight:
            full_weight = torch.normal(0.0, 0.02, size=self.weight_shape)
            self.init_from_tensor(full_weight)
        else:
            init.tucker_init(self.core, self.factors)
        if self.bias is not None:
            self.bias.data.zero_()

    def init_from_decomposition(self, tucker_tensor, bias=None):
        """Initializes the factorization from the given decomposition

        Parameters
        ----------
        tucker_tensor : (core, factors)
            values to initialize the decomposition parametrizing the layer to
        bias : torch.Tensor or None, default is None
        """
        core, factors = tucker_tensor
        with torch.no_grad():
            for i, f in enumerate(factors):
                self.factors[i].data = f
            self.core.data = core
            if self.bias is not None and bias is not None:
                self.bias.data = bias

    def init_from_tensor(self, tensor, bias=None, decomposition_kwargs=None):
        """Initializes the layer by decomposing a full tensor

        Parameters
        ----------
        tensor : torch.Tensor
            must be either a matrix or a tensor
            must verify ``np.prod(tensor.shape) == np.prod(self.tensorized_shape)``
        bias : torch.Tensor or None, default is None
        decomposition_kwargs : dict or None, default is None
            dictionary of parameters passed directly to TensorLy for the decomposition;
            if None, defaults to ``dict(init='random')``
        """
        # Bug fix: `decomposition_kwargs` was previously accepted but ignored
        # (the call hard-coded init='random'). Now it is forwarded to `tucker`.
        if decomposition_kwargs is None:
            decomposition_kwargs = dict(init='random')
        with torch.no_grad():
            tensor = tensor.reshape(self.tensorized_shape)
            tucker_tensor = tucker(tensor, rank=self.rank, **decomposition_kwargs)
            self.init_from_decomposition(tucker_tensor, bias=bias)

    @property
    def full_weight(self):
        """Returns the reconstructed matrix weight of the linear layer
        """
        return tl.reshape(tl.tucker_to_tensor((self.core, self.factors)), self.weight_shape)

    def get_decomposition(self):
        """Returns the decomposition parametrizing the layer
        """
        return self.core, self.factors
class CPLinear(BaseFactorizedLinear):
    """Tensorized Fully-Connected Layers

    The weight matrix is tensorized to a tensor of size `tensorized_shape`.
    That tensor is expressed as a low-rank (CP) tensor.
    During inference, the full tensor is reconstructed, and unfolded back into a matrix,
    used for the forward pass in a regular linear layer.

    Parameters
    ----------
    in_features : int
    out_features : int
    tensorized_shape : int tuple
    rank : int tuple or str
    bias : bool, default is True

    See also
    --------
    TTLinear
    TuckerLinear
    """
    def __init__(self, in_features, out_features, tensorized_shape, rank, bias=True):
        super().__init__(in_features, out_features, tensorized_shape, rank, bias=bias)

        self.rank = validate_cp_rank(tensorized_shape, rank=rank)
        # CP format: a vector of `rank` component weights plus one factor per mode.
        self.weights = nn.Parameter(torch.Tensor(self.rank))
        self.factors = nn.ParameterList(nn.Parameter(torch.Tensor(s, self.rank))
                                        for s in tensorized_shape)
        self.init_from_random(decompose_full_weight=False)

    def forward(self, input):
        """Inference using the tensorized and factorized weight matrix"""
        weight = tl.cp_to_tensor(self._process_decomposition()).reshape(self.weight_shape)
        return F.linear(input, weight, self.bias)

    def init_from_random(self, decompose_full_weight=True):
        """Initializes the factorization randomly

        Parameters
        ----------
        decompose_full_weight : bool, default is True
            if True, a full weight is created and decomposed to initialize the factors
            otherwise, the factors of the decomposition are directly initialized
        """
        if decompose_full_weight:
            full_weight = torch.normal(0.0, 0.02, size=self.weight_shape)
            self.init_from_tensor(full_weight)
        else:
            init.cp_init(self.weights, self.factors)
        if self.bias is not None:
            self.bias.data.zero_()

    def init_from_decomposition(self, cp_tensor, bias=None):
        """Initializes the factorization from the given decomposition

        Parameters
        ----------
        cp_tensor : (weights, factors)
            values to initialize the decomposition parametrizing the layer to
        bias : torch.Tensor or None, default is None
        """
        weights, factors = cp_tensor
        with torch.no_grad():
            for i, f in enumerate(factors):
                self.factors[i].data = f
            self.weights.data = weights
            if self.bias is not None and bias is not None:
                self.bias.data = bias

    def init_from_tensor(self, tensor, bias=None, decomposition_kwargs=None):
        """Initializes the layer by decomposing a full tensor

        Parameters
        ----------
        tensor : torch.Tensor
            must be either a matrix or a tensor
            must verify ``np.prod(tensor.shape) == np.prod(self.tensorized_shape)``
        bias : torch.Tensor or None, default is None
        decomposition_kwargs : dict or None, default is None
            dictionary of parameters passed directly to TensorLy for the decomposition;
            if None, defaults to ``dict(init='random')``
        """
        # Fix: replaced the mutable default argument `dict(init='random')`
        # with a None sentinel, and removed a stray debug print of the shape.
        if decomposition_kwargs is None:
            decomposition_kwargs = dict(init='random')
        with torch.no_grad():
            tensor = tensor.reshape(self.tensorized_shape)
            cp_tensor = parafac(tensor, rank=self.rank, **decomposition_kwargs)
            self.init_from_decomposition(cp_tensor, bias=bias)

    def get_decomposition(self):
        """Returns the decomposition parametrizing the layer
        """
        return (self.weights, self.factors)

    @property
    def full_weight(self):
        """Returns the reconstructed matrix weight of the linear layer
        """
        return tl.reshape(tl.cp_to_tensor((self.weights, self.factors)), self.weight_shape)
class TTLinear(BaseFactorizedLinear):
    """Tensorized Fully-Connected Layers

    The weight matrix is tensorized to a tensor of size `tensorized_shape`.
    That tensor is expressed as a low-rank (TT) tensor.
    During inference, the full tensor is reconstructed, and unfolded back into a matrix,
    used for the forward pass in a regular linear layer.

    Parameters
    ----------
    in_features : int
    out_features : int
    tensorized_shape : int tuple
    rank : int tuple or str
    bias : bool, default is True

    See also
    --------
    TuckerLinear
    CPLinear
    TTMLinear

    Notes
    -----
    This is very similar to [1]_ except that the weight matrix is simply
    **reshaped** into a tensor, while in [1]_, the dimensions are then also
    permuted in order to jointly compress input and outputs.
    The original [1]_ is implemented in :func:`tltorch.TTMLinear`.

    References
    ----------
    .. [1] Tensorizing Neural Networks, Alexander Novikov, Dmitry Podoprikhin,
           Anton Osokin, Dmitry Vetrov
    """
    def __init__(self, in_features, out_features, tensorized_shape, rank, bias=True):
        super().__init__(in_features, out_features, tensorized_shape, rank, bias=bias)

        self.rank = validate_tt_rank(tensorized_shape, rank=rank)
        # TT format: one third-order core per mode, shape (rank[i], s, rank[i+1]).
        self.factors = nn.ParameterList()
        for i, s in enumerate(self.tensorized_shape):
            self.factors.append(nn.Parameter(torch.Tensor(self.rank[i], s, self.rank[i + 1])))

        # The rank validation and factor allocation above are decomposition-specific,
        # which is why initialization happens here rather than in the base class.
        self.init_from_random(decompose_full_weight=False)

    def forward(self, input):
        """Inference using the tensorized and factorized weight matrix"""
        weight = tl.tt_to_tensor(self._process_decomposition()).reshape(self.weight_shape)
        return F.linear(input, weight, self.bias)

    def init_from_random(self, decompose_full_weight=True):
        """Initializes the factorization randomly

        Parameters
        ----------
        decompose_full_weight : bool, default is True
            if True, a full weight is created and decomposed to initialize the factors
            otherwise, the factors of the decomposition are directly initialized
        """
        if decompose_full_weight:
            full_weight = torch.normal(0.0, 0.02, size=self.tensorized_shape)
            self.init_from_tensor(full_weight)
        else:
            init.tt_init(self.factors)
        if self.bias is not None:
            self.bias.data.zero_()

    def init_from_decomposition(self, tt_tensor, bias=None):
        """Initializes the factorization from the given decomposition

        Parameters
        ----------
        tt_tensor : (factors)
            values to initialize the decomposition parametrizing the layer to
        bias : torch.Tensor or None, default is None
        """
        factors = tt_tensor
        # Fix: assignment is now wrapped in no_grad, consistent with the
        # Tucker and CP variants of this method.
        with torch.no_grad():
            for i, factor in enumerate(factors):
                self.factors[i].data = factor
            if self.bias is not None and bias is not None:
                self.bias.data = bias

    def init_from_tensor(self, tensor, bias=None, decomposition_kwargs=None):
        """Initializes the layer by decomposing a full tensor

        Parameters
        ----------
        tensor : torch.Tensor
            must be either a matrix or a tensor
            must verify ``np.prod(tensor.shape) == np.prod(self.tensorized_shape)``
        bias : torch.Tensor or None, default is None
        decomposition_kwargs : dict or None, default is None
            dictionary of parameters passed directly to TensorLy for the decomposition;
            if None, no extra parameters are passed
        """
        # Fix: replaced the mutable default argument `dict()` with a None sentinel.
        if decomposition_kwargs is None:
            decomposition_kwargs = dict()
        with torch.no_grad():
            tensor = tensor.reshape(self.tensorized_shape)
            tt_tensor = tensor_train(tensor, rank=self.rank, **decomposition_kwargs)
            self.init_from_decomposition(tt_tensor, bias=bias)

    def get_decomposition(self):
        """Returns the decomposition parametrizing the layer
        """
        return self.factors

    @property
    def full_weight(self):
        """Returns the reconstructed matrix weight of the linear layer
        """
        return tl.reshape(tl.tt_to_tensor(self.factors), self.weight_shape)
class TTMLinear(BaseFactorizedLinear):
    """Tensorized Fully-Connected Layers in the TT-Matrix format [1]_

    The weight matrix is tensorized to a tensor of size `tensorized_shape`.
    That tensor is expressed as a low-rank TT-Matrix by jointly compressing
    inputs and outputs. During inference, the full tensor is reconstructed, and
    unfolded back into a matrix, used for the forward pass in a regular linear layer.

    Parameters
    ----------
    in_features : int
    out_features : int
    tensorized_shape : int tuple
        should be left_shape + right_shape corresponding to a weight matrix
        of size left x right
    rank : int tuple or str, default is 'same'
    bias : bool, default is True

    See also
    --------
    TuckerLinear
    CPLinear
    TTLinear

    Notes
    -----
    This layer permutes the dimensions of the weight matrix after it has been
    reshaped into a higher-order tensor of shape `tensorized_shape`.

    For a linear layer with `out_feature = O_1 * O_2 * O_3` and
    `in_features = I_1 * I_2 * I_3` and
    `tensorized_shape = (O_1, O_2, O_3, I_1, I_2, I_3)`
    and rank `R = (R_1, R_2, R_3, R_4)` with `R_1 = R_4 = 1`,
    the inputs and outputs will be jointly compressed by each tt-matrix core.
    In other words, the k-th core will be of shape `(R_k, O_k, I_k, R_{k+1})`.

    By contrast, :func:`tltorch.TTLinear` simply reshapes the matrix to
    `tensorized_shape` and compresses with a tensor-train decomposition.

    References
    ----------
    .. [1] Tensorizing Neural Networks, Alexander Novikov, Dmitry Podoprikhin,
           Anton Osokin, Dmitry Vetrov
    """
    def __init__(self, in_features, out_features, tensorized_shape, rank='same', bias=True):
        super().__init__(in_features, out_features, tensorized_shape, rank, bias=bias)

        self.rank = validate_tt_matrix_rank(tensorized_shape, rank=rank)
        self.factors = nn.ParameterList()
        # The first half of tensorized_shape factorizes the outputs,
        # the second half factorizes the inputs.
        self.ndim = len(tensorized_shape) // 2
        self.out_shape = tensorized_shape[:self.ndim]
        self.in_shape = tensorized_shape[self.ndim:]
        for i, (s_out, s_in) in enumerate(zip(self.out_shape, self.in_shape)):
            self.factors.append(
                nn.Parameter(torch.Tensor(self.rank[i], s_out, s_in, self.rank[i + 1])))

        # The rank validation and factor allocation above are decomposition-specific,
        # which is why initialization happens here rather than in the base class.
        self.init_from_random(decompose_full_weight=False)

    def forward(self, input):
        """Inference using the tensorized and factorized weight matrix"""
        weight = tl.tt_matrix_to_tensor(self._process_decomposition()).reshape(self.weight_shape)
        return F.linear(input, weight, self.bias)

    def init_from_random(self, decompose_full_weight=True):
        """Initializes the factorization randomly

        Parameters
        ----------
        decompose_full_weight : bool, default is True
            if True, a full weight is created and decomposed to initialize the factors
            otherwise, the factors of the decomposition are directly initialized
        """
        if decompose_full_weight:
            full_weight = torch.normal(0.0, 0.02, size=self.tensorized_shape)
            self.init_from_tensor(full_weight)
        else:
            init.tt_matrix_init(self.factors)
        if self.bias is not None:
            self.bias.data.zero_()

    def init_from_decomposition(self, tt_matrix, bias=None):
        """Initializes the factorization from the given decomposition

        Parameters
        ----------
        tt_matrix : (factors)
            values to initialize the decomposition parametrizing the layer to
        bias : torch.Tensor or None, default is None
        """
        factors = tt_matrix
        # Fix: assignment is now wrapped in no_grad, consistent with the
        # Tucker and CP variants of this method.
        with torch.no_grad():
            for i, factor in enumerate(factors):
                self.factors[i].data = factor
            if self.bias is not None and bias is not None:
                self.bias.data = bias

    def init_from_tensor(self, tensor, bias=None):
        """Initializes the layer by decomposing a full tensor

        Parameters
        ----------
        tensor : torch.Tensor
            must be either a matrix or a tensor
            must verify ``np.prod(tensor.shape) == np.prod(self.tensorized_shape)``
        bias : torch.Tensor or None, default is None
        """
        with torch.no_grad():
            tensor = tensor.reshape(self.tensorized_shape)
            tt_matrix = tensor_train_matrix(tensor, rank=self.rank)
            self.init_from_decomposition(tt_matrix, bias=bias)

    def get_decomposition(self):
        """Returns the decomposition parametrizing the layer
        """
        return self.factors

    @property
    def full_weight(self):
        """Returns the reconstructed matrix weight of the linear layer
        """
        return tl.reshape(tl.tt_matrix_to_tensor(self.factors), self.weight_shape)