Source code for tlviz.postprocessing

# -*- coding: utf-8 -*-

__author__ = "Marie Roald & Yngve Mardal Moe"

from warnings import warn

import numpy as np
import scipy.linalg as sla

from . import factor_tools
from ._module_utils import is_iterable
from ._tl_utils import _handle_tensorly_backends_cp, _handle_tensorly_backends_dataset
from ._xarray_wrapper import (
    _SINGLETON,
    _handle_labelled_cp,
    _handle_labelled_dataset,
    add_factor_metadata,
    label_cp_tensor,
)
from .utils import unfold_tensor

__all__ = [
    "resolve_cp_sign_indeterminacy",
    "postprocess",
    "factor_matrix_to_tidy",
    "add_factor_metadata",
    "label_cp_tensor",
]


[docs]@_handle_tensorly_backends_dataset("dataset", None)
@_handle_tensorly_backends_cp("cp_tensor", _SINGLETON)
@_handle_labelled_dataset("dataset", None)
@_handle_labelled_cp("cp_tensor", _SINGLETON)
def resolve_cp_sign_indeterminacy(cp_tensor, dataset, resolve_mode=None, unresolved_mode=-1, method="transpose"):
    r"""Resolve the sign indeterminacy of CP models.

    Tensor factorisations have a sign indeterminacy that allows any change in the
    sign of the component vectors in one mode, under the condition that the sign
    of a component vector in another mode changes as well. This means that we can
    "flip" any component vector so long as the corresponding component vector in
    another mode is also flipped. This flipping can hurt the model's interpretability.
    For example, if a factor represents a chemical spectrum, then this flipping may lead
    to it being negative instead of positive.

    To illustrate the sign indeterminacy, we start with the SVD, which is on the form

    .. math::

        \mathbf{X} = \mathbf{U} \mathbf{S} \mathbf{V}^\mathsf{T}.

    The factorisation above is equivalent with the following factorisation:

    .. math::
        \mathbf{X} = (\mathbf{U} \text{diag}(\mathbf{f})) \mathbf{S} (\mathbf{V} \text{diag}(\mathbf{f}))^\mathsf{T},

    where :math:`\mathbf{f}` is a vector containing only ones or negative ones. Similarly,
    a CP factorisation with factor matrices :math:`\mathbf{A}, \mathbf{B}` and :math:`\mathbf{C}`
    is equivalent to the CP factorisations with the following factor matrices:

     * :math:`(\mathbf{A} \text{diag}(\mathbf{f})), (\mathbf{B} \text{diag}(\mathbf{f}))` and :math:`\mathbf{C}`
     * :math:`(\mathbf{A} \text{diag}(\mathbf{f})), \mathbf{B}` and :math:`(\mathbf{C} \text{diag}(\mathbf{f}))`
     * :math:`\mathbf{A}, (\mathbf{B} \text{diag}(\mathbf{f}))` and :math:`(\mathbf{C} \text{diag}(\mathbf{f}))`

    One way to circumvent the sign indeterminacy is by imposing non-negativity. However,
    that is not always a reasonable choice (e.g. if the data also contains negative entries).
    When we don't want to impose non-negativity constraints, then we need some other way to
    resolve the sign indeterminacy (which this function provides). The idea is easiest described
    in the two-way (matrix) case.

    Consider a data matrix, :math:`\mathbf{X}` whose columns represent samples and rows represent
    measurements. Then, we want the measurement-mode component-vectors to be mostly aligned with
    the data matrix. The components should describe what the data is, not what it is not.
    For example, if the data is non-negative, then the measurement-mode component vectors should
    be mostly non-negative. With the SVD, we can compute whether we should flip the :math:`r`-th
    column of :math:`\mathbf{U}` by computing

    .. math::

        f_r = \sum_{i=1^I} v_{ir}^2 \text{sign}{v_{ir}}

    if :math:`f_r` is negative, then we should flip the sign of the :math:`r-th` column of
    :math:`\mathbf{U}` and :math:`\mathbf{V}` :cite:p:`bro2008resolving`.

    The methodology above works well in practice, and is rooted in the fact that the
    :math:`i`-th row of :math:`\mathbf{V}` can be interpreted as the coordinates of the
    :math:`i`-th row of :math:`\mathbf{X}` in a vector space spanned by the columns of
    :math:`\mathbf{U}`. Then, the above equation will give us component vectors where the data
    points is mainly located in the non-negative orthant.

    The above interpretation is correct under the assumption: :math:`\mathbf{U}^\mathsf{T}\mathbf{U} = \mathbf{I}`.
    However, the heuristic still works well when this is not the case :cite:p:`bro2013solving`.
    Still, we also include a modification of the above scheme where the same interpretation
    holds with non-orthogonal factors.

    .. math::

        f_r = \sum_{i=1}^I h_{ir}^2 \text{sign}(h_{ir}),

    where :math:`\mathbf{H} = \mathbf{U}(\mathbf{U}^\mathsf{T}\mathbf{U})^{-1} \mathbf{X}`.
    That is the rows of :math:`\mathbf{H}` represent the rows of :math:`\mathbf{X}` as
    described by the column basis of :math:`\mathbf{U}`.

    In the multiway case, when :math:`\mathcal{X}` is a tensor instead of a matrix, we can
    apply the same logic :cite:p:`bro2013solving`. If we have the factor matrices
    :math:`\mathbf{A}, \mathbf{B}` and :math:`\mathbf{C}`, then we flip the sign of any
    factor matrix (e.g. :math:`\mathbf{A}`) by computing

    .. math::

        f_r^{(\mathbf{A})} = \sum_{i=1}^I {h_{ir}^{(\mathbf{A})}}^2 \text{sign}({h_{ir}^{(\mathbf{A})}}),

    where :math:`\mathbf{H}^{(\mathbf{A})} = \mathbf{A}^\mathsf{T} \mathbf{X}_{(0)}` or
    :math:`\mathbf{H}^{(\mathbf{A})} = \mathbf{A}(\mathbf{A}^\mathsf{T}\mathbf{A})^{-1} \mathbf{X}_{(0)}`,
    depending on whether the scheme based on the SVD scheme :cite:p:`bro2013solving` or the
    corrected scheme. :math:`\mathbf{X}_{(0)} \in \mathbb{R}^{I \times JK}` is the tensor,
    :math:`\mathcal{X}`, unfolded along the first mode. We can then correct the sign of
    :math:`\mathbf{A}` by multiplying and one of the other factor matrices by
    :math:`\text{diag}(\mathbf{f}^{(\mathbf{A})})`. By using this procedure, we can align all
    factor matrices except for one (the unresolved mode) with the "direction of the data".

    Note that this sign indeterminacy comes as a direct consequence of the scaling indeterminacy
    of component models, since :math:`\text{diag}(\mathbf{f})^{-1} = \text{diag}(\mathbf{f})`.

    Parameters
    ----------
    cp_tensor : CPTensor or tuple
        TensorLy-style CPTensor object or tuple with weights as first
        argument and a tuple of components as second argument.
    resolve_mode : int, iterable or None
        Mode(s) whose factor matrix should be aligned with the data. If
        None, then the sign should be corrected for all modes except the
        ``unresolved_mode``.
    unresolved_mode : int
        Mode used to correct the sign indeterminacy in other mode(s). The
        factor matrix in this mode may not be aligned with the data.
    method : "transpose" or "positive_coord"
        Which method to use when computing the signs. Use ``"transpose"``
        for the method in :cite:p:`bro2008resolving,bro2013solving`, and
        ``"positive_coord"`` for the method corrected for non-orthogonal
        factor matrices described above.

    Returns
    -------
    CPTensor or tuple
        The CP tensor after correcting the signs.

    Raises
    ------
    ValueError
        If ``unresolved_mode`` is not between ``-dataset.ndim`` and ``dataset.ndim-1``.
    ValueError
        If ``unresolved_mode`` is in ``resolve_mode``

    Notes
    -----
    For more information, see :cite:p:`bro2008resolving,bro2013solving`
    """
    if unresolved_mode < 0:
        unresolved_mode = dataset.ndim + unresolved_mode
    if unresolved_mode >= dataset.ndim or unresolved_mode < 0:
        raise ValueError("`unresolved_mode` must be between `-dataset.ndim` and `dataset.ndim-1`.")
    if resolve_mode == unresolved_mode or (is_iterable(resolve_mode) and unresolved_mode in resolve_mode):
        raise ValueError("The unresolved mode cannot be resolved.")

    if resolve_mode is None:
        resolve_mode = range(dataset.ndim)

    if is_iterable(resolve_mode):
        for mode in resolve_mode:
            if mode != unresolved_mode:
                cp_tensor = resolve_cp_sign_indeterminacy(
                    cp_tensor,
                    dataset,
                    unresolved_mode=unresolved_mode,
                    resolve_mode=mode,
                    method=method,
                )

        return cp_tensor

    unfolded_dataset = unfold_tensor(dataset, resolve_mode)
    factor_matrix = cp_tensor[1][resolve_mode]

    if method == "transpose":
        sign_scores = factor_matrix.T @ unfolded_dataset
    elif method == "positive_coord":
        sign_scores = sla.lstsq(factor_matrix, unfolded_dataset)[0]
    else:
        raise ValueError("Method must be either `transpose` or `positive_coord`")

    signs = np.sign(np.sum(sign_scores**2 * np.sign(sign_scores), axis=1))
    signs = np.asarray(signs).reshape(1, -1)

    factor_matrices = list(cp_tensor[1])
    factor_matrices[resolve_mode] = factor_matrices[resolve_mode] * signs
    factor_matrices[unresolved_mode] = factor_matrices[unresolved_mode] * signs
    return cp_tensor[0], tuple(factor_matrices)


[docs]@_handle_tensorly_backends_cp("reference_cp_tensor", None, optional=True)
@_handle_tensorly_backends_cp("cp_tensor", None)
@_handle_labelled_cp("reference_cp_tensor", None, optional=True)
def postprocess(
    cp_tensor,
    dataset=None,
    reference_cp_tensor=None,
    permute=True,
    resolve_mode=None,
    unresolved_mode=-1,
    flip_method="transpose",
    weight_behaviour="normalise",
    weight_mode=0,
    allow_smaller_rank=False,
    include_metadata=False,
):
    """Standard postprocessing of a CP decomposition.

    This function will perform standard postprocessing of a CP decomposition.
    If a reference CP tensor is provided, then the columns of ``cp_tensor``'s
    factor matrices are aligned with the columns of ``reference_cp_tensor``'s
    factor matrices.

    Next, the factor matrices of the CP tensor are scaled according the the specified
    weight behaviour (default is normalised).

    If a dataset is provided, then the sign indeterminacy is resolved and if the
    dataset is labelled (i.e. is an xarray or a dataframe), then the factor matrices
    of the CP tensor is labelled too.

    This function is equivalent to calling

     * ``permute_cp_tensor``
     * ``distribute_weights``
     * ``resolve_cp_sign_indeterminacy``
     * ``label_cp_tensor``

    Parameters
    ----------
    cp_tensor : CPTensor or tuple
        CPTensor to postprocess
    dataset : ndarray or xarray (optional)
        Dataset the CP tensor represents
    reference_cp_tensor : CPTensor or tuple (optional)
        If provided, then the tensor whose factors we align the CP tensor's
        columns with.
    permute : bool
        If ``True``, then the factors are permuted in descending weight order if
        ``reference_cp_tensor`` is ``None``.
    resolve_mode : int, iterable or None
        Mode(s) whose factor matrix should be aligned with the data. If
        None, then the sign should be corrected for all modes except the
        ``unresolved_mode``.
    unresolved_mode : int
        Mode used to correct the sign indeterminacy in other mode(s). The
        factor matrix in this mode may not be aligned with the data.
    method : "transpose" or "positive_coord"
        Which method to use when computing the signs. Use ``"transpose"``
        for the method in :cite:p:`bro2008resolving,bro2013solving`, and
        ``"positive_coord"`` for the method corrected for non-orthogonal
        factor matrices described above.
    weight_behaviour : {"ignore", "normalise", "evenly", "one_mode"} (default="normalise")
        How to handle the component weights.

         * ``"ignore"`` - Do nothing
         * ``"normalise"`` - Normalise all factor matrices
         * ``"evenly"`` - All factor matrices have equal norm
         * ``"one_mode"`` - The weight is allocated in one mode, all other factor matrices have unit norm columns.

    weight_mode : int (optional)
        Which mode to have the component weights in (only used if ``weight_behaviour="one_mode"``)
    allow_smaller_rank : bool (default=False)
        If ``True``, then a low rank decomposition can be permuted against one with higher rank. The "missing columns"
        are padded by nan values
    include_metadata : bool (default=True)
        If ``True``, then the factor metadata will be added as columns in the factor matrices.


    Returns
    -------
    CPTensor
        The post processed CPTensor.

    See Also
    --------
    tlviz.factor_tools.permute_cp_tensor
    tlviz.factor_tools.distribute_weights
    tlviz.postprocessing.resolve_cp_sign_indeterminacy
    tlviz.postprocessing.label_cp_tensor

    Examples
    --------

    Here is an example were we use postprocess on a decomposition of aminoacid data

    .. plot::
        :context: close-figs
        :include-source:

        >>> import tlviz
        >>> import numpy as np
        >>> import matplotlib.pyplot as plt
        >>> from tensorly.decomposition import parafac
        >>> dataset = tlviz.data.load_aminoacids()
        Loading Aminoacids dataset from:
        Bro, R, PARAFAC: Tutorial and applications, Chemometrics and Intelligent Laboratory Systems, 1997, 38, 149-171

        The dataset is an xarray DataArray and it contains relevant side information

        >>> print(type(dataset))
        <class 'xarray.core.dataarray.DataArray'>

        We see that after postprocessing, the cp_tensor contains pandas DataFrames

        >>> cp_tensor = parafac(dataset.data, 3, init="random", random_state=0)
        >>> cp_tensor_postprocessed = tlviz.postprocessing.postprocess(cp_tensor, dataset)
        >>> print(type(cp_tensor[1][0]))
        <class 'numpy.ndarray'>
        >>> print(type(cp_tensor_postprocessed[1][0]))
        <class 'pandas.core.frame.DataFrame'>


        We see that after postprocessing, the factor matrix has unit norm

        >>> print(np.linalg.norm(cp_tensor[1][0], axis=0))
        [160.82985402 182.37338941 125.3689186 ]
        >>> print(np.linalg.norm(cp_tensor_postprocessed[1][0], axis=0))
        [1. 1. 1.]

        When we construct a dense tensor from a postprocessed cp_tensor it is constructed
        as an xarray DataArray

        >>> print(type(tlviz.utils.cp_to_tensor(cp_tensor)))
        <class 'numpy.ndarray'>
        >>> print(type(tlviz.utils.cp_to_tensor(cp_tensor_postprocessed)))
        <class 'xarray.core.dataarray.DataArray'>

        The visualisation of the postprocessed cp_tensor shows that the scaling and sign indeterminacy
        is taken care of and x-xaxis has correct labels and ticks

        >>> fig, ax = tlviz.visualisation.components_plot(cp_tensor)
        >>> plt.show()

        >>> fig, ax = tlviz.visualisation.components_plot(cp_tensor_postprocessed)
        >>> plt.show()

    """
    if not permute and reference_cp_tensor is not None:
        warn("``permute=False`` is ignored if a reference CP tensor is provided.")

    if permute or reference_cp_tensor is not None:
        cp_tensor = factor_tools.permute_cp_tensor(
            cp_tensor, reference_cp_tensor=reference_cp_tensor, allow_smaller_rank=allow_smaller_rank
        )

    cp_tensor = factor_tools.distribute_weights(cp_tensor, weight_behaviour=weight_behaviour, weight_mode=weight_mode)

    if dataset is not None:
        cp_tensor = resolve_cp_sign_indeterminacy(
            cp_tensor,
            dataset,
            resolve_mode=resolve_mode,
            unresolved_mode=unresolved_mode,
            method=flip_method,
        )

        cp_tensor = label_cp_tensor(cp_tensor, dataset)

    if include_metadata and dataset is not None:
        cp_tensor = add_factor_metadata(cp_tensor, dataset)
    elif include_metadata:
        warn("Cannot include metadata when there is no provided dataset")

    return cp_tensor


[docs]def factor_matrix_to_tidy(factor_matrix, var_name="Component", value_name="Signal", id_vars=None):
    """Convert a factor matrix into a tidy dataset, for use with Plotly Express.

    If we convert a factor matrix into a tidy dataset (or long table), then we get a table on the form

    .. list-table:: Tidy format factor matrix
        :widths: 25 25 25
        :header-rows: 1

        * - Index
          - Component
          - Signal
        * - 0
          - 0
          - 0.1
        * - 1
          - 0
          - 0.5
        * - ...
          - ...
          - ...
        * - 38
          - 2
          - 0.7
        * - 39
          - 2
          - 0.2

    The component vectors are all stacked on top of each other, with a separate column that specifies which
    component each row belongs to. This function can also preserve metadata, which is signified by columns
    that have non-integer column names. For example, if we have a dataframe on the form

    .. list-table:: Factor matrix with metadata
        :widths: 25 25 25 25 25 25
        :header-rows: 1

        * - Index
          - 0
          - 1
          - 2
          - lat
          - lon
        * - 0
          - 0.1
          - 0.2
          - 0.5
          - 59.91273
          - 10.74609
        * - 1
          - 0.5
          - 0.2
          - 0.1
          - 63.43049
          - 10.39506
        * - ...
          - ...
          - ...
          - ...
          - ...
          - ...
        * - 5
          - 0.2
          - 0.1
          - 0.3
          - 60.39299
          - 5.32415
        * - 5
          - 0.0
          - 0.2
          - 0.1
          - 58.97005
          - 5.73332

    and convert it into a tidy format factor matrix, then we get a table on the form

    .. list-table:: Tidy format factor matrix with metadata
        :widths: 25 25 25 25 25
        :header-rows: 1

        * - Index
          - lat
          - lon
          - Component
          - Signal
        * - 0
          - 59.91273
          - 10.74609
          - 0
          - 0.1
        * - 1
          - 63.43049
          - 10.39506
          - 0
          - 0.5
        * - ...
          - ...
          - ...
          - ...
          - ...
        * - 4
          - 69.6489
          - 18.95508
          - 2
          - 0.0
        * - 5
          - 58.97005
          - 5.73332
          - 2
          - 0.1

    Parameters
    ----------
    factor_matrix : pd.DataFrame
        A labelled factor matrix potentially with metadata columns
    var_name : str
        Name of the column that specifies which component each row belongs to
    value_name : str
        Name of the column that holds the magnitude of each component entry
    id_vars : iterable or None (default=None)
        Which columns to interpret as metadata. The columns not specified here are considered as the components.
        If ``id_vars is None``, then all columns with non-integer names are considered metadata columns.

    Returns
    -------
    pd.DataFrame
        Tidy format factor matrix

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from tlviz.postprocessing import factor_matrix_to_tidy
    >>> rng = np.random.default_rng(0)
    >>> factor_matrix = pd.DataFrame(rng.uniform(size=(10, 3)))
    >>> factor_matrix.head()
              0         1         2
    0  0.636962  0.269787  0.040974
    1  0.016528  0.813270  0.912756
    2  0.606636  0.729497  0.543625
    3  0.935072  0.815854  0.002739
    4  0.857404  0.033586  0.729655
    >>> tidy_factor_matrix = factor_matrix_to_tidy(factor_matrix)
    >>> tidy_factor_matrix.head()
       index Component    Signal
    0      0         0  0.636962
    1      1         0  0.016528
    2      2         0  0.606636
    3      3         0  0.935072
    4      4         0  0.857404
    >>> factor_matrix_with_metadata = factor_matrix.copy()
    >>> factor_matrix_with_metadata["Metadata"] = rng.uniform(size=10)
    >>> factor_matrix_with_metadata.head()
              0         1         2  Metadata
    0  0.636962  0.269787  0.040974  0.688447
    1  0.016528  0.813270  0.912756  0.388921
    2  0.606636  0.729497  0.543625  0.135097
    3  0.935072  0.815854  0.002739  0.721488
    4  0.857404  0.033586  0.729655  0.525354
    >>> tidy_factor_matrix_with_metadata = factor_matrix_to_tidy(factor_matrix_with_metadata)
    >>> tidy_factor_matrix_with_metadata.head()
       Metadata  index Component    Signal
    0  0.688447      0         0  0.636962
    1  0.388921      1         0  0.016528
    2  0.135097      2         0  0.606636
    3  0.721488      3         0  0.935072
    4  0.525354      4         0  0.857404
    """
    factor_matrix = factor_matrix.reset_index()
    if id_vars is None:
        id_vars = set()
        for column in factor_matrix.columns:
            if type(column) != int:
                id_vars.add(column)
    id_vars = sorted(id_vars)

    return factor_matrix.melt(var_name=var_name, value_name=value_name, id_vars=id_vars)