import pickle
from pathlib import Path
import tensorly as tl
from .second_order_cumulant import SecondOrderCumulant
from .third_order_cumulant import ThirdOrderCumulant
class TLDA:
    """
    Class to learn the topic-word distribution from a corpus of documents
    """
    def __init__(self, n_topic, alpha_0, n_iter_train, n_iter_test, learning_rate,
                 pca_batch_size=10000, third_order_cumulant_batch=1000, gamma_shape=1.0, smoothing=1e-6,
                 theta=1, ortho_loss_criterion=1000, n_eigenvec=None, random_seed=None):
        """
        Parameters
        ----------
        n_topic : int
            number of topics to learn
        alpha_0 : float
            sum of the Dirichlet prior over the topic proportions
        n_iter_train : int
        n_iter_test : int
        learning_rate : float
        pca_batch_size : int, default is 10000
            batch size used to fit the second order cumulant
        third_order_cumulant_batch : int, default is 1000
            batch size used to fit the third order cumulant
        gamma_shape : float, default is 1.0
        smoothing : float, default is 1e-6
        theta : int, default is 1
        ortho_loss_criterion : int, default is 1000
        n_eigenvec : int, optional, default is None
            number of eigenvectors kept for whitening; if None, defaults to n_topic
        random_seed : int, optional, default is None
        """
self.n_topic = n_topic
self.alpha_0 = alpha_0
self.smoothing = smoothing
self.third_order_cumulant_batch = third_order_cumulant_batch
if n_eigenvec is None:
n_eigenvec = n_topic
self.n_eigenvec = n_eigenvec
self.weights_ = tl.ones(self.n_topic)
self.vocab = 0
self.n_documents = 0
self.mean = None
self.unwhitened_factors_ = None
self.second_order = SecondOrderCumulant(n_eigenvec, alpha_0, pca_batch_size)
        self.third_order = ThirdOrderCumulant(n_topic, alpha_0, n_iter_train, n_iter_test,
                                              third_order_cumulant_batch, learning_rate, gamma_shape,
                                              theta, ortho_loss_criterion, random_seed,
                                              n_eigenvec=n_eigenvec)
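
    # A minimal construction sketch; the hyperparameter values below are
    # purely illustrative, not recommendations:
    #
    #     tlda = TLDA(n_topic=20, alpha_0=0.01, n_iter_train=1000,
    #                 n_iter_test=10, learning_rate=1e-4)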
    def fit(self, X, order=None):
        """
        Compute the word-topic distribution for the entire dataset at once.
        Assumes that the whole dataset, and the tensors required to compute
        its word-topic distribution, fit in memory.

        Parameters
        ----------
        X : tensor of shape (n_documents, vocab)
            all documents used to fit the word-topic distribution
        order : int, optional, default is None
            which cumulant to fit (1, 2 or 3); if None, all three are fit in turn
        """
if order is None or order == 1:
self.n_documents = X.shape[0]
self.vocab = X.shape[1]
self.mean = tl.mean(X, axis=0)
if order is None or order == 2:
self.second_order.fit(X - self.mean)
if order is None or order == 3:
X_whit = self.second_order.transform(X - self.mean)
            self.third_order.fit(X_whit, verbose=False)
del X_whit
del X
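
    # Hedged usage sketch for `fit`, assuming `X` is a (n_documents, vocab)
    # document-term count tensor that fits in memory and `tlda` is a TLDA
    # instance as sketched above:
    #
    #     tlda.fit(X)                       # fits orders 1, 2 and 3 in turn
    #     topics = tlda.unwhitened_factors  # recovered word-topic factors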
def _partial_fit_first_order(self, X_batch):
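        """Update the vocabulary size, running mean, and document count from one batch."""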
if self.mean is None:
self.vocab = X_batch.shape[1]
self.mean = tl.mean(X_batch, axis=0)
else:
self.mean = ((self.mean * self.n_documents) + tl.sum(X_batch, axis=0)) / (self.n_documents + X_batch.shape[0])
self.n_documents += X_batch.shape[0]
del X_batch
def _partial_fit_second_order(self, X_batch):
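        """Feed one centered batch, in sub-batches of `self.second_order.batch_size`, to the second order cumulant."""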
for j in range(0, len(X_batch), self.second_order.batch_size):
y = X_batch[j:j+self.second_order.batch_size]
self.second_order.partial_fit(y - self.mean)
del y
del X_batch
def _partial_fit_third_order(self, X_batch):
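        """Run third order gradient updates on one batch (expected to be already whitened)."""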
for j in range(0, len(X_batch), self.third_order_cumulant_batch):
y = X_batch[j:j+self.third_order_cumulant_batch]
self.third_order.partial_fit(y)
del y
del X_batch
def partial_fit(self, X_batch, batch_index, save_folder=None):
"""
Update the word-topic distribution using a batch of documents. For a given batch, the
first and second order cumulants need to be fit once, but the third order cumulant should
be fit many times.
Parameters
----------
X_batch : tensor of shape (batch_size, self.vocab)
batch_index : int
index of the current batch.
This is used to know whether to update the first and second moment or just whiten
save_folder : str, default is None
Folder in which to store the whitened batches.
If None, the whitened batches will be recomputed at each iteration
instead of being catched.
"""
if not hasattr(self, "seen_batches"):
self.seen_batches = dict()
if batch_index in self.seen_batches:
# We've seen the batch at least once
            if self.seen_batches[batch_index] != 0:
                # We already whitened this batch: load the cached version if
                # one was saved, otherwise whiten it again
                if save_folder:
                    save_file = self.seen_batches[batch_index]
                    with open(Path(save_folder).joinpath(save_file), 'rb') as f:
                        X_batch = pickle.load(f)
else:
X_batch = self.second_order.transform(X_batch - self.mean)
            else:
                # Second time we see this batch: the moments are now fit, so
                # compute the whitened version (and cache it if requested)
                X_batch = self.second_order.transform(X_batch - self.mean)
                if save_folder is not None:
                    save_file = f'_whitened_batch_{batch_index}'
                    self.seen_batches[batch_index] = save_file
                    with open(Path(save_folder).joinpath(save_file), 'wb') as f:
                        pickle.dump(X_batch, f)
else:
self.seen_batches[batch_index] = 1
self._partial_fit_third_order(X_batch)
else:
            # First time we see this batch: fit the moments; whitening happens on later passes
self._partial_fit_first_order(X_batch)
self._partial_fit_second_order(X_batch)
self.seen_batches[batch_index] = 0
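
    # Hedged training-loop sketch for `partial_fit`: every batch must be seen
    # at least twice, since the first pass over a batch only fits the mean and
    # second order cumulant, and later passes run the third order updates.
    # `batches` and `n_epoch` are illustrative names:
    #
    #     for epoch in range(n_epoch):
    #         for i, X_batch in enumerate(batches):
    #             tlda.partial_fit(X_batch, batch_index=i, save_folder='whitened_cache')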
    def partial_fit_online(self, X_batch):
        """
        Update the word-topic distribution using a batch of documents, in a fully online fashion.
        Meant for very large datasets: only one gradient update is done per batch in the third
        order cumulant calculation.

        Parameters
        ----------
        X_batch : tensor of shape (batch_size, vocab)
        """
self._partial_fit_first_order(X_batch)
self._partial_fit_second_order(X_batch)
X_whit = self.second_order.transform(X_batch - self.mean)
del X_batch
self._partial_fit_third_order(X_whit)
del X_whit
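
    # Hedged streaming sketch for `partial_fit_online`: a single pass, with one
    # third order gradient update per batch (`document_stream` is an
    # illustrative iterable of (batch_size, vocab) count tensors):
    #
    #     for X_batch in document_stream:
    #         tlda.partial_fit_online(X_batch)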
def _unwhiten_factors(self):
"""Unwhitens self.third_order.factors_, then uncenters and unnormalizes"""
factors_unwhitened = self.second_order.reverse_transform(self.third_order.factors_.T).T
# Un-centers the data
factors_unwhitened += tl.reshape(self.mean,(self.vocab,1))
factors_unwhitened [factors_unwhitened < 0.] = 0. # remove non-negative probabilities
# Save unwhitened factors before postprocessing
self.unwhitened_factors_raw_ = tl.copy(factors_unwhitened)
# Smoothing
factors_unwhitened *= (1. - self.smoothing)
factors_unwhitened += (self.smoothing / factors_unwhitened.shape[1])
# Calculate the eigenvalues from the whitened factors
eig_vals = tl.tensor([tl.norm(k)**3 for k in self.third_order.factors_ ])
alpha = eig_vals**(-2)
# Recover the topic weights
alpha_norm = (alpha / alpha.sum()) * self.alpha_0
self.weights_ = tl.tensor(alpha_norm)
# Normalize the factors
factors_unwhitened /= factors_unwhitened.sum(axis=0)
return factors_unwhitened
@property
    def unwhitened_factors(self):
        """Unwhitened learned factors, of shape (vocabulary_size, n_topic).
On the first call, this will compute and store the unwhitened factors.
Subsequent calls will simply return the stored value.
"""
if self.unwhitened_factors_ is None:
self.unwhitened_factors_ = self._unwhiten_factors()
return self.unwhitened_factors_
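

if __name__ == "__main__":
    # Hedged smoke-test sketch on synthetic counts. The Poisson data and every
    # hyperparameter below are illustrative only, and convergence with so few
    # iterations is not guaranteed; assumes a NumPy-compatible tensorly backend.
    import numpy as np

    rng = np.random.default_rng(0)
    X = tl.tensor(rng.poisson(1.0, size=(500, 100)).astype(float))

    tlda = TLDA(n_topic=5, alpha_0=0.01, n_iter_train=100, n_iter_test=10,
                learning_rate=1e-4)
    tlda.fit(X)
    # Each column of the recovered factor matrix is a distribution over the vocabulary
    print(tlda.unwhitened_factors.shape)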