Source code for pyllars.stats_utils

"""
This module contains helpers for various statistical calculations.
"""
import logging
logger = logging.getLogger(__name__)

from enum import Enum
import numpy as np
import pandas as pd
import scipy.stats
import typing

from typing import Any, Callable, Iterable, Optional, Tuple

numpy_size = typing.Optional[typing.Union[int, typing.Tuple[int]]]
sampling_return_type = typing.Union[np.ndarray, int]

###
# KL-divergence helpers
###
def calculate_symmetric_kl_divergence(
        p:Any,
        q:Any,
        calculate_kl_divergence:Callable) -> float:
    """ Calculates the symmetric KL-divergence between distributions p and q

    In particular, this function defines the symmetric KL-divergence to be:

    .. math::
        D_{sym}(p||q) := \\frac{D(p||q) + D(q||p)}{2}

    Parameters
    ----------
    {p,q} : typing.Any
        A representation of a distribution that can be used by the function
        `calculate_kl_divergence`

    calculate_kl_divergence : typing.Callable
        A function that calculates the KL-divergence between :math:`p`
        and :math:`q`

    Returns
    -------
    symmetric_kl : float
        The symmetric KL-divergence between :math:`p` and :math:`q`
    """
    kl_1 = calculate_kl_divergence(p, q)
    kl_2 = calculate_kl_divergence(q, p)
    symmetric_kl = (kl_1 + kl_2) / 2
    return symmetric_kl
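# Example usage (editor's sketch, not part of the original module): the
# distributions below are illustrative. scipy.stats.entropy computes the
# (asymmetric) discrete KL-divergence D(p||q) when given two arguments.
def _example_symmetric_kl_divergence():
    p = [0.7, 0.2, 0.1]
    q = [0.5, 0.3, 0.2]
    return calculate_symmetric_kl_divergence(p, q, scipy.stats.entropy)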
def calculate_univariate_gaussian_kl(
        mean_p_var_p:Tuple[float,float],
        mean_q_var_q:Tuple[float,float]) -> float:
    """ Calculate the (asymmetric) KL-divergence between the univariate
    Gaussian distributions :math:`p` and :math:`q`

    That is, this calculates KL(p||q).

    **N.B.** This function uses the variance!

    **N.B.** The parameters for each distribution are passed as pairs for
    easy use with `calculate_symmetric_kl_divergence`.

    See, for example, [1]_ for the formula.

    Parameters
    ----------
    {mean_p_var_p,mean_q_var_q} : typing.Tuple[float,float]
        The parameters of the distributions.

    Returns
    -------
    kl_divergence : float
        The KL divergence between the two distributions.

    References
    ----------
    .. [1] Penny, W. "KL-Divergences of Normal, Gamma, Dirichlet and
        Wishart densities." Wellcome Department of Cognitive Neurology,
        University College London, 2001.
    """
    (mean_p, var_p) = mean_p_var_p
    (mean_q, var_q) = mean_q_var_q

    t_1 = 0.5 * (np.log(var_q) - np.log(var_p))
    t_2 = np.log(mean_q*mean_q + mean_p*mean_p + var_p - 2*mean_q*mean_p)
    t_3 = np.log(2*var_q)

    kl = t_1 - 0.5 + np.exp(t_2 - t_3)
    return kl
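# A small sanity check (editor's sketch; the (mean, variance) pairs are
# illustrative): the KL-divergence of a distribution against itself should
# be exactly zero.
def _example_univariate_gaussian_kl():
    p = (0.0, 1.0)  # mean 0, variance 1
    q = (1.0, 2.0)  # mean 1, variance 2
    kl_pq = calculate_univariate_gaussian_kl(p, q)
    kl_pp = calculate_univariate_gaussian_kl(p, p)  # 0.0
    return kl_pq, kl_pp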
def symmetric_entropy(p, q) -> float:
    """ Calculate the symmetric :func:`scipy.stats.entropy`. """
    return calculate_symmetric_kl_divergence(p, q, scipy.stats.entropy)
def symmetric_gaussian_kl(p, q) -> float:
    """ Calculate the symmetric
    :func:`pyllars.stats_utils.calculate_univariate_gaussian_kl`. """
    return calculate_symmetric_kl_divergence(
        p, q, calculate_univariate_gaussian_kl
    )
###
# Frequentist statistics
###
def get_population_statistics(
        subpopulation_sizes:np.ndarray,
        subpopulation_means:np.ndarray,
        subpopulation_variances:np.ndarray) -> Tuple[float,float,float,float]:
    """ Calculate the population size, mean, variance, and standard
    deviation based on subpopulation statistics

    This code is based on "Chap"'s answer here:
    https://stats.stackexchange.com/questions/30495

    This calculation seems to underestimate the variance relative to
    :func:`numpy.var` on the entire dataset (determined by simulation). This
    may somehow relate to "biased" vs. "unbiased" variance estimates
    (basically, whether to subtract 1 from the population size). Still,
    naive approaches to correct for that do not produce variance estimates
    which exactly match those from :func:`numpy.var`.

    Parameters
    ----------
    subpopulation_{sizes,means,variances} : numpy.ndarray
        The subpopulation sizes, means, and variances, respectively. These
        should all be the same size.

    Returns
    -------
    population_{size,mean,variance,std} : float
        The respective statistics about the entire population
    """
    n = np.sum(subpopulation_sizes)

    _subpopulation_total_sum = np.multiply(
        subpopulation_sizes, subpopulation_means)
    _population_total_sum = np.sum(_subpopulation_total_sum)
    population_mean = 1 / n * _population_total_sum

    _subpopulation_variance = np.multiply(
        subpopulation_sizes - 1, subpopulation_variances)
    _subpopulation_variance_sum = np.sum(_subpopulation_variance)

    _subpopulation_mean_difference = (subpopulation_means - population_mean)**2
    _subpopulation_means_product = np.multiply(
        subpopulation_sizes, _subpopulation_mean_difference)
    _subpopulation_means_sum = np.sum(_subpopulation_means_product)

    population_variance = (
        (_subpopulation_means_sum + _subpopulation_variance_sum) / (n-1))
    population_std = np.sqrt(population_variance)

    return n, population_mean, population_variance, population_std
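# A simulation-style check (editor's sketch): split a dataset into chunks,
# compute per-chunk statistics, and recombine them. The recombined mean
# matches numpy.mean on the full data; as noted in the docstring, the
# recombined variance is close to, but not exactly, numpy.var.
def _example_population_statistics():
    data = np.random.normal(size=300)
    chunks = np.split(data, 3)
    sizes = np.array([len(c) for c in chunks])
    means = np.array([np.mean(c) for c in chunks])
    variances = np.array([np.var(c) for c in chunks])
    return get_population_statistics(sizes, means, variances)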
###
# Bayesian hypothesis testing
###
def bayesian_proportion_test(
        x:Tuple[int,int],
        n:Tuple[int,int],
        prior:Tuple[float,float]=(0.5,0.5),
        prior2:Optional[Tuple[float,float]]=None,
        num_samples:int=1000,
        seed:int=8675309) -> Tuple[float,float,float]:
    """ Perform a Bayesian test to identify significantly different
    proportions.

    This test is based on a beta-binomial conjugate model. It uses Monte
    Carlo simulations to estimate the posterior of the difference between
    the proportions, as well as the likelihood that :math:`\pi_1 > \pi_2`
    (where :math:`\pi_i` is the likelihood of success in sample :math:`i`).

    Parameters
    ----------
    x : typing.Tuple[int,int]
        The number of successes in each sample

    n : typing.Tuple[int,int]
        The number of trials in each sample

    prior : typing.Tuple[float,float]
        The parameters of the beta distribution used as the prior in the
        conjugate model for the first sample.

    prior2 : typing.Optional[typing.Tuple[float,float]]
        The parameters of the beta distribution used as the prior in the
        conjugate model for the second sample. If this is not specified,
        then `prior` is used.

    num_samples : int
        The number of simulations

    seed : int
        The seed for the random number generator

    Returns
    -------
    difference_{mean,var} : float
        The posterior mean and variance of the difference in the likelihood
        of success in the two samples. A negative mean indicates that the
        likelihood in sample 2 is higher.

    p_pi_1_greater : float
        The probability that :math:`\pi_1 > \pi_2`
    """
    # copy over the prior if not specified for sample 2
    if prior2 is None:
        prior2 = prior

    # check the bounds
    if len(x) != 2:
        msg = "[bayesian_proportion_test]: please ensure x has exactly two elements"
        raise ValueError(msg)

    if len(n) != 2:
        msg = "[bayesian_proportion_test]: please ensure n has exactly two elements"
        raise ValueError(msg)

    if len(prior) != 2:
        msg = "[bayesian_proportion_test]: please ensure prior has exactly two elements"
        raise ValueError(msg)

    if len(prior2) != 2:
        msg = "[bayesian_proportion_test]: please ensure prior2 has exactly two elements"
        raise ValueError(msg)

    # set the seed
    if seed is not None:
        np.random.seed(seed)

    # perform the test. the beta-binomial posterior for each sample is
    # Beta(alpha + successes, beta + failures)
    a = prior[0] + x[0]
    b = prior[1] + n[0] - x[0]
    s1_posterior_samples = scipy.stats.beta.rvs(a, b, size=num_samples)

    a = prior2[0] + x[1]
    b = prior2[1] + n[1] - x[1]
    s2_posterior_samples = scipy.stats.beta.rvs(a, b, size=num_samples)

    diff_posterior_samples = s1_posterior_samples - s2_posterior_samples
    diff_posterior_mean = np.mean(diff_posterior_samples)
    diff_posterior_var = np.var(diff_posterior_samples)

    p_pi_1_greater = np.sum(
        s1_posterior_samples > s2_posterior_samples) / num_samples

    return diff_posterior_mean, diff_posterior_var, p_pi_1_greater
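# Example usage (editor's sketch; the counts are illustrative): with 20
# successes out of 100 in sample 1 versus 40 out of 100 in sample 2, the
# posterior mean of the difference should be clearly negative and
# p_pi_1_greater should be close to 0.
def _example_bayesian_proportion_test():
    x = (20, 40)
    n = (100, 100)
    return bayesian_proportion_test(x, n, num_samples=10000)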
def normal_inverse_chi_square(m, k, r, s, size=1):
    """ Sample from a normal-inverse-chi-square distribution with parameters
    m, k, r, s.

    This distribution is of interest because it is a conjugate prior for
    Gaussian observations.

    Sampling is described in:
    https://www2.stat.duke.edu/courses/Fall10/sta114/notes15.pdf

    Parameters
    ----------
    m, k : float
        m is the mean of the sampled mean; k relates to the variance of
        the sampled mean.

    r, s : float
        r is the degrees of freedom in the chi-square distribution from
        which the variance is sampled; s is something like a scaling factor.

    size : int or tuple of ints, or None
        Output shape. This shares the semantics of other numpy sampling
        functions.
    """
    x = np.random.chisquare(r, size=size)
    v = (r*s) / x

    # the conditional distribution of the mean is N(m, v/k); numpy's
    # normal expects the standard deviation, not the variance
    w = np.random.normal(m, np.sqrt(v/k))

    return w, v
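# A sampling sketch (editor's addition; the hyperparameters are
# illustrative): with many draws, the average of the sampled means should
# be close to m.
def _example_normal_inverse_chi_square():
    means, variances = normal_inverse_chi_square(
        m=0.0, k=1.0, r=5.0, s=1.0, size=10000)
    return np.mean(means), np.mean(variances)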
def bayesian_means_test(
        x1:np.ndarray,
        x2:np.ndarray,
        use_jeffreys_prior:bool=True,
        prior1:Optional[Tuple[float,float,float,float]]=None,
        prior2:Optional[Tuple[float,float,float,float]]=None,
        num_samples:int=1000,
        seed:int=8675309) -> Tuple[float,float,float]:
    """ Perform a Bayesian test to identify significantly different means.

    The test is based on a Gaussian conjugate model. (The
    normal-inverse-chi-square distribution is the prior.) It uses Monte
    Carlo simulation to estimate the posterior of the difference between
    the means of the populations, under the (probably dubious) assumption
    that the observations are Gaussian distributed. It also estimates the
    likelihood that :math:`\mu_1 > \mu_2`, where :math:`\mu_i` is the mean
    of each sample.

    Parameters
    ----------
    x{1,2} : numpy.ndarray
        The observations of each sample

    use_jeffreys_prior : bool
        Whether to use the Jeffreys prior. For more details, see:

            Murphy, K. Conjugate Bayesian analysis of the Gaussian
            distribution. Technical report, 2007.

        Briefly, the Jeffreys prior is:
        :math:`(\\text{sample mean}, n, n-1, \\text{sample variance})`,
        according to a :func:`pyllars.stats_utils.normal_inverse_chi_square`
        distribution.

    prior{1,2} : typing.Optional[typing.Tuple[float,float,float,float]]
        If the Jeffreys prior is not used, then these parameters are used
        as the priors for the normal-inverse-chi-square. If only prior1 is
        given, then those values are also used for prior2, where prior_i
        is taken as the prior for x_i.

    num_samples : int
        The number of simulations

    seed : int
        The seed for the random number generator

    Returns
    -------
    difference_{mean,var} : float
        The posterior mean and variance of the difference in the mean of
        the two samples. A negative difference_mean indicates that the mean
        of x2 is higher.

    p_m1_greater : float
        The probability that :math:`\mu_1 > \mu_2`
    """
    if use_jeffreys_prior:
        prior1 = (np.mean(x1), len(x1), int(len(x1)-1), np.var(x1))
        prior2 = (np.mean(x2), len(x2), int(len(x2)-1), np.var(x2))
    elif prior2 is None:
        prior2 = prior1

    if prior1 is None:
        msg = ("[bayesian_means_test]: either the Jeffreys prior must be "
            "used, or the parameters of the normal-inverse-chi-square "
            "distributions must be given.")
        raise ValueError(msg)

    if len(prior1) != 4:
        msg = ("[bayesian_means_test]: please ensure prior1 has exactly "
            "four elements")
        raise ValueError(msg)

    if len(prior2) != 4:
        msg = ("[bayesian_means_test]: please ensure prior2 has exactly "
            "four elements")
        raise ValueError(msg)

    if seed is not None:
        np.random.seed(seed)

    # now, sample from the posterior distributions
    s1 = normal_inverse_chi_square(*prior1, size=num_samples)
    s2 = normal_inverse_chi_square(*prior2, size=num_samples)

    w1_posterior_samples, v1_posterior_samples = s1
    w2_posterior_samples, v2_posterior_samples = s2

    diff_posterior_samples = w1_posterior_samples - w2_posterior_samples
    diff_posterior_mean = np.mean(diff_posterior_samples)
    diff_posterior_var = np.var(diff_posterior_samples)

    m_m1_greater = w1_posterior_samples > w2_posterior_samples
    p_m1_greater = np.sum(m_m1_greater) / num_samples

    return diff_posterior_mean, diff_posterior_var, p_m1_greater
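# Example usage (editor's sketch) with synthetic Gaussian samples whose
# true means differ by 0.5; p_m1_greater should be small.
def _example_bayesian_means_test():
    x1 = np.random.normal(loc=0.0, scale=1.0, size=50)
    x2 = np.random.normal(loc=0.5, scale=1.0, size=50)
    return bayesian_means_test(x1, x2, num_samples=10000)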
###
# Model parameter estimation
###
def get_categorical_mle_estimates(
        observations:Iterable[int],
        cardinality:Optional[int]=None,
        use_laplacian_smoothing:bool=False,
        base_1:bool=False) -> np.ndarray:
    """ Calculate the MLE estimates for the categorical observations

    Parameters
    ----------
    observations : typing.Iterable[int]
        The observed values. These are taken to already be "label encoded",
        so they should be integers in [0,cardinality).

    cardinality : typing.Optional[int]
        The cardinality of the categorical variable. If `None`, then this
        is taken as one more than the largest observed value.

    use_laplacian_smoothing : bool
        Whether to use Laplacian ("add one") smoothing for the estimates.
        This can also be interpreted as a symmetric Dirichlet prior with a
        concentration parameter of 1.

    base_1 : bool
        Whether the observations are base 1. If so, then the range is taken
        as [1, cardinality].

    Returns
    -------
    mle_estimates : numpy.ndarray
        The estimates. The size of the array is `cardinality`.
    """
    if base_1:
        observations = np.array(observations) - 1

    indices, counts = np.unique(observations, return_counts=True)

    if cardinality is None:
        cardinality = np.max(indices) + 1

    mle_estimates = np.zeros(shape=cardinality)
    for i, c in zip(indices, counts):
        mle_estimates[i] += c

    if use_laplacian_smoothing:
        mle_estimates += 1

    mle_estimates = mle_estimates / np.sum(mle_estimates)
    return mle_estimates
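# Example usage (editor's sketch; the observations are illustrative):
# counts of [3, 2, 1] over three categories give MLE estimates of
# [1/2, 1/3, 1/6]; with Laplacian smoothing, [4/9, 3/9, 2/9].
def _example_categorical_mle_estimates():
    observations = [0, 0, 0, 1, 1, 2]
    mle = get_categorical_mle_estimates(observations)
    smoothed = get_categorical_mle_estimates(
        observations, use_laplacian_smoothing=True)
    return mle, smoothed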
class polynomial_order(Enum):
    linear = 1
    quadratic = 2
def fit_with_least_squares(
        x:np.ndarray,
        y:np.ndarray,
        w:Optional[np.ndarray]=None,
        order=polynomial_order.linear) -> Tuple[float,float,float,float]:
    """ Fit a polynomial relationship between `x` and `y`.

    Optionally, the values can be weighted.

    Parameters
    ----------
    {x,y} : numpy.ndarray
        The input arrays

    w : numpy.ndarray
        A weighting of each of the (x,y) pairs for regression

    order : polynomial_order
        The order of the fit

    Returns
    -------
    {intercept,slope,power} : float
        The coefficients of the fit. power is 0 if the order is linear.

    r_sqr : float
        The coefficient of determination
    """
    # calculate the trendline
    coeffs = np.polyfit(x, y, order.value, w=w)

    intercept = coeffs[-1]
    slope = coeffs[-2]
    power = coeffs[0] if order != polynomial_order.linear else 0

    # calculate R^2
    p = np.poly1d(coeffs)
    ybar = np.sum(y) / len(y)
    ssreg = np.sum((p(x) - ybar) ** 2)
    sstot = np.sum((y - ybar) ** 2)
    r_sqr = ssreg / sstot

    return intercept, slope, power, r_sqr
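# A quick check (editor's sketch): fitting a noiseless line recovers the
# intercept and slope exactly, with r_sqr equal to 1.
def _example_fit_with_least_squares():
    x = np.arange(10, dtype=float)
    y = 2.0 * x + 1.0
    return fit_with_least_squares(x, y)  # approximately (1.0, 2.0, 0, 1.0)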
###
# Sampling helpers
###
def sample_dirichlet_multinomial(
        dirichlet_alphas:np.ndarray,
        num_samples:int) -> np.ndarray:
    """ Sample from a Dirichlet-multinomial distribution

    Parameters
    ----------
    dirichlet_alphas : numpy.ndarray
        The concentration parameters for the Dirichlet distribution

    num_samples : int
        The number of samples to draw for the multinomial

    Returns
    -------
    counts : numpy.ndarray
        The number of counts from each level in the multinomial. This will
        be the same size as `dirichlet_alphas`.
    """
    pvals = np.random.dirichlet(dirichlet_alphas)
    sampled_counts = np.random.multinomial(n=num_samples, pvals=pvals)
    return sampled_counts
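# Example usage (editor's sketch; the concentration parameters are
# illustrative): the returned counts always sum to num_samples.
def _example_dirichlet_multinomial():
    alphas = np.array([1.0, 1.0, 2.0])
    return sample_dirichlet_multinomial(alphas, num_samples=100)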
def sample_beta_binomial(
        alpha:float,
        beta:float,
        n_samples:int=1,
        size:numpy_size=None) -> sampling_return_type:
    """ Sample from a beta-binomial distribution

    Parameters
    ----------
    alpha, beta : float
        The alpha and beta shape parameters of the beta distribution. They
        must be positive.

    n_samples : int
        The number of samples to draw from this distribution

    size : typing.Optional[typing.Union[int, typing.Tuple[int]]]
        The output shape of the beta distribution. Please see the
        :obj:`numpy.random.beta` documentation for details.

    Returns
    -------
    samples : typing.Union[int, numpy.ndarray]
        The samples from the beta-binomial distribution

    Note
    ----
    In order to see the outcome of multiple Bernoulli samples, the `size`
    parameter should be set to the number of desired samples, while
    `n_samples` should be set to `1`.
    """
    p = np.random.beta(alpha, beta, size=size)
    r = np.random.binomial(n_samples, p)
    return r
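# Example usage (editor's sketch) following the note above: n_samples=1
# with size=1000 gives 1000 Bernoulli-style outcomes whose mean should be
# roughly alpha / (alpha + beta), about 0.29 here.
def _example_beta_binomial():
    samples = sample_beta_binomial(alpha=2.0, beta=5.0, n_samples=1, size=1000)
    return np.mean(samples)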
def sample_gamma_poisson(
        mean:float,
        var:float,
        size:numpy_size=None) -> sampling_return_type:
    """ Sample from a gamma-Poisson distribution

    Parameters
    ----------
    mean, var : float
        The respective parameters of the gamma distribution. Both must be
        positive.

        **N.B.** The gamma is usually parameterized by a "shape" and
        "scale" (usually called :math:`k` and :math:`\\theta`,
        respectively) or a "shape" and "rate" (usually called
        :math:`\\alpha` and :math:`\\beta`, respectively). This
        implementation uses simple arithmetic operations to derive the
        shape and scale from the mean and variance. Specifically, we have
        the following relationships:

        .. math::
            k = \\frac{\\mu^2}{\\sigma^2}

            \\theta = \\frac{\\sigma^2}{\\mu}

    size : typing.Optional[typing.Union[int, typing.Tuple[int]]]
        The output shape of the gamma distribution. Please see the
        :obj:`numpy.random.gamma` documentation for details.

    Returns
    -------
    samples : typing.Union[int, numpy.ndarray]
        The samples from the gamma-Poisson distribution

    Note
    ----
    In order to see the outcome of multiple Poisson samples, the `size`
    parameter should be set to the number of desired samples.
    """
    shape = (mean * mean) / var
    scale = var / mean

    lam = np.random.gamma(shape, scale, size=size)
    r = np.random.poisson(lam)
    return r
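# Example usage (editor's sketch; the parameters are illustrative): the
# sample mean should be close to the gamma's mean, while the sample
# variance is larger than the gamma's variance because the Poisson step
# adds its own (roughly mean + var in total).
def _example_gamma_poisson():
    samples = sample_gamma_poisson(mean=10.0, var=20.0, size=10000)
    return np.mean(samples), np.var(samples)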