# Source code for openpathsampling.numerics.resampling_statistics

"""
Tools for resampling functions that output pandas.DataFrame objects.

Typically, you use these tools in 3 steps:

1. Create resampling groups of data, using, e.g., BlockResampling
2. Create a function that maps a list of input data into the desired output
   DataFrame
3. Create a ResamplingStatistics object using the `inputs` from step 1 and
   the `function` from step 2.
"""

import numpy as np
import itertools
import pandas as pd

import logging
logger = logging.getLogger(__name__)

# NOTE: there may be a better way to do this, by converting the results to
# numpy arrays and using the numpy functions. However, you'd have to be very
# careful that the rows and columns still correspond to the same things,
# i.e., that there's no permutation of index order. Using pandas protects us
# from such problems. Alternatively, it may be possible to bunch these into
# a pandas.Panel, and use pandas functions. All of this requires some
# performance testing; the advantage of the approach below is that when we
# want both mean and std, the mean can be calculated first and passed to the
# std to speed it up. (However, performance probably won't matter much for
# the things we'll be doing.)

def mean_df(objects):
    """Element-wise arithmetic mean of a list of DataFrames.

    Parameters
    ----------
    objects : list of pandas.DataFrame
        the items to average. (NB: technically, these don't have to be
        DataFrames; anything that supports addition via the builtin `sum`
        and division by a float will work.)

    Returns
    -------
    pandas.DataFrame
        the mean of each element across the input DataFrames
    """
    # Accumulate first, then normalize by the number of items (as a float,
    # so the division is never an integer division).
    total = sum(objects)
    count = float(len(objects))
    return total / count

def std_df(objects, mean_x=None):
    """Standard deviation of a list of DataFrames.

    This is the population standard deviation (no Bessel correction),
    computed element-wise as ``sqrt(mean(x**2) - mean(x)**2)``.

    Parameters
    ----------
    objects : list of pandas.DataFrame
        the DataFrames to calculate the standard deviation
    mean_x : pandas.DataFrame
        (optional) pre-calculated mean of the `objects` list. If None
        (default), then the mean will be calculated.

    Returns
    -------
    pandas.DataFrame
        the standard deviation of each element in the dataframe
    """
    if mean_x is None:
        mean_x = mean_df(objects)
    mean_sq = mean_df([obj**2 for obj in objects])
    variance = mean_sq - mean_x**2
    # Floating-point round-off can make the variance of (nearly) identical
    # inputs come out as a tiny negative number, whose square root would be
    # NaN. Clip at zero so those entries report a standard deviation of 0.
    variance = variance.clip(lower=0)
    # Apply the ufunc to the whole frame; this returns a DataFrame with the
    # same labels and avoids the deprecated DataFrame.applymap.
    return np.sqrt(variance)


class ResamplingStatistics(object):
    """
    Contains and organizes resampled statistics.

    `function` is applied to every element of `inputs` when the object is
    created. The summary statistics are calculated on first access and
    cached afterwards.

    Attributes
    ----------
    results : list of pandas.DataFrame
        the result DataFrames after applying the function
    mean : pandas.DataFrame
        DataFrame containing mean of the result DataFrames
    std : pandas.DataFrame
        DataFrame containing standard deviation of the result DataFrames
    index : pandas.Index
        row labels of `mean`
    columns : pandas.Index
        column labels of `mean`
    sorted_series : dict
        maps each ``(row label, column label)`` pair to a pandas.Series
        holding that element's values from all results, sorted ascending

    Parameters
    ----------
    function : callable
        the function to apply the statistics to; must take one item from
        the list `inputs` and return a pandas.DataFrame
    inputs : list
        each element of inputs can be used as input to `function`
    """

    def __init__(self, function, inputs):
        self.function = function
        self.inputs = inputs
        # The function is evaluated eagerly, once per input.
        self.results = [self.function(inp) for inp in self.inputs]
        # Lazily-filled caches for the derived quantities below.
        self._mean = None
        self._std = None
        self._sorted_series = None

    @property
    def mean(self):
        """Mean of the result DataFrames (cached after first access)."""
        if self._mean is None:
            self._mean = mean_df(self.results)
        return self._mean

    @property
    def std(self):
        """Standard deviation of the result DataFrames (cached)."""
        if self._std is None:
            # Reuse the cached mean so it isn't calculated a second time.
            self._std = std_df(self.results, mean_x=self.mean)
        return self._std

    @property
    def index(self):
        """Row labels, taken from the mean DataFrame."""
        return self.mean.index

    @property
    def columns(self):
        """Column labels, taken from the mean DataFrame."""
        return self.mean.columns

    @property
    def sorted_series(self):
        """Per-element values across all results, sorted ascending.

        Returns a dict keyed by ``(row label, column label)``; built once
        and cached.
        """
        if self._sorted_series is None:
            self._sorted_series = {
                loc: pd.Series(
                    [df.loc[loc] for df in self.results]
                ).sort_values()
                for loc in itertools.product(self.index, self.columns)
            }
        return self._sorted_series

    def percentile(self, percent):
        """Percentile, using a nearest-rank style lookup.

        For each element, the values from all results are sorted and the
        one at zero-based position ``int(percent / 100 * n_results)`` is
        selected. The position is clamped to the valid range, so
        ``percent >= 100`` gives the largest value and ``percent <= 0``
        gives the smallest.

        Parameters
        ----------
        percent : float
            the percentile desired

        Returns
        -------
        pd.DataFrame
            DataFrame whose elements are the values at that percentile
            (each element is selected independently)
        """
        n_entries = len(self.results)
        raw_rank = int(percent / 100.0 * n_entries)
        # Clamp at both ends. Without the lower bound a negative percent
        # would give a negative position, which .iloc interprets as
        # counting from the end (i.e., the largest values).
        rank = max(0, min(raw_rank, n_entries - 1))
        df = pd.DataFrame(index=self.index, columns=self.columns)
        for idx in self.index:
            for col in self.columns:
                df.loc[idx, col] = self.sorted_series[(idx, col)].iloc[rank]
        return df



class BlockResampling(object):
    """Select samples according to block resampling.

    The samples are cut, in order, into `n_blocks` consecutive blocks of
    `n_per_block` samples each. If neither n_blocks nor n_per_block are set
    (as is the default behavior) then n_blocks=20 is used. If only one of
    them is given, the other is obtained by integer division of the total
    number of samples. The blocks are always of the same size; if the
    number of samples doesn't divide evenly, then the extra samples are
    placed in the `unassigned` attribute.

    If both n_blocks and n_per_block are given, they are used as-is,
    without checking them against the number of samples. If there are
    fewer samples than n_blocks (and n_per_block is not given), then
    n_per_block is 0: every block is empty and all samples are unassigned.

    Parameters
    ----------
    all_samples : list
        list of all samples
    n_blocks : int
        number of blocks (resampling sets)
    n_per_block : int
        number of samples per block

    Attributes
    ----------
    n_total_samples : int
        number of samples in `all_samples`
    n_blocks : int
        number of blocks actually used
    n_per_block : int
        number of samples per block actually used
    blocks : list
        the blocks; each is a slice of `all_samples`
    unassigned : list
        samples after the last block, not part of any block
    n_resampled : int
        `n_total_samples` minus the number of unassigned samples
    """

    def __init__(self, all_samples, n_blocks=None, n_per_block=None):
        self.n_total_samples = len(all_samples)
        if n_blocks is None and n_per_block is None:
            n_blocks = 20

        # At this point at least one of the two is set; derive the other
        # one if it is missing.
        if n_blocks is None:
            n_blocks = self.n_total_samples // n_per_block
        elif n_per_block is None:
            n_per_block = self.n_total_samples // n_blocks

        self.n_blocks = n_blocks
        self.n_per_block = n_per_block
        self.blocks = [all_samples[i*n_per_block:(i+1)*n_per_block]
                       for i in range(n_blocks)]
        # Whatever is left after the last full block.
        self.unassigned = all_samples[n_blocks*n_per_block:]
        self.n_resampled = self.n_total_samples - len(self.unassigned)