Source code for openpathsampling.numerics.resampling_statistics

"""
Tools for resampling functions that output pandas.DataFrame objects.

Typically, you use these tools in 3 steps:

1. Create resampling groups of data, using, e.g., BlockResampling
2. Create a function that maps a list of input data into the desired output
   DataFrame
3. Create a ResamplingStatistics object using the `inputs` from step 1 and
   the `function` from step 2.
"""

import numpy as np
import itertools
import pandas as pd

import logging
logger = logging.getLogger(__name__)

# NOTE: there may be a better way to do this, by converting the results to
# numpy arrays and using the numpy functions. However, you'd have to be very
# careful that the rows and columns still correspond to the same things,
# i.e., that there's no permutation of index order. Using pandas protects us
# from such problems. Alternatively, it may be possible to bunch these into
# a pandas.Panel, and use pandas functions. All requires some performance
# testing; the advantage of the approach below is that when we want both
# mean and std, the mean can be calculated first and passed to the std to
# speed it up. (However, performance probably won't matter much for the
# things we'll be doing).

def mean_df(objects):
    """Arithmetic mean of a list of DataFrames, taken element by element.

    Parameters
    ----------
    objects : list of pandas.DataFrame
        the items to average. (NB: technically, these don't have to be
        DataFrames; anything that can be added together with the builtin
        `sum` and then divided by a float will work.)

    Returns
    -------
    pandas.DataFrame :
        the mean of each element across the input DataFrames
    """
    total = sum(objects)
    count = float(len(objects))
    return total / count

def std_df(objects, mean_x=None):
    """Standard deviation of a list of DataFrames.

    The variance of each element is computed as
    ``mean(x**2) - mean(x)**2`` (i.e., the population variance, with no
    Bessel correction), and the square root is then taken element by
    element.

    Parameters
    ----------
    objects : list of pandas.DataFrame
        the DataFrames to calculate the standard deviation
    mean_x : pandas.DataFrame
        (optional) pre-calculated mean of the `objects` list. If None
        (default), then the mean will be calculated.

    Returns
    -------
    pandas.DataFrame
        the standard deviation of each element in the dataframe
    """
    if mean_x is None:
        mean_x = mean_df(objects)
    variance = mean_df([o**2 for o in objects]) - mean_x**2

    def _sqrt_nonneg(value):
        # mean(x**2) - mean(x)**2 is never negative mathematically, but
        # floating-point cancellation can leave a tiny negative number
        # when all samples agree; treat that as zero instead of NaN.
        return np.sqrt(0.0 if value < 0 else value)

    # DataFrame.applymap was renamed DataFrame.map in pandas 2.1 and the old
    # name is deprecated; use the new name when this pandas provides it.
    elementwise = getattr(variance, "map", None)
    if elementwise is None:
        elementwise = variance.applymap
    return elementwise(_sqrt_nonneg)

class ResamplingStatistics(object):
    """
    Contains and organizes resampled statistics.

    The function is applied to every element of `inputs` when the object
    is created; the summary statistics are calculated on first access and
    then cached.

    Attributes
    ----------
    results : list of pandas.DataFrame
        the result DataFrames after applying the function
    mean : pandas.DataFrame
        DataFrame containing mean of the result DataFrames
    std : pandas.DataFrame
        DataFrame containing standard deviation of the result DataFrames
    index : pandas.Index
        row labels, taken from `mean`
    columns : pandas.Index
        column labels, taken from `mean`
    sorted_series : dict
        maps each ``(row, column)`` label pair to a pandas.Series of the
        values found at that location in every result, sorted ascending

    Parameters
    ----------
    function : callable
        the function to apply the statistics to; must take one item from
        the list `inputs` and return a pandas.DataFrame
    inputs : list
        each element of inputs can be used as input to `function`
    """
    def __init__(self, function, inputs):
        self.function = function
        self.inputs = inputs
        # all resampled results are computed eagerly, in input order
        self.results = [self.function(inp) for inp in self.inputs]
        # caches for the lazily-evaluated properties below
        self._mean = None
        self._std = None
        self._sorted_series = None

    @property
    def mean(self):
        if self._mean is None:
            self._mean = mean_df(self.results)
        return self._mean

    @property
    def std(self):
        if self._std is None:
            # reuse the (cached) mean so it isn't calculated twice
            self._std = std_df(self.results, mean_x=self.mean)
        return self._std

    @property
    def index(self):
        return self.mean.index

    @property
    def columns(self):
        return self.mean.columns

    @property
    def sorted_series(self):
        if self._sorted_series is None:
            # one sorted Series per (row, column) cell, gathering that
            # cell's value from every result DataFrame
            self._sorted_series = {
                loc: pd.Series(
                    [df.loc[loc] for df in self.results]
                ).sort_values()
                for loc in itertools.product(self.index, self.columns)
            }
        return self._sorted_series

    def percentile(self, percent):
        """Percentile, using Nearest Rank method.

        Each cell is treated independently: the values found in that cell
        across all results are sorted, and the entry at position
        ``int(percent / 100 * n_results)`` (capped at the last entry) is
        selected.

        Parameters
        ----------
        percent : float
            the percentile desired

        Returns
        -------
        pd.DataFrame
            DataFrame where each cell holds the selected value for that
            cell
        """
        n_entries = len(self.results)
        # zero-based position in the sorted values; the cap keeps
        # percent=100 (or more) on the largest entry
        rank = min(int(percent / 100.0 * n_entries), n_entries - 1)
        df = pd.DataFrame(index=self.index, columns=self.columns)
        for idx in self.index:
            for col in self.columns:
                # .iloc is positional, so the original labels retained by
                # sort_values() don't matter here
                df.loc[idx, col] = self.sorted_series[(idx, col)].iloc[rank]
        return df
class BlockResampling(object):
    """Select samples according to block resampling.

    If neither n_blocks nor n_per_block are set (as is the default
    behavior) then n_blocks=20 is used. When only one of the two is given,
    the other is obtained by integer division of the total number of
    samples, so all blocks have the same size; if the number of samples
    doesn't divide evenly, then the extra samples are placed in the
    `unassigned` attribute. When both are given, they are used as provided.

    Parameters
    ----------
    all_samples : list
        list of all samples
    n_blocks : int
        number of blocks (resampling sets)
    n_per_block : int
        number of samples per block

    Attributes
    ----------
    n_total_samples : int
        length of `all_samples`
    n_blocks : int
        number of blocks actually used
    n_per_block : int
        number of samples per block actually used
    blocks : list
        consecutive slices of `all_samples`, each `n_per_block` long
    unassigned : list
        the samples after the last block
    n_resampled : int
        `n_total_samples` minus the number of unassigned samples
    """
    def __init__(self, all_samples, n_blocks=None, n_per_block=None):
        self.n_total_samples = len(all_samples)
        if n_blocks is None and n_per_block is None:
            n_blocks = 20

        # fill in whichever of the two sizes is still missing
        if n_blocks is None and n_per_block is not None:
            n_blocks = self.n_total_samples // n_per_block
        elif n_blocks is not None and n_per_block is None:
            n_per_block = self.n_total_samples // n_blocks

        self.n_blocks = n_blocks
        self.n_per_block = n_per_block
        # block i is the i-th consecutive run of n_per_block samples
        self.blocks = [all_samples[i*n_per_block:(i+1)*n_per_block]
                       for i in range(n_blocks)]
        # whatever is left after the last block
        self.unassigned = all_samples[n_blocks*n_per_block:]
        self.n_resampled = self.n_total_samples - len(self.unassigned)