Source code for openpathsampling.numerics.wham

#!/usr/bin/env python

from __future__ import print_function

# wham.py
# The Weighted Histogram Analysis Method (WHAM) for combining histograms
# from several files.
import pandas as pd
import numpy as np
import sys
import logging
logger = logging.getLogger(__name__)



[docs]
class WHAM(object):
    """
    Weighted Histogram Analysis Method

    Notes
    -----
        Several parts of the docstrings mention F&S, which is intended to
        refer the reader to reference [1]_, in particular pages 184-187 in
        the 2nd edition (section called "Self-Consistent Histogram Method").

        Other terminology: n_hists refers to the number of histograms,
        n_bins refers to the number of bins per histogram. Thus the input is
        a matrix of n_bins rows and n_hists columns.


    References
    ----------
    .. [1] Daan Frenkel and Berend Smit. Understanding Molecular Simulation:
       From Algorithms to Applications. 2nd Edition. 2002.

    Parameters
    ----------
    tol : float
        tolerance for convergence or equality. default 10e-10
    max_iter : int
        maximum number of iterations. Default 1000000
    cutoff : float
        windowing cutoff, as fraction of maximum value. Default 0.05

    Attributes
    ----------
    sample_every : int
        frequency (in iterations) to report debug information
    """

[docs]
    def __init__(self, tol=1e-10, max_iter=1000000, cutoff=0.05,
                 interfaces=None):
        self.tol = tol
        self.max_iter = max_iter
        self.cutoff = cutoff
        self.interfaces = interfaces

        self.sample_every = max_iter + 1
        self._float_format = "10.8"
        self.lnZ = None


    @property
    def float_format(self):
        """Float output format. Example: 10.8 (default)"""
        return lambda x: "{:" + self._float_format + "f}".format(x)

    @float_format.setter
    def float_format(self, value):
        self._float_format = value

    def load_files(self, fnames):  # pragma: no cover
        """Load a file or files into the internal structures.

        Requires either pandas or something else with pandas-like read_table
        and concat functions.

        Parameters
        ----------
        fnames : string or list of string
            file(s) to read in; each gives a column of the dataframe, with
            common indexes (keys)

        Returns
        -------
        pd.DataFrame
            dataframe with each input histogram in a column
        """
        # TODO: add some validation of formatting
        frames = []
        try:
            for fname in fnames:
                frames.append(pd.read_table(fname, index_col=0, sep=" ",
                                            usecols=[0, 1], header=None))
        except TypeError:
            frames.append(pd.read_table(fnames, index_col=0, sep=" ",
                                        usecols=[0, 1], header=None))
            fnames = [fnames]
        df = pd.concat(frames, axis=1)
        df.columns = fnames
        return df

    def prep_reverse_cumulative(self, df, cutoff=None, tol=None):
        """
        Created cleaned dataframe for further analysis.

        This version assumes that the input is the result of a reversed
        cumulative histogram. That means that it cleans leading input where
        the initial

        Parameters
        ----------
        df : pandas.DataFrame
            input dataframe
        cutoff : float
            windowing cutoff, as fraction of maximum value
        tol : float
            tolerance for two values being "equal"


        Returns
        -------
        pandas.DataFrame
            version of the dataframe with leading constant values removed
            and without anything under the cutoff. This is often referred to
            as the "cleaned input dataframe" in other functions
        """
        if cutoff is None:
            cutoff = self.cutoff
        if tol is None:
            tol = self.tol

        # clear things that don't pass the cutoff
        hist_max = df.max(axis=0)
        raw_cutoff = cutoff*hist_max
        cleaned_df = df.apply(
            lambda s: [x if x > raw_cutoff[s.name] else 0.0 for x in s]
        )

        if self.interfaces is not None:
            # use the interfaces values to set anything before that value to
            # zero
            if type(self.interfaces) is not pd.Series:
                self.interfaces = pd.Series(data=self.interfaces,
                                            index=df.columns)
            greater_almost_equal = lambda a, b: (a >= b or
                                                 abs(a - b) < 10e-10)
            cleaned_df = cleaned_df.apply(
                lambda s: [
                    (
                        s.iloc[i]
                        if greater_almost_equal(s.index[i],
                                                self.interfaces[s.name])
                        else 0.0
                    )
                    for i in range(len(s))
                ]
            )
        else:
            # clear duplicates of leading values
            test_f = lambda val1, val2, val_max: (
                abs(val1 - val2) > tol or abs(val1 - val_max) > tol
            )
            cleaned_df = cleaned_df.apply(
                lambda s: [
                    s.iloc[i]
                    if test_f(s.iloc[i], s.iloc[i+1], s.max()) else 0.0
                    for i in range(len(s)-1)
                ] + [s.iloc[-1]]
            )
        return cleaned_df

    def unweighting_tis(self, cleaned_df):
        """
        Calculates the "unweighting" values for each histogram.

        In TIS, this is just 1 if there is a non-zero entry in cleaned_df,
        and 0 otherwise.

        (NB: it isn't quite clear to me why this is a matrix instead of a
        vector, but the previous code accounted for a dependence on the
        bin of the histogram)

        Parameters
        ----------
        cleaned_df : pandas.DataFrame
            cleaned input dataframe

        Returns
        -------
        pandas.DataFrame
            unweighting values for the input dataframe
        """
        unweighting = cleaned_df.copy().applymap(
            lambda x: 1.0 if x > 0.0 else 0.0
        )
        return unweighting

    def sum_k_Hk_Q(self, cleaned_df):
        r"""Sum over histograms for each histogram bin. Length is n_bins

        Called ``sum_hist`` in other codes, or :math:`\sum_k H_k(Q)` in F&S.
        This is the sum over histograms of values for a given histogram bin.

        Parameters
        ----------
        cleaned_df : pandas.DataFrame
            cleaned input dataframe

        Returns
        -------
        pandas.Series
            sum over histograms for each bin (length is n_bins)
        """
        return cleaned_df.sum(axis=1)

    def n_entries(self, cleaned_df):
        """List of counts of entries per histogram. Length is n_hists

        The list of counts of entries. In other codes, this is `nt`. In F&S,
        this is :math:`M_k`.

        Parameters
        ----------
        cleaned_df : pandas.DataFrame
            cleaned input dataframe

        Returns
        -------
        pandas.Series
            List of counts of entries per histogram (length n_hists)
        """
        return cleaned_df.sum(axis=0)

    def weighted_counts_tis(self, unweighting, n_entries):
        """
        Product of unweighting and n_entries.

        In F&S, this is :math:`e^{-\\beta W_k} M_k`. This value appears as
        part of a sum in the WHAM iteration equation (F&S 2nd edition
        Eq. 7.3.10). The product can be pre-calculated, which is what we do
        here.

        As for this being a matrix (not a vector), see note on
        :meth:`.unweighting`.

        Returns
        -------
        pandas.DataFrame
            weighted counts matrix, size n_hists by n_dims
        """
        weighted_counts = unweighting.apply(lambda s: [x * n_entries[s.name]
                                                       for x in s])
        return weighted_counts

    def generate_lnZ(self, lnZ, unweighting, weighted_counts, sum_k_Hk_Q,
                     tol=None):
        r"""
        Perform the WHAM iteration to estimate ln(Z_i) for each histogram.

        Parameters
        ----------
        lnZ : pandas.Series, one per histogram (length n_hists)
            initial guess for ln(Z_i) for each histogram i
        unweighting : pandas.DataFrame, n_bins by n_hists
            the unweighting matrix for each histogram point. For TIS, this
            is 1 if the (cleaned) DF has an entry; 0 otherwise. In F&S, this
            is :math:`\exp(-\\beta W_i)`. See :meth:`.unweighting`.
        weighted_counts : pandas.DataFrame, n_bins by n_hists
            the weighted matrix multiplied by the counts per histogram. See
            :meth:`.weighted_counts_tis`.
        sum_k_Hk_Q : pandas.Series, one per bin (length n_bins)
            Sum over histograms for each histogram bin. See
            :meth:`.sum_k_Hk_Q`.
        tol : float
            convergence tolerance

        Returns
        -------
        pandas.Series
            the resulting WHAM calculation for ln(Z_i) for each histogram i
        """
        if tol is None:
            tol = self.tol
        diff = self.tol + 1  # always start above the tolerance
        iteration = 0
        hists = weighted_counts.columns
        # TODO: probably faster if we make wc this a sparse matrix
        wc = weighted_counts.values
        unw = unweighting.values
        lnZ_old = pd.Series(data=lnZ, index=hists)
        Z_new = pd.Series(index=hists, dtype='float64')
        sum_k_Hk_byQ = sum_k_Hk_Q.values
        while diff > tol and iteration < self.max_iter:
            Z_old = np.exp(lnZ_old)
            reciprocal_Z_old = (1.0 / Z_old).values
            for i in range(len(hists)):
                #############################################################
                # this is equation 7.3.10 in F&S
                # Z_i^{(new)} =
                #    \int \dd{Q} w_{i,Q}
                #    \times \frac{\sum_{j=1}^n H_j(Q)}
                #                {\sum_{k=1}^n w_{k,Q} M_k / Z_k^{(old)}}
                # where F&S explicitly use w_{i,Q} = e^{-\beta W_i}
                #
                # Matching terms from F&S to our variables:
                #   w_i = w_{i,Q} = $e^{-\beta W_i}$
                #       * this is a column (len == n_bins) from a matrix
                #         size n_hists \times n_bins
                #       * from "unweighting", which is Boltzmann in umbrella
                #         sampling (F&S), but 1 or 0 in TIS
                #       * see unweighting_tis: not entirely clear why this
                #         is a matrix, not an vector length n_hists
                #   sum_k_Hk_byQ = $\sum_{j=1}^n H_j(Q)$
                #       * this is a function of Q, thus len == n_bins
                #   wc = w_{k,Q} * M_k = $e^{-\beta W_k} M_k$
                #       * note that this is element-wise multiplication
                #       * matrix, size n_hists \times n_bins
                #   reciprocal_Z_old = $1/Z_k^{(old)}$
                #       * vector, len == n_hists
                #
                # Note that we do all of this with matrix/vector
                # multiplication, which numpy can do very quickly.
                #############################################################

                w_i = unw[:, i]

                # numerator: w_{i,Q} * sum_k_Hk_byQ
                numerator_byQ = np.multiply(w_i, sum_k_Hk_byQ)

                # denominator: wc * Z^{-1}
                sum_over_Z_byQ = wc.dot(reciprocal_Z_old)

                # divide each entry, and add them (integrate over Q in F&S)
                # we intentially allow invalid (0/0) to give NaN; gets
                # removed by using the np.nansum)
                with np.errstate(divide='ignore', invalid='ignore'):
                    addends_k = np.divide(numerator_byQ, sum_over_Z_byQ)
                Z_new[hists[i]] = np.nansum(addends_k)

            lnZ_new = np.log(Z_new)

            iteration += 1
            diff = self.get_diff(lnZ_old, lnZ_new, iteration)
            lnZ_old = lnZ_new - lnZ_new.iloc[0]

        logger.info("iterations=" + str(iteration) + " diff=" + str(diff))
        logger.info("       lnZ=" + str(lnZ_old))
        self.convergence = (iteration, diff)
        return lnZ_old

    def get_diff(self, lnZ_old, lnZ_new, iteration):
        """Calculate the difference for this iteration.

        Also outputs debug information, if desired.

        Parameters
        ----------
        lnZ_old : pandas.Series
            previous value of ln(Z_i)
        lnZ_new : pandas.Series
            new value of ln(Z_i)
        iteration : int
            iteration number

        Returns
        -------
        float
            difference between old and new to use for convergence testing
        """
        # get error
        diff = 0
        diff = sum(abs(lnZ_old - lnZ_new))
        # check status (mainly for debugging)
        if (iteration % self.sample_every == 0):  # pragma: no cover
            logger.debug("niteration = " + str(iteration))
            logger.debug("  diff = " + str(diff))
            logger.debug("   lnZ = " + str(lnZ_old))
            logger.debug("lnZnew = " + str(lnZ_new))
        return diff

    def output_histogram(self, lnZ, sum_k_Hk_Q, weighted_counts):
        """Recombine the data into a joined histogram

        Parameters
        ----------
        lnZ : pandas.Series, one per histogram (length n_hists)
            value of ln(Z_i) for each histogram i
        sum_k_Hk_Q : pandas.Series, one per bin (length n_bins)
            Sum over histograms for each histogram bin. See
            :meth:`.sum_k_Hk_Q`.
        weighted_counts : pandas.DataFrame, n_bins by n_hists
            the weighted matrix multiplied by the counts per histogram. See
            :meth:`.weighted_counts_tis`.

        Returns
        -------
        pandas.Series
            the WHAM-reweighted combined histogram, unnormalized
        """
        Z = np.exp(lnZ)
        Z0_over_Zi = Z.iloc[0] / Z
        output = pd.Series(index=sum_k_Hk_Q.index, name="WHAM",
                           dtype='float64')
        for val in sum_k_Hk_Q.index:
            sum_w_over_Z = sum([
                weighted_counts.loc[val, hist_i] * Z0_over_Zi[hist_i]
                for hist_i in Z.index
            ])
            # explicitly allow NaN results for simplcity (should only occur
            # when numerator and denominator are 0) ... this will leave NaNs
            # in the histogram in those locations; if all values of the
            # total histogram are NaN, that gets caught in the main
            # wham_bam_histogram routine
            with np.errstate(divide='ignore', invalid='ignore'):
                output[val] = sum_k_Hk_Q[val] / sum_w_over_Z

        return output

    @staticmethod
    def normalize_cumulative(series):
        """Normalize to maximum value"""
        return series/series.max()

    def guess_lnZ_crossing_probability(self, cleaned_df):
        """Guess ln(Z_i) based on crossing probabilities

        Parameters
        ----------
        cleaned_df : pandas.DataFrame
            cleaned input dataframe representing crossing probabilities

        Returns
        -------
        pandas.Series
            initial guess for values of ln(Z_i) for each histogram
        """
        df = cleaned_df.apply(lambda s: s/s.max())
        # pandas magic, see http://stackoverflow.com/questions/18327624
        first_nonzero = df.apply(lambda s: s[s == 1.0].index[0])
        df = df.loc[first_nonzero]
        guess_nextZ_over_Z = df.apply(lambda s: s.nlargest(2).iloc[1])
        guess_Z_data = [1.0]
        for i in range(len(guess_nextZ_over_Z)-1):
            guess_Z_data.append(guess_Z_data[-1] * guess_nextZ_over_Z.iloc[i])
        guess_lnZ = pd.Series(data=np.log(np.array(guess_Z_data)),
                              index=guess_nextZ_over_Z.index)
        return guess_lnZ

    def check_cleaned_overlaps(self, cleaned_df):
        """
        Check that all the histograms have sufficient overlaps.

        Parameters
        ----------
        cleaned_df : pandas.DataFrame
            cleaned input dataframe representing crossing probabilities

        Raises
        ------
        RuntimeError
            if the input doesn't have enough of an overlap
        """
        for col_idx in range(1, len(cleaned_df.columns)):
            col = cleaned_df.columns[col_idx]
            prev_col = cleaned_df.columns[col_idx - 1]

            col_data = cleaned_df[col]
            prev_data = cleaned_df[prev_col]

            first_nonzero = col_data[col_data != 0.0].index[0]
            if not prev_data[first_nonzero] > 0.0:
                # use not and > to account for NaNs
                raise RuntimeError(
                    "Insufficient overlap to combine histograms.\n"
                    + "This is either due to poor sampling or bad "
                    + "interface placement.\n" + "Row: "
                    + str(first_nonzero) + "   Column: " + str(col) + "\n"
                    + str(cleaned_df))

    def wham_bam_histogram(self, input_df):
        """
        Perform the entire wham process.

        Parameters
        ----------
        input_df : pandas.DataFrame
            input dataframe

        Returns
        -------
        pandas.Series
            the WHAM-reweighted combined histogram, normalized so max value
            is 1
        """
        cleaned = self.prep_reverse_cumulative(input_df)
        self.check_cleaned_overlaps(cleaned)
        guess = self.guess_lnZ_crossing_probability(cleaned)
        sum_k_Hk_Q = self.sum_k_Hk_Q(cleaned)
        n_entries = self.n_entries(cleaned)
        unweighting = self.unweighting_tis(cleaned)
        weighted_counts = self.weighted_counts_tis(unweighting,
                                                   n_entries)
        try:
            lnZ = self.generate_lnZ(guess, unweighting, weighted_counts,
                                    sum_k_Hk_Q)
        except IndexError as e:  # pragma: no cover
            # I don't think this can happen any more, but leave it in case
            # (Now the check_cleaned_overlaps should catch this problem.)
            failmsg = "Does your input to WHAM have enough data?"
            if not e.args:
                e.args = [failmsg]
            else:
                arg0 = e.args[0]+"\n"+failmsg
                e.args = tuple([arg0] + list(e.args[1:]))
                raise e

        hist = self.output_histogram(lnZ, sum_k_Hk_Q, weighted_counts)
        result = self.normalize_cumulative(hist)
        self.lnZ = lnZ
        if sum(pd.isnull(result)) == len(result):  # pragma: no cover
            # last safety check
            raise RuntimeError("WHAM result is all NaN. Reason unknown.")
        return result



def parsing(parseargs):  # pragma: no cover
    # TODO: switch to argparse.
    import optparse
    parser = optparse.OptionParser()
    parser.add_option("--tol", type="float", default=1e-12)
    parser.add_option("--max_iter", type="int", default=1000000)
    parser.add_option("--cutoff", type="float", default=0.05)
    parser.add_option("--pstats", type="string", default=None)
    parser.add_option("--float_format", type="string", default="10.8")
    opts, args = parser.parse_args(parseargs)
    return opts, args


if __name__ == "__main__":  # pragma: no cover
    opts, args = parsing(sys.argv[1:])
    wham = WHAM(tol=opts.tol, max_iter=opts.max_iter, cutoff=opts.cutoff)
    wham.float_format = opts.float_format
    df = wham.load_files(args)

    # keep around some stuff to allow us to do benchmarking and profiling
    if opts.pstats is not None:
        import cProfile
        import time
        start = time.time()
        cProfile.run("print wham.wham_bam_histogram(df)", opts.pstats)
        print(time.time() - start)
    else:
        wham_hist = wham.wham_bam_histogram(df)
        print(wham_hist.to_string(
            header=False,
            float_format=lambda x: "{:10.8f}".format(x)
        ))