Source code for openalea.stat_tool.comparison

#!/usr/bin/env python
#-*- coding: utf-8 -*-
"""Comparison

.. topic:: comparison.py summary

    A module dedicated to Comparison tests

    :Code status: mature
    :Documentation status: to be completed
    :Author: Thomas Cokelaer <Thomas.Cokelaer@sophia.inria.fr>

    :Revision: $Id$
    
"""
__version__ = "$Id$"

from . import error

from .enums import (
    variable_type,
    output_format
)


from openalea.stat_tool._stat_tool import (
    _CompoundData,
    _DiscreteDistributionData,
    _DiscreteMixtureData,
    _ConvolutionData,
    _Vectors,
    _VectorDistance,
    _FrequencyDistribution
)

# Names exported by ``from openalea.stat_tool.comparison import *``.
# compare_histo and compare_vectors are not listed; Compare dispatches to them.
__all__ = ['Compare', 'ComparisonTest']


def compare_histo(histo, *args, **kargs):
    """Comparison of frequency distributions.

    The positional arguments following `histo` are the other frequency
    distributions to compare with, and the last positional argument is the
    variable type.

    :Parameters:
      * `histo1`, `histo2`, ... (histogram, mixture_data, convolution_data, compound_data),
      * `type` (string): variable type ("NUMERIC" ("N"), "ORDINAL" ("O") or "SYMBOLIC" ("S")).
        It must be one of the keys of `variable_type`.

    :Keywords:
      - FileName (string) : name of the result file (the historical spelling
        ``Filename`` is still accepted),
      - Format (string) : format of the result file: "ASCII" (default format) or "SpreadSheet".
        This optional argument can only be used in conjunction with the optional argument FileName.
        The value is upper-cased before being validated.

    :Returns:
      The comparison result, as returned by ``histo.compare``.

    :Raises:
      - TypeError : if the variable type is missing (no positional argument
        after `histo`),
      - KeyError : if the variable type is not a key of `variable_type`,
      - ValueError : if no other frequency distribution is given.

    :Examples:

    .. doctest::
        :options: +SKIP

        >>> compare_histo(histo1, histo2, ..., type, FileName="result",
        ... Format="ASCII")

    .. seealso::
        :func:`~openalea.stat_tool.comparison.Compare`

    """
    if not args:
        # Previously surfaced as a bare IndexError on args[-1].
        raise TypeError("compare_histo() requires at least one other frequency "
                        "distribution followed by the variable type")

    type_key = args[-1]
    if type_key not in variable_type:
        raise KeyError("%s not found. Allowed keys are %s"
                       % (type_key, list(variable_type.keys())))
    utype = variable_type[type_key]

    allowed_types = [[_DiscreteDistributionData, _DiscreteMixtureData,
                      _ConvolutionData, _CompoundData]]

    error.CheckType([histo], allowed_types)

    histos = args[0:-1]
    if not histos:
        # Previously this case crashed with UnboundLocalError because the
        # output options were only parsed inside the loop below.
        raise ValueError("compare_histo() requires at least one other "
                         "frequency distribution to compare with")

    for h in histos:
        error.CheckType([h], allowed_types)

    # Output options are independent of the histograms: parse them once.
    # "FileName" is the documented keyword; "Filename" is the spelling the
    # code historically read, kept for backward compatibility.
    filename = kargs.get("FileName", kargs.get("Filename", ""))
    if 'Format' in kargs:
        kargs['Format'] = str(kargs['Format']).upper()
    out_format = error.ParseKargs(kargs, "Format",
                                  default="ASCII",
                                  possible=output_format)

    return histo.compare(histos, utype, filename, out_format)


# Attach compare_histo as an attribute of the _FrequencyDistribution class so
# that it is also reachable through that class and its instances.
_FrequencyDistribution.compare_histo = compare_histo


def compare_vectors(vec, vector_distance, Standardization=True):
    """Comparison of vectors.

    The type _VectorDistance implements standardization procedures.
    Standardization aims at removing the dependence on the variable type
    (symbolic, ordinal, numeric or circular) and, for numeric variables,
    on the measurement units, by converting the original variables to
    dimensionless variables.

    :Parameters:
     - `vec` (_Vectors) : the vectors to compare,
     - `vector_distance` (_VectorDistance) : the distance definition
       forwarded to ``vec.compare``,
     - `Standardization` (bool) : flag forwarded to ``vec.compare``
       (default True).

    :Returns:
      An object of type _DistanceMatrix is returned.

    :Examples:

    .. doctest::
        :options: +SKIP

        >>> compare_vectors(vec, vector_distance)

    .. seealso::
        :func:`~openalea.stat_tool.vectors.VectorDistance`,
        :func:`~openalea.stat_tool.cluster.Clustering`,
        :func:`~openalea.stat_tool.comparison.Compare`
     """
    # Validate the two objects first, then the flag, before delegating.
    checked_objects = [vec, vector_distance]
    expected_types = [_Vectors, _VectorDistance]
    error.CheckType(checked_objects, expected_types)
    error.CheckType([Standardization], [bool])

    distance_matrix = vec.compare(vector_distance, Standardization)
    return distance_matrix


def Compare(arg1, *args, **kargs):
    """Comparison functions factory

    Dispatches on the type of the first argument; all remaining positional
    and keyword arguments are forwarded unchanged.

    :Parameters:
      - `arg1` should be in :

        * `compare_histo` : Histograms comparison
        * `compare_vectors` : Vectors comparison
        * `compare_seq` : Sequences comparison
        * `compare_markov` : Markovian models comparison

      Only the first two are dispatched by this function: a `_Vectors`
      first argument goes to `compare_vectors`, a `_FrequencyDistribution`
      first argument goes to `compare_histo`. Anything else raises
      NotImplementedError.

    .. seealso::
        :func:`~openalea.stat_tool.comparison.compare_histo`,
        :func:`~openalea.stat_tool.comparison.compare_vectors`
        :func:`~openalea.stat_tool.comparison.compare_seq`
        :func:`~openalea.stat_tool.comparison.compare_markov`

    .. todo:: Get the AMAPMod documentation
    """
    # _Vectors is tested before _FrequencyDistribution, as in the original.
    if isinstance(arg1, _Vectors):
        return compare_vectors(arg1, *args, **kargs)

    if isinstance(arg1, _FrequencyDistribution):
        return compare_histo(arg1, *args, **kargs)

    raise NotImplementedError("First argument must be either Vectors or FrequencyDistribution")
def ComparisonTest(utype, histo1, histo2):
    r"""Comparison test of two frequency distributions.

    The objective is to compare two independent random samples in order to
    decide whether they have been drawn from the same population or not.

    In the case of samples from normal populations, the Fisher-Snedecor ("F")
    test makes it possible to test whether the two variances are significantly
    different. The normal distribution assumption should be checked, for
    instance by examining the shape coefficients (skewness and kurtosis
    coefficients). The test statistic is:

    .. math::

        F_{n_1-1,n_2-1} = \frac
        { \frac{\displaystyle\sum_{i=1}^{n_1}\left( x_{1i}-m_1 \right)^2}{n_1-1} }
        { \frac{\displaystyle\sum_{i=1}^{n_2}\left( x_{2i}-m_2 \right)^2}{n_2-1} }

    where :math:`m_1` and :math:`m_2` are the means of the samples.

    The Fisher-Snedecor variable :math:`F_{n_1-1,n_2-1}` with :math:`n_1-1`
    degrees of freedom and :math:`n_2-1` degrees of freedom can be interpreted
    as the ratio of the variance estimators of the two samples. In practice,
    the larger estimated variance is put at the numerator. Hence
    :math:`F_{n_1-1,n_2-1} \geq 1`. The critical region is of the form
    :math:`F_{n_1-1,n_2-1} > f` (one-sided test).

    In the case of samples from normal populations with equal variances, the
    Student ("T") test makes it possible to test whether the two means are
    significantly different. The test statistic is:

    .. math::

        T_{n_1+n_2-2} = \frac{m_1 - m_2}{
        \sqrt{\left(
        \displaystyle\sum_{i=1}^{n_1}\left( x_{1i}-m_1 \right)^2 +
        \displaystyle\sum_{i=1}^{n_2}\left( x_{2i}-m_2 \right)^2
        \right)
        \left( \frac{1}{n_1} + \frac{1}{n_2}\right) } }
        \sqrt{n_1 + n_2 - 2}

    The critical region is of the form
    :math:`\left| T_{n_1+n_2-2}\right| > t` (two-sided test).
    For sufficiently large sample sizes, this test of sample mean comparison
    can be used for samples from non-normal populations with unequal
    variances. This test is said to be robust.

    The Wilcoxon-Mann-Whitney ("W") test is a distribution-free test relying
    on the homogeneity of the ranking of the two samples (ranks of one sample
    should not cluster at either or both ends of the range). It can be seen as
    the non-parametric analog of the Student's t test and can be applied to
    compare two sets of observations measured on an interval scale when it is
    supposed that the data are non-normally distributed, or to compare two
    sets of observations measured on an ordinal scale.

    :Parameters:
      * utype (string) : type of test "F" (Fisher-Snedecor), "T" (Student)
        or "W" (Wilcoxon-Mann-Whitney); the case is ignored,
      * histo1, histo2 (Histogram, MixtureData, ConvolutionData, CompoundData)

    :Returns:
      A string containing the result of the tests

    :Raises:
      TypeError if `utype` is not one of the supported test types.

    :Examples:

    .. doctest::
        :options: +SKIP

        >>> ComparisonTest(type, histo1, histo2)
    """
    error.CheckType([histo1, histo2],
                    [[_DiscreteDistributionData, _DiscreteMixtureData,
                      _ConvolutionData, _CompoundData]]*2)

    utype = utype.lower()

    #todo: move this dict to enumerate.py ?
    # Maps the (lower-cased) test letter to the method of histo1 to call.
    type_dict = {
        "f": "f_comparison",
        "t": "t_comparison",
        "w": "wmw_comparison",
    }

    if utype not in type_dict:
        # Same exception type as before, but with an actionable message
        # instead of the former "to be done" placeholder.
        raise TypeError("Unknown test type %r. Allowed types are %s"
                        % (utype, sorted(type_dict)))

    func = getattr(histo1, type_dict[utype])
    return func(histo2)