#!/usr/bin/env python
#-*- coding: utf-8 -*-
"""Data transformation functions
.. topic:: data_transform.py summary
A module dedicated to data transformation functions (merge, select, shift, fit)
:Code status: mature
:Documentation status: to be completed
:Author: Samuel Dufour-Kowalski <samuel.dufour@sophia.inria.fr>
Thomas Cokelaer <Thomas.Cokelaer@sophia.inria.fr>
:Revision: $Id$
.. testsetup:: *
from openalea.stat_tool import *
.. warning:: the sequence analysis package also contains a data transform module
"""
__revision__ = "$Id$"
from . import error
from openalea.stat_tool._stat_tool import (
_DiscreteMixture,
_DiscreteMixtureData,
_Compound,
_CompoundData,
_Convolution,
_ConvolutionData,
_Vectors
)
from .enums import keep_type
[docs]
def Merge(obj, *args):
    """
    Merging of objects of the same 'data' type or merging of sample correlation functions.

    The merge itself is delegated to ``obj.merge(list(args))``. If the merged
    object exposes a ``markovian_sequences`` method, the converted object is
    returned; when that conversion raises an exception, a warning is issued
    and the unconverted merged object is returned instead.

    :Parameters:
      * histo1, histo2, ... (_Histogram, _DiscreteMixtureData, _ConvolutionData, _CompoundData),
      * vec1, vec2, ... (_Vectors),
      * timev1, timev2, ... (_TimeEvents, _RenewalData),
      * seq1, seq2, ... (_Sequences, _DiscreteSequences, _MarkovData, _SemiMarkovData),
      * top1, top2, ... (_Tops),
      * correl1, correl2, ... (_Correlation).

    :Returns:
        If the arguments are of type _Histogram, _DiscreteMixtureData, _ConvolutionData,
        _CompoundData, an object of type _Histogram is returned.

        If the arguments are of type _Vectors and if the vectors have the same number
        of variables, an object of type vectors is returned, otherwise no object is returned.

        If the arguments are of type _TimeEvents, _RenewalData, an object of type
        _TimeEvents is returned.

        If the arguments are of type _Sequences, _DiscreteSequences, _MarkovData,
        _SemiMarkovData and if the sequences have the same number of variables,
        an object of type _Sequences is returned.

        If the arguments are of type _Tops, an object of type _Tops is returned.

        If the arguments are of type correlation, an object of type correlation is returned.

    :Examples:

    .. doctest::
        :options: +SKIP

        >>> Merge(histo1, histo2,...)
        >>> Merge(vec1, vec2,...)
        >>> Merge(timev1, timev2,...)
        >>> Merge(seq1, seq2,...)
        >>> Merge(discrete_seq1, discrete_seq2,...)
        >>> Merge(top1, top2,...)
        >>> Merge(correl1, correl2,...)

    .. seealso::
        :func:`~openalea.stat_tool.cluster.Cluster`,
        :func:`~openalea.stat_tool.data_transform.Shift`,
        :func:`~openalea.stat_tool.cluster.Transcode`,
        :func:`~openalea.stat_tool.data_transform.ValueSelect`,
        :func:`~openalea.stat_tool.data_transform.MergeVariable`,
        :func:`~openalea.stat_tool.data_transform.SelectIndividual`,
        :func:`~openalea.stat_tool.data_transform.SelectVariable`,
        :func:`~openalea.stat_tool.cluster.NbEventSelect`,
        :func:`~openalea.stat_tool.cluster.TimeScaling`,
        :func:`~openalea.stat_tool.cluster.TimeSelect`,
        :func:`~openalea.stat_tool.cluster.AddAbsorbingRun`,
        :func:`~openalea.stat_tool.cluster.Cumulate`,
        :func:`~openalea.stat_tool.cluster.Difference`,
        :func:`~openalea.stat_tool.cluster.IndexExtract`,
        :func:`~openalea.stat_tool.cluster.LengthSelect`,
        :func:`~openalea.stat_tool.cluster.MovingAverage`,
        :func:`~openalea.stat_tool.cluster.RecurrenceTimeSequences`,
        :func:`~openalea.stat_tool.cluster.RemoveRun`,
        :func:`~openalea.stat_tool.cluster.Reverse`,
        :func:`~openalea.stat_tool.cluster.SegmentationExtract`,
        :func:`~openalea.stat_tool.cluster.VariableScaling`,
        :func:`~openalea.stat_tool.cluster.RemoveApicalInternodes`
    """
    merged = obj.merge(list(args))

    if hasattr(merged, 'markovian_sequences'):
        try:
            return merged.markovian_sequences()
        except Exception:
            # Best-effort conversion: fall back to the plain merged object.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            import warnings
            warnings.warn("markovian_sequences did not work. Check that this\n"
                          "is normal behaviour")

    return merged
[docs]
def MergeVariable(obj, *args, **kargs):
    """
    Merging of variables.

    Every positional argument is checked (through ``error.CheckType``) against
    the type of the first positional argument, then the merge is delegated to
    ``obj.merge_variable(list(args), RefSample)``.

    :Parameters:
      * vec1, vec2, ... (_Vectors),
      * seq1, seq2, ... (_Sequences, _DiscreteSequences, _MarkovData, _SemiMarkovData).

    :Keywords:
      * RefSample (int): reference sample to define individual identifiers
        (the default, -1, means no reference sample).

    :Returns:
        If the arguments are of type _Vectors and if the number of vectors is the same
        for each sample, an object of type _Vectors is returned.

        If the arguments are of type _Sequences, _DiscreteSequences, _MarkovData,
        _SemiMarkovData, if all the variables are of type STATE, and if the number and
        the lengths of sequences are the same for each sample, an object of type
        _Sequences or _DiscreteSequences is returned.

        The returned object is of type _DiscreteSequences if all the variables are of
        type STATE, if the possible values for each variable are consecutive from 0 and
        if the number of possible values for each variable is < 15.

    :Examples:

    .. doctest::
        :options: +SKIP

        >>> MergeVariable(histo1, histo2)
        >>> MergeVariable(vec1, vec2,..., RefSample=2)
        >>> MergeVariable(seq1, seq2,..., RefSample=2)

    .. seealso::
        :func:`~openalea.stat_tool.cluster.Cluster`,
        :func:`~openalea.stat_tool.data_transform.Shift`,
        :func:`~openalea.stat_tool.cluster.Transcode`,
        :func:`~openalea.stat_tool.data_transform.ValueSelect`,
        :func:`~openalea.stat_tool.data_transform.Merge`,
        :func:`~openalea.stat_tool.data_transform.SelectIndividual`,
        :func:`~openalea.stat_tool.data_transform.SelectVariable`,
        :func:`~openalea.stat_tool.cluster.AddAbsorbingRun`,
        :func:`~openalea.stat_tool.cluster.Cumulate`,
        :func:`~openalea.stat_tool.cluster.Difference`,
        :func:`~openalea.stat_tool.cluster.IndexExtract`,
        :func:`~openalea.stat_tool.cluster.LengthSelect`,
        :func:`~openalea.stat_tool.cluster.MovingAverage`,
        :func:`~openalea.stat_tool.cluster.RecurrenceTimeSequences`,
        :func:`~openalea.stat_tool.cluster.RemoveRun`,
        :func:`~openalea.stat_tool.cluster.Reverse`,
        :func:`~openalea.stat_tool.cluster.SegmentationExtract`,
        :func:`~openalea.stat_tool.cluster.VariableScaling`,
    """
    # TODO: manage the markovian_sequences conversion if the input
    # is made of Sequences

    # All samples must share the type of the first one.
    expected_type = type(args[0])
    for sample in args:
        error.CheckType([sample], [expected_type])

    ref_sample = kargs.get("RefSample", -1)
    error.CheckType([ref_sample], [int])

    merged = obj.merge_variable(list(args), ref_sample)
    return merged
[docs]
def SelectVariable(obj, variables, Mode="Keep"):
    """
    Selection of variables.

    A single integer index is wrapped into a one-element list before the
    selection is delegated to ``obj.select_variable(variables, keep)``.

    :Parameters:
      * vec (vectors),
      * seq (sequences, discrete_sequences, markov_data, semi-markov_data),
      * variable (int): variable index.
      * variables (array(int)): variable indices.

    :Keywords:
      * Mode (string): conservation or rejection of the selected variables:
        "Keep" (default) or "Reject" ("keep" and "reject" are accepted too).

    :Returns:
        If either variable or variables[1], ..., variables[n] are valid indices of
        variables, an object of type vectors (respectively sequences or
        discrete_sequences) is returned, otherwise no object is returned. In the case
        of a first argument of type sequences, the returned object is of type
        discrete_sequences if all the variables are of type STATE, if the possible
        values for each variable are consecutive from 0 and if the number of possible
        values for each variable is < 15.

    :Raises:
        ValueError: if Mode is neither "Keep" nor "Reject". Such values were
        previously treated silently as "Reject".

    :Examples:

    .. doctest::
        :options: +SKIP

        >>> SelectVariable(vec, variable, Mode="Reject")
        >>> SelectVariable(vec, variables, Mode="Reject")
        >>> SelectVariable(seq, variable, Mode="Reject")
        >>> SelectVariable(seq, variables, Mode="Reject")

    .. seealso::
        `AddAbsorbingRun`,
        :func:`~openalea.stat_tool.cluster.Cluster`,
        :func:`~openalea.stat_tool.cumulate.Cumulate`,
        `Difference`,
        `IndexExtract`,
        `LengthSelect`,
        :func:`~openalea.stat_tool.data_transform.Merge`,
        :func:`~openalea.stat_tool.data_transform.MergeVariable`,
        `MovingAverage`,
        `RecurrenceTimeSequences`,
        `RemoveRun`,
        `Reverse`,
        :func:`~openalea.stat_tool.data_transform.SelectIndividual`,
        :func:`~openalea.stat_tool.data_transform.Shift`,
        :func:`~openalea.stat_tool.cluster.Transcode`,
        :func:`~openalea.stat_tool.data_transform.ValueSelect`,
        `SegmentationExtract`,
        `VariableScaling`.
    """
    error.CheckType([variables, Mode], [[int, list], str])

    # Reject unknown modes explicitly: a misspelled Mode used to be
    # interpreted as "Reject" without any notice.
    if Mode in ("Keep", "keep"):
        keep = True
    elif Mode in ("Reject", "reject"):
        keep = False
    else:
        raise ValueError('Mode must be "Keep" or "Reject" (got %r)' % (Mode,))

    if isinstance(variables, int):
        variables = [variables]

    return obj.select_variable(variables, keep)
[docs]
def SelectIndividual(obj, identifiers, Mode="Keep"):
    """
    Selection of vectors, sequences, tops or patterns (in a dissimilarity matrix).

    The selection is delegated to ``obj.select_individual(identifiers, keep)``.

    :Parameters:
      * vec (vectors),
      * seq (sequences, discrete_sequences, markov_data, semi-markov_data),
      * top (tops),
      * dist_matrix (distance_matrix),
      * identifiers (array(int)): identifiers.

    :Keywords:
      * Mode (string): conservation or rejection of the selected individuals:
        "Keep" (default) or "Reject" ("keep" and "reject" are accepted too).

    :Returns:
        If identifiers[1], ..., identifiers[n] are valid identifiers of vectors
        (respectively sequences, tops or patterns compared in a dissimilarity matrix),
        an object of type vectors (respectively sequences or discrete_sequences, tops
        or distance_matrix) is returned, otherwise no object is returned. In the case
        of a first argument of type sequences, discrete_sequences, markov_data,
        semi-markov_data, the returned object is of type discrete_sequences if all the
        variables are of type STATE, if the possible values for each variable are
        consecutive from 0 and if the number of possible values for each variable
        is < 15.

    :Raises:
        ValueError: if Mode is neither "Keep" nor "Reject". Such values were
        previously treated silently as "Reject".

        Exception: if ``obj.select_individual`` raises an exception.

    :Examples:

    .. doctest::
        :options: +SKIP

        >>> SelectIndividual(vec, identifiers, Mode="Reject")
        >>> SelectIndividual(seq, identifiers, Mode="Reject")
        >>> SelectIndividual(top, identifiers, Mode="Reject")
        >>> SelectIndividual(dist_matrix, identifiers, Mode="Reject")

    .. seealso::
        :func:`~openalea.stat_tool.cluster.Cluster`,
        :func:`~openalea.stat_tool.data_transform.Merge`,
        :func:`~openalea.stat_tool.data_transform.Shift`,
        :func:`~openalea.stat_tool.cluster.Transcode`,
        :func:`~openalea.stat_tool.data_transform.ValueSelect`,
        :func:`~openalea.stat_tool.data_transform.MergeVariable`,
        :func:`~openalea.stat_tool.data_transform.SelectVariable`,
        `AddAbsorbingRun`,
        `Cumulate`,
        `Difference`,
        `IndexExtract`,
        `LengthSelect`,
        `MovingAverage`,
        `RecurrenceTimeSequences`,
        `RemoveSeries`,
        `Reverse`,
        `SegmentationExtract`,
        `VariableScaling`,
        `RemoveApicalInternodes`,
        `Symmetrize`.
    """
    error.CheckType([identifiers, Mode], [list, str])

    # Reject unknown modes explicitly: a misspelled Mode used to be
    # interpreted as "Reject" without any notice.
    if Mode in ("Keep", "keep"):
        keep = True
    elif Mode in ("Reject", "reject"):
        keep = False
    else:
        raise ValueError('Mode must be "Keep" or "Reject" (got %r)' % (Mode,))

    try:
        # NOTE: the result is deliberately not converted with
        # markovian_sequences(); an earlier attempt to do so made the
        # tests fail.
        return obj.select_individual(identifiers, keep)
    except Exception:
        # Only Exception is caught so that KeyboardInterrupt and SystemExit
        # propagate untouched. The message now names the method that
        # actually failed.
        raise Exception("Could not run select_individual on the input variable. ")
[docs]
def ValueSelect(obj, *args, **kargs):
    """Selection of individuals according to the values taken by a variable

    Positional arguments are interpreted as follows:

    * three arguments: ``(variable, min_value, max_value)``;
    * two arguments: ``(variable, value)`` when ``obj`` has a non-zero
      ``nb_variable`` attribute, ``(min_value, max_value)`` otherwise;
    * one argument: ``value``, or a 2-tuple ``(min_value, max_value)``. When
      ``obj.nb_variable`` equals 1 the variable index defaults to 1.

    The selection is delegated to ``obj.value_select``, with a leading variable
    index only when ``obj`` has a non-zero ``nb_variable``.

    :Parameters:
      * histo (histogram, mixture_data, convolution_data, compound_data),
      * value (int): value,
      * min_value (int): minimum value,
      * max_value (int): maximum value,
      * vec1 (vectors): values,
      * vecn (vectors): vectors,
      * variable (int): variable index,
      * seq1 (sequences, discrete_sequences, markov_data, semi-markov_data): univariate sequences,
      * seqn (sequences, discrete_sequences, markov_data, semi-markov_data): multivariate sequences.

    :Keywords:
      * Mode (string): conservation or rejection of selected individuals:
        "Keep" (the default) or "Reject".

    :Returns:
        If the range of values defined either by value or by min_value and max_value
        enables to select individuals, an object of type HISTOGRAM is returned
        (respectively vectors, sequences or discrete_sequences), otherwise no object
        is returned. In the case of a first argument of type sequences,
        discrete_sequences, markov_data or semi-markov_data, the returned object is of
        type discrete_sequences if all the variables are of type STATE, if the possible
        values for each variable are consecutive from 0 and if the number of possible
        values for each variable is < 15.

    :Raises:
        ValueError: if a single positional argument is given for an object with
        more than one variable, since the variable index cannot be guessed.

    :Examples:

    .. doctest::
        :options: +SKIP

        >>> ValueSelect(histo, value, Mode="Reject")
        >>> ValueSelect(histo, min_value, max_value, Mode="Reject")
        >>> ValueSelect(vec1, value, Mode="Reject")
        >>> ValueSelect(vec1, min_value, max_value, Mode="Reject")
        >>> ValueSelect(vecn, variable, value, Mode="Reject")
        >>> ValueSelect(vecn, variable, min_value, max_value, Mode="Reject")
        >>> ValueSelect(seq1, value, Mode="Reject")
        >>> ValueSelect(seq1, min_value, max_value, Mode="Reject")
        >>> ValueSelect(seqn, variable, value, Mode="Reject")
        >>> ValueSelect(seqn, variable, min_value, max_value, Mode="Reject")

    .. seealso::
        :func:`~openalea.stat_tool.cluster.Cluster`,
        :func:`~openalea.stat_tool.data_transform.Merge`,
        :func:`~openalea.stat_tool.data_transform.Shift`,
        :func:`~openalea.stat_tool.data_transform.Transcode`,
        :func:`~openalea.stat_tool.data_transform.SelectIndividual`,
        :func:`~openalea.stat_tool.data_transform.MergeVariable`,
        :func:`~openalea.stat_tool.data_transform.SelectVariable`,
        `Cumulate`,
        `Difference`,
        `IndexExtract`,
        `LengthSelect`,
        `MovingAverage`,
        `RecurrenceTimeSequences`,
        `RemoveRun`,
        `Reverse`,
        `SegmentationExtract`,
        `VariableScaling`.
    """
    error.CheckArgumentsLength(args, 1, 3)
    Mode = error.ParseKargs(kargs, "Mode", "Keep", keep_type)
    keep = bool(Mode == "Keep")

    # Objects without an nb_variable attribute are handled without a
    # variable index (two-bound form of value_select).
    nb_variable = getattr(obj, "nb_variable", 0)

    variable = None
    if len(args) == 3:
        variable, umin, umax = args
    elif len(args) == 2:
        # 2 cases: (variable, value) or (min_value, max_value)
        if nb_variable:
            variable, umin = args
            umax = umin
        else:
            umin, umax = args
    elif len(args) == 1:
        value = args[0]
        error.CheckType([value], [[int, tuple, list]])
        if isinstance(value, tuple) and len(value) == 2:
            umin, umax = value
        else:
            umin = umax = value
        if nb_variable == 1:
            # Univariate object: the only variable is variable 1, the same
            # convention as Shift and SelectStep.
            variable = 1

    if nb_variable:  # Vectors, sequences
        if variable is None:
            # Previously this path failed with an UnboundLocalError.
            raise ValueError(
                "The object has %s variables: a variable index must be "
                "provided, like in ValueSelect(object, variable, value)"
                % (nb_variable,))
        return obj.value_select(variable, umin, umax, keep)

    return obj.value_select(umin, umax, keep)
[docs]
def Shift(obj, *args):
    """
    Shifting of values

    The call is dispatched on ``obj.nb_variable`` (taken as 0 when the
    attribute is missing):

    * no variable: ``obj.shift(param)``;
    * one variable: ``obj.shift(1, param)``, or ``obj.shift(variable, param)``
      when both a variable index and a parameter are given;
    * several variables: ``obj.shift(variable, param)``.

    :Parameters:
      * histo (histogram, mixture_data, convolution_data, compound_data),
      * param (int): shifting parameter,
      * vec1 (vectors): values,
      * vecn (vectors): vectors,
      * variable (int): variable index,
      * seq1 (sequences): univariate sequences,
      * seqn (sequences): multivariate sequences.

    :Returns:
        If the shifting makes that the lower bound to the possible values is positive,
        an object of type HISTOGRAM (respectively _Vectors, _Sequences) is returned.
        In the case of a first argument of type sequences, the returned object is of
        type discrete_sequences if all the variables are of type STATE, if the possible
        values for each variable are consecutive from 0 and if the number of possible
        values for each variable is < 15.

    :Raises:
        ValueError: if the object has more than one variable and fewer than two
        positional arguments are given.

    :Examples:

    .. doctest::
        :options: +SKIP

        >>> Shift(histo, param)
        >>> Shift(vec1, param)
        >>> Shift(vecn, variable, param)
        >>> Shift(seq1, param)
        >>> Shift(seqn, variable, param)

    .. seealso::
        :func:`~openalea.stat_tool.cluster.Cluster`,
        :func:`~openalea.stat_tool.data_transform.Merge`,
        :func:`~openalea.stat_tool.data_transform.Transcode`,
        :func:`~openalea.stat_tool.data_transform.SelectIndividual`,
        :func:`~openalea.stat_tool.data_transform.MergeVariable`,
        :func:`~openalea.stat_tool.data_transform.SelectVariable`,
        :func:`~openalea.stat_tool.data_transform.AddAbsorbingRun`,
        :func:`~openalea.stat_tool.data_transform.Cumulate`,
        :func:`~openalea.stat_tool.data_transform.Difference`,
        :func:`~openalea.stat_tool.data_transform.Lengthselect`,
        :func:`~openalea.stat_tool.data_transform.MovingAverage`,
        :func:`~openalea.stat_tool.data_transform.IndexExtract`,
        :func:`~openalea.stat_tool.data_transform.RecurrenceTimeSequences`,
        :func:`~openalea.stat_tool.data_transform.RemoveRun`,
        :func:`~openalea.stat_tool.data_transform.Reverse`,
        :func:`~openalea.stat_tool.data_transform.SegmentationExtract`,
        :func:`~openalea.stat_tool.data_transform.ValueSelect`,
        :func:`~openalea.stat_tool.data_transform.VariableScaling`.
    """
    error.CheckArgumentsLength(args, 1, 3)

    # Objects without an nb_variable attribute use the one-argument shift.
    nb_variable = getattr(obj, "nb_variable", 0)

    if nb_variable == 1:
        if len(args) >= 2:
            # Explicit (variable, param) form. Previously the variable index
            # was mistaken for the shifting parameter here.
            variable, param = args[0], args[1]
        else:
            variable, param = 1, args[0]
        return obj.shift(variable, param)

    if nb_variable > 1:
        if len(args) < 2:
            # Previously surfaced as an IndexError with no explanation.
            raise ValueError(
                "The object has %s variables: a variable index and a "
                "shifting parameter must be provided, like in "
                "Shift(object, variable, param)" % (nb_variable,))
        return obj.shift(args[0], args[1])

    return obj.shift(args[0])
[docs]
def Fit(histo, dist):
    r"""
    Fit of a frequency distribution by a theoretical distribution.

    The computation is delegated to ``histo.fit(dist)`` and its result is
    returned unchanged.

    The result is displayed in the shell window (characteristics of the frequency
    and theoretical distributions, log-likelihood of the data for the theoretical
    distribution, information - maximum possible log-likelihood of the data -,
    chi-square goodness of fit test).

    The difference between the information measure and the log-likelihood is the
    Kullback-Leibler divergence from the observed distribution to the theoretical
    distribution. It is also one-half the deviance of the theoretical distribution.

    Assume that a sample of size n is generated by a given random variable.
    The statistic measures the random deviation between the observed frequencies
    :math:`f_i` and the theoretical frequencies :math:`n p_i`:

    .. math::

        D^2 = \sum_{i=0}^k \frac{\left(f_i - n p_i\right)^2}{n p_i}
        \textrm{with} \sum_{i=0}^k f_i = n

    If each theoretical frequency :math:`n p_i` is greater than a given threshold
    (between 1 and 5 according to the authors), :math:`D^2` has a chi-square
    distribution with k - 1 degrees of freedom.

    :Parameters:
      * histo (histogram, mixture_data, convolution_data, compound_data),
      * dist (distribution, mixture, convolution, compound).

    :Returns:
        Distribution

    :Examples:

    .. doctest::
        :options: +SKIP

        >>> Fit(histo, dist)

    .. todo:: documentation: get back the latex equations
    """
    fitted = histo.fit(dist)
    return fitted
[docs]
def Unnormalize(obj):
    """Unnormalize

    Delegates to ``obj.unnormalize()`` and returns its result unchanged.

    :Parameters:
      * dist_matrix (distance_matrix).

    :Returns:
        An object of type distance_matrix is returned.

    :Examples:

    .. doctest::
        :options: +SKIP

        >>> Unnormalize(dist_matrix)
    """
    unnormalized = obj.unnormalize()
    return unnormalized
[docs]
def Symmetrize(obj):
    """Return the result of ``obj.symmetrize()``.

    :Parameters:
      * obj: any object exposing a ``symmetrize()`` method
        (presumably a distance matrix -- to be confirmed).

    :Returns:
        The value returned by ``obj.symmetrize()``, or None when that call
        raises a ValueError (the error is deliberately ignored).

    .. todo:: documentation
    """
    try:
        # Return the result as is: testing its truth value would turn a
        # valid but "empty" result into None.
        return obj.symmetrize()
    except ValueError:
        return None
[docs]
def TruncateDistribution(obj, variable):
    """Return the result of ``obj.truncate(variable)``.

    :Parameters:
      * obj: object exposing a ``truncate`` method,
      * variable: value forwarded unchanged to ``obj.truncate``
        (presumably the truncation bound -- to be confirmed).

    .. todo:: to be tested
    """
    truncated = obj.truncate(variable)
    return truncated
[docs]
def SelectStep(obj, *args):
    """Change the internal step of a vector or a sequence

    Accepts either ``SelectStep(obj, step)`` when ``obj.nb_variable`` equals 1
    (variable 1 is then used) or ``SelectStep(obj, variable, step)``. The work
    is delegated to ``obj.select_step(variable, step)``.

    :param obj: the vector or sequence object
    :param args: ``(step,)`` or ``(variable, step)``; step is checked to be an
        int or a float, variable to be an int

    :raises TypeError: if obj has no ``nb_variable`` attribute
    :raises SyntaxError: if a single argument is given while ``nb_variable``
        differs from 1

    :Example:

    .. doctest::
        :options: +SKIP

        >>> seq = Sequences([])
        >>> SelectStep(seq, 100)
        >>> Plot(seq)

    .. todo:: shall we move this function to sequence_analysis package?
    """
    error.CheckArgumentsLength(args, 1, 2)

    try:
        nb_variable = obj.nb_variable
    except AttributeError:
        raise TypeError("object has no nb_variable. Check that it is a Vector or Sequence")

    arg_count = len(args)
    if arg_count == 2:
        variable, step = args
        error.CheckType([step], [[int, float]])
        error.CheckType([variable], [[int]])
    elif arg_count == 1 and nb_variable == 1:
        # Univariate object: the step applies to variable 1.
        variable, step = 1, args[0]
        error.CheckType([step], [[int, float]])
    elif nb_variable != 1:
        raise SyntaxError(
            "Wrong number of arguments. The number of variable is greater "
            "than 1 (%s) therefore you must provide a variable and a step "
            "like in SelectStep(object, 1, 100)" % nb_variable)
    else:
        # Only reachable if the argument count is neither 1 nor 2.
        raise ValueError("UnknownError")

    return obj.select_step(variable, step)