BernsteinCopula: Improve doc

sofianehaddad · Nov 21, 2024 · 8c1c360 · 8c1c360
1 parent 3acf67d
commit 8c1c360
Show file tree

Hide file tree

Showing 7 changed files with 233 additions and 73 deletions.
diff --git a/python/doc/bibliography.rst b/python/doc/bibliography.rst
@@ -442,6 +442,8 @@ Bibliography
 .. [ScottStewart2011] W. F. Scott & B. Stewart.
     *Tables for the Lilliefors and Modified Cramer–von Mises Tests of Normality.*,
     Communications in Statistics - Theory and Methods. Volume 40, 2011 - Issue 4. Pages 726-730.
+.. [segers2016] J. Segers &  M. Sibuya &  H. TsukaharaSen (2016). *The Empirical Beta Copula*,
+   `pdf <https://arxiv.org/pdf/1607.04430>`__
 .. [sen1990] Sen, A., & Srivastava, M. (1990). *Regression analysis: theory, methods, and applications*.
     Springer.
 .. [shao1993] Shao, J. (1993). *Linear model selection by cross-validation.*

diff --git a/.../examples/calibration/least_squares_and_gaussian_calibration/plot_calibration_chaboche.py b/.../examples/calibration/least_squares_and_gaussian_calibration/plot_calibration_chaboche.py
@@ -60,6 +60,7 @@
 # class.
 
 # %%
+ot.RandomGenerator.SetSeed(0)
 cm = chaboche_model.ChabocheModel()
 print(cm.data)
 observedStrain = cm.data[:, 0]

diff --git a/...mples/data_analysis/distribution_fitting/plot_model_singular_multivariate_distribution.py b/...mples/data_analysis/distribution_fitting/plot_model_singular_multivariate_distribution.py
@@ -5,7 +5,7 @@
 
 # %%
 # From time to time we need to model singular :math:`n_D` distributions
-# (e.g. the joint distribution of KL coefficients for curves resulting from the transport of a low dimensional random vector).
+# (e.g. the joint distribution of Karhunen Loeve coefficients for curves resulting from the transport of a low dimensional random vector).
 # A way to do that is to use an :class:`~openturns.EmpiricalBernsteinCopula` with a bin number equal to the sample size
 # (also called the empirical beta copula in this case).
 
@@ -41,29 +41,77 @@ def draw(dist, Y):
 
 
 # %%
-# Generate some multivariate data to estimate, with correlation.
+# We consider the function :math:`f: \Rset^3 \rightarrow \Rset` defined by:
+#
+# .. math::
+#
+#      f(u, v_1, v_2) = (y_1, y_2)
+#
+# where:
+#
+# .. math::
+#
+#      y_1 & = \sin(u) / (1 + \cos(u)^2) + 0.05 * v_1 \\
+#      y_2 & = \sin(u) \cos(u) / (1 + \cos(u)^2) + 0.05 * v_2
+#
+# We define the following input random vector:
+#
+# .. math::
+#
+#      U  \sim \cU(-0.85\pi, 0.85\pi) \\
+#      (V_1, V_2)  \sim \cN(\vect{\mu} = \vect{0}, \vect{\sigma} = \vect{1}, Id_2)\\
+#
+# with :math:`U` and :math:`\vect{V})` independent.
+#
+# We define the output random vector :math:`\vect{Y}` as:
+#
+# .. math::
+#
+#      \vect{Y} = f(U, V_1, V_2)
+
 f = ot.SymbolicFunction(
-    ["U", "xi1", "xi2"],
-    ["sin(U)/(1+cos(U)^2)+0.05*xi1", "sin(U)*cos(U)/(1+cos(U)^2)+0.05*xi2"],
+    ["U", "v1", "v2"],
+    ["sin(U)/(1+cos(U)^2)+0.05*v1", "sin(U)*cos(U)/(1+cos(U)^2)+0.05*v2"],
 )
 U = ot.Uniform(-0.85 * m.pi, 0.85 * m.pi)
-xi = ot.Normal(2)
-X = ot.BlockIndependentDistribution([U, xi])
+V = ot.Normal(2)
+X = ot.BlockIndependentDistribution([U, V])
+
+# %%
+# We generate a sample of the output random vector :math:`\vect{Y}` of size :math:`N`.
 N = 200
-Y = f(X.getSample(N))
+sample_Y = f(X.getSample(N))
 
 # %%
-# Estimation by multivariate kernel smoothing.
-multi_ks = ot.KernelSmoothing().build(Y)
-view = viewer.View(draw(multi_ks, Y))
+# We estimate the distribution of the output random vector :math:`\vect{Y}` by multivariate kernel smoothing.
+y_multi_ks = ot.KernelSmoothing().build(sample_Y)
+view = viewer.View(draw(y_multi_ks, sample_Y))
 
 # %%
-# Estimation by empirical beta copula.
-beta_copula = ot.EmpiricalBernsteinCopula(Y, len(Y))
+# Now, we estimate the distribution of :math:`\vect{Y}` splitting the estimation of the marginals
+# from the estimation of the copula:
+#
+# - the marginals are fitted by kernel smoothing,
+# - the copula is fitted using the Bernstein copula factory :class:`~openturns.BernsteinCopulaFactory` that builds
+#   an empirical Bernstein copula.
+#
+# First, we do not specify the bin number :math:`m`. It is equal to the value computed by the default method, which is the
+# LogLikelihood criteria. We get :math:`m=1`, which
+# means that one cell is created: the built copula is diffuse in :math:`[0,1]^2`. The estimated copula is
+# the independent copula.
+empBern_copula = ot.BernsteinCopulaFactory().buildAsEmpiricalBernsteinCopula(sample_Y)
+print('bin number computed m = ', empBern_copula.getBinNumber())
 marginals = [
-    ot.KernelSmoothing().build(Y.getMarginal(j)) for j in range(Y.getDimension())
+    ot.KernelSmoothing().build(sample_Y.getMarginal(j)) for j in range(sample_Y.getDimension())
 ]
-beta_dist = ot.JointDistribution(marginals, beta_copula)
-view = viewer.View(draw(beta_dist, Y))
+y_empBern = ot.JointDistribution(marginals, empBern_copula)
+view = viewer.View(draw(y_empBern, sample_Y))
 
+# %%
+# Now, we specify a bin number equal to the sample size: :math:`m = N` so that the built copula is very close to the sample.
+# With this parametrization, the empirical Bernstein copula is the *Beta copula* in the sens of [segers2016]_.
+# In that case, it manages to reproduce its specific feature.
+empBern_copula = ot.BernsteinCopulaFactory().build(sample_Y, N)
+y_empBern = ot.JointDistribution(marginals, empBern_copula)
+view = viewer.View(draw(y_empBern, sample_Y))
 viewer.View.ShowAll()
diff --git a/python/src/BernsteinCopulaFactory_doc.i.in b/python/src/BernsteinCopulaFactory_doc.i.in
@@ -1,9 +1,22 @@
 %feature("docstring") OT::BernsteinCopulaFactory
-"BernsteinCopula copula factory.
+"EmpiricalBernsteinCopula factory.
 
 Notes
 -----
-This class provides a non parametric estimator of a copula: the :class:`~openturns.EmpiricalBernsteinCopula`.
+This class builds an :class:`~openturns.EmpiricalBernsteinCopula` which is a non parametric fitting of
+the copula of a multivariate distribution.
+
+The keys of :class:`~openturns.ResourceMap` related to the class are:
+
+- the keys `BernsteinCopulaFactory-MinM` and `BernsteinCopulaFactory-MaxM` that define the range of :math:`m`
+  in the optimization
+  problems computing the optimal bin number according to a specified criterion,
+- the key `BernsteinCopulaFactory-BinNumberSelection` that defines the  criterion to compute the optimal bin number
+  when it is not specified. The possible choices are 'AMISE', 'LogLikelihood', 'PenalizedCsiszarDivergence';
+- the key `BernsteinCopulaFactory-KFraction` that defines the fraction of the sample used for the validation in the
+  method :meth:`ComputeLogLikelihoodBinNumber`,
+- the key `BernsteinCopulaFactory-SamplingSize`  that defines the :math:`N` parameter used in the
+  method :meth:`ComputePenalizedCsiszarDivergenceBinNumber`.
 
 See also
 --------
@@ -12,66 +25,91 @@ DistributionFactory, EmpiricalBernsteinCopula"
 // ---------------------------------------------------------------------
 
 %feature("docstring") OT::BernsteinCopulaFactory::build
-"Build the nonparametric Bernstein copula estimator based on the empirical copula.
+"Build the empirical Bernstein copula.
 
 **Available usages**:
 
     build()
 
     build(*sample*)
 
-    build(*sample, method, objective*)
-
     build(*sample, m*)
 
+    build(*sample, method, f*)
+
 Parameters
 ----------
-sample : 2-d sequence of float, of dimension *d*
-    The sample of size :math:`n>0` from which the copula is estimated.
+sample : 2-d sequence of float, of dimension :math:`d`
+    The sample of size :math:`\sampleSize>0` from which the copula is estimated.
 method : str
-    The name of the bin number selection method. Possible choices are *AMISE*, *LogLikelihood* and *PenalizedCsiszarDivergence*. Default is *LogLikelihood*, given by the *'BernsteinCopulaFactory-BinNumberSelection'* entry of :class:`~openturns.ResourceMap`.
-m : int
-    The number of sub-intervals in which all the edges of the unit cube
+    The name of the bin number selection method. Possible choices are *AMISE*, *LogLikelihood* and *PenalizedCsiszarDivergence*.
+
+    Default is *LogLikelihood*.
+f : :class:`~openturns.Function`
+    The function defining the Csiszar divergence of interest used by
+    the  *PenalizedCsiszarDivergence* method.
+
+    Default is Function().
+m : int,:math:`1 \leq m \leq \sampleSize`,
+    The bin number, i.e. the number of sub-intervals in which all the edges of the unit cube
     :math:`[0, 1]^d` are regularly partitioned.
+
+    Default value is the value computed from the default bin number selection method.
 
 Returns
 -------
 copula : :class:`~openturns.Distribution`
-    The estimated copula as a generic distribution.
+    The empirical Bernstein copula as a generic distribution.
 
+Notes
+-----
+If the bin number :math:`m` is specified and does not divide the sample size :math:`\sampleSize`, then a part of the sample is
+removed for the result to be a copula. See :class:`~openturns.EmpiricalBernsteinCopula`.
 "
 // ---------------------------------------------------------------------
 
 %feature("docstring") OT::BernsteinCopulaFactory::buildAsEmpiricalBernsteinCopula
-"Build the nonparametric Bernstein copula estimator based on the empirical copula.
+"Build the empirical Bernstein copula as a native distribution.
 
 **Available usages**:
 
     buildAsEmpiricalBernsteinCopula()
 
     buildAsEmpiricalBernsteinCopula(*sample*)
 
-    buildAsEmpiricalBernsteinCopula(*sample, method, objective*)
-
     buildAsEmpiricalBernsteinCopula(*sample, m*)
 
+    buildAsEmpiricalBernsteinCopula(*sample, method, f*)
+
 Parameters
 ----------
 sample : 2-d sequence of float, of dimension *d*
-    The sample of size :math:`n>0` from which the copula is estimated.
+    The sample of size :math:`\sampleSize>0` from which the copula is estimated.
 method : str
     The name of the bin number selection method. Possible choices are *AMISE*, *LogLikelihood* and
-    *PenalizedCsiszarDivergence*. Default is *LogLikelihood*, given by the *'BernsteinCopulaFactory-BinNumberSelection'*
-    entry of :class:`~openturns.ResourceMap`.
-m : int
-    The number of sub-intervals in which all the edges of the unit cube
+    *PenalizedCsiszarDivergence*.
+
+    Default is *LogLikelihood*.
+f : :class:`~openturns.Function`
+    The function defining the Csiszar divergence of interest used by
+    the  *PenalizedCsiszarDivergence* method.
+
+    Default is Function().
+m : int, :math:`1 \leq m \leq \sampleSize`,
+    The bin number, i.e. the number of sub-intervals in which all the edges of the unit cube
     :math:`[0, 1]^d` are regularly partitioned.
+
+    Default value is the value computed from the default bin number selection method.
 
 Returns
 -------
 copula : :class:`~openturns.EmpiricalBernsteinCopula`
-    The estimated copula as an empirical Bernstein copula.
+    The empirical Bernstein copula as a native distribution.
 
+Notes
+-----
+If the bin number :math:`m` is specified and does not divide the sample size :math:`\sampleSize`, then a part of the sample is
+removed for the result to be a copula a copula. See :class:`~openturns.EmpiricalBernsteinCopula`.
 "
 // ---------------------------------------------------------------------
 
@@ -85,13 +123,16 @@ sample : 2-d sequence of float, of dimension 1
 
 Notes
 -----
-The number of bins is computed by minimizing the asymptotic mean integrated squared error (AMISE), leading to
+The bin number :math:`m` is computed by minimizing the asymptotic mean integrated squared error (AMISE),
+leading to:
 
 .. math::
 
-    m = 1+\left\lfloor n^{\dfrac{2}{4+d}} \right\rfloor
+    m = 1+\left\lfloor \sampleSize^{\frac{2}{4+d}} \right\rfloor
+
+where :math:`\lfloor x \rfloor` is the largest integer less than or equal to :math:`x`, :math:`\sampleSize` the sample size and :math:`d` the sample dimension.
 
-where :math:`\lfloor x \rfloor` is the largest integer less than or equal to :math:`x`, :math:`n` the sample size and :math:`d` the sample dimension.
+Note that this optimal :math:`m` does not necessarily divide the sample size :math:`\sampleSize`.
 "
 
 // ---------------------------------------------------------------------
@@ -102,27 +143,32 @@ where :math:`\lfloor x \rfloor` is the largest integer less than or equal to :ma
 Parameters
 ----------
 sample : 2-d sequence of float, of dimension 1
-    The sample of size :math:`n` from which the optimal log-likelihood bin number is computed.
-kFraction : int, :math:`0<kFraction<n`
+    The sample of size :math:`\sampleSize` from which the optimal log-likelihood bin number is computed.
+kFraction : int, :math:`0<kFraction<\sampleSize`
     The fraction of the sample used for the validation.
+
+    Default value 2.
 
 Notes
 -----
-Let :math:`\cE=\left\{\vect{X}_1,\dots,\vect{X}_n\right\}` be the given sample. If :math:`kFraction=1`, the bin number :math:`m` is given by:
+Let :math:`\cE= (\inputReal_1, \dots, \inputReal_\sampleSize)` be the given sample. If :math:`kFraction=1`, the bin number :math:`m` is given by:
 
 .. math::
 
-    m = \argmin_{M\in\{1,\dots,n\}}\dfrac{1}{n}\sum_{\vect{X}_i\in\cE}-\log c^{\cE}_{M}(\vect{X}_i)
+    m = \argmin_{M\in\{1,\dots,\sampleSize\}}\dfrac{1}{\sampleSize}\sum_{\vect{x}_i\in\cE}-\log c^{\cE}_{M}(\vect{x}_i)
 
 where :math:`c_M^{\cE}` is the density function of the :class:`~openturns.EmpiricalBernsteinCopula` associated to the sample :math:`\cE` and the bin number :math:`M`.
 
 If :math:`kFraction>1`, the bin number :math:`m` is given by:
 
 .. math::
 
-    m = \argmin_{M\in\{1,\dots,n\}}\dfrac{1}{kFraction}\sum_{k=0}^{kFraction-1}\dfrac{1}{n}\sum_{\vect{X}_i\in\cE^V_k}-\log c^{\cE^L_k}_{M}(\vect{X}_i)
+    m = \argmin_{M\in\{1,\dots,\sampleSize\}}\dfrac{1}{kFraction}\sum_{k=0}^{kFraction-1}\dfrac{1}{\sampleSize}\sum_{\vect{x}_i\in\cE^V_k}-\log c^{\cE^L_k}_{M}(\vect{x}_i)
 
-where :math:`\cE^V_k=\left\{\vect{X}_i\in\cE\,|\,i\equiv k \mod kFraction\right\}` and :math:`\cE^L_k=\cE \backslash \cE^V_k`
+where :math:`\cE^V_k=\left\{\vect{x}_i\in\cE\,|\,i\equiv k \mod kFraction\right\}`
+and :math:`\cE^L_k=\cE \backslash \cE^V_k`.
+
+Note that this optimal :math:`m` does not necessarily divide the sample size :math:`\sampleSize`.
 "
 
 // ---------------------------------------------------------------------
@@ -133,22 +179,27 @@ where :math:`\cE^V_k=\left\{\vect{X}_i\in\cE\,|\,i\equiv k \mod kFraction\right\
 Parameters
 ----------
 sample : 2-d sequence of float, of dimension 1
-    The sample of size :math:`n` from which the optimal AMISE bin number is computed.
+    The sample of size :math:`\sampleSize` from which the optimal AMISE bin number is computed.
 f : :class:`~openturns.Function`
     The function defining the Csiszar divergence of interest.
 alpha : float, :math:`\alpha\geq 0`
     The penalization factor.
 
 Notes
 -----
-Let :math:`\cE=\left\{\vect{X}_1,\dots,\vect{X}_n\right\}` be the given sample. The bin number :math:`m` is given by:
+Let :math:`\cE=(\inputReal_1, \dots, \inputReal_\sampleSize)` be the given sample. The bin number :math:`m` is given by:
 
 .. math::
 
-    m = \argmin_{M\in\{1,\dots,n\}}\left[\hat{D}_f(c^{\cE}_{M})-\dfrac{1}{n}\sum_{\vect{X}_i\in\cE}f\left(\dfrac{1}{c^{\cE}_{M}(\vect{X}_i)}\right)\right]^2-[\rho_S(c^{\cE}_{M})-\rho_S({\cE}_{M})]^2
+    m = \argmin_{M\in\{1,\dots,\sampleSize\}}\left[\hat{D}_f(c^{\cE}_{M})-\dfrac{1}{\sampleSize}\sum_{\vect{x}_i\in\cE}f\left(\dfrac{1}{c^{\cE}_{M}(\vect{x}_i)}\right)\right]^2-[\rho_S(c^{\cE}_{M})-\rho_S({\cE}_{M})]^2
+
+where :math:`c_M^{\cE}` is the density function of the :class:`~openturns.EmpiricalBernsteinCopula` associated to the sample
+:math:`\cE` and the bin number :math:`M`, :math:`\hat{D}_f(c^{\cE}_{M})=\dfrac{1}{N}\sum_{j=1}^Nf\left(\dfrac{1}{\vect{u}_j}\right)` a
+Monte Carlo estimate of the Csiszar :math:`f` divergence, :math:`\rho_S(c^{\cE}_{M})` the exact Spearman correlation of the empirical
+Bernstein copula :math:`c^{\cE}_{M}` and :math:`\rho_S({\cE}_{M})` the empirical Spearman correlation of the sample :math:`{\cE}_{M}`.
 
-where :math:`c_M^{\cE}` is the density function of the :class:`~openturns.EmpiricalBernsteinCopula` associated to the sample :math:`\cE` and the bin number :math:`M`, :math:`\hat{D}_f(c^{\cE}_{M})=\dfrac{1}{N}\sum_{j=1}^Nf\left(\dfrac{1}{\vect{U}_j}\right)` a Monte Carlo estimate of the Csiszar :math:`f` divergence, :math:`\rho_S(c^{\cE}_{M})` the exact Spearman correlation of the empirical Bernstein copula :math:`c^{\cE}_{M}` and :math:`\rho_S({\cE}_{M})` the empirical Spearman correlation of the sample :math:`{\cE}_{M}`.
+The parameter :math:`N` is controlled by the *BernsteinCopulaFactory-SamplingSize* key in :class:`~openturns.ResourceMap`.
 
-The parameter :math:`N` is controlled by the *'BernsteinCopulaFactory-SamplingSize'* key in :class:`~openturns.ResourceMap`.
+Note that this optimal :math:`m` does not necessarily divide the sample size :math:`\sampleSize`.
 "