Fix estimation of censoring distribution for tied times with events

Adds a reverse argument to kaplan_meier_estimator When estimating the censoring distribution, we now consider events to occur before censoring. For tied time points with an event, those with an event are not considered at risk anymore and subtracted from the demoniator of the Kaplan-Meier estimator. The change affects all functions relying on inverse probability of censoring weights, namely: - sksurv.nonparametric.CensoringDistributionEstimator - sksurv.nonparametric.ipc_weights - sksurv.linear_model.IPCRidge - sksurv.metrics.cumulative_dynamic_auc - sksurv.metrics.concordance_index_ipcw
sebp · Jun 3, 2020 · 3bb20a7 · 3bb20a7
1 parent b71ba66
commit 3bb20a7
Show file tree

Hide file tree

Showing 5 changed files with 62 additions and 16 deletions.
diff --git a/sksurv/linear_model/coxph.py b/sksurv/linear_model/coxph.py
@@ -60,7 +60,7 @@ def fit(self, linear_predictor, event, time):
         risk_score = numpy.exp(linear_predictor)
         order = numpy.argsort(time, kind="mergesort")
         risk_score = risk_score[order]
-        uniq_times, n_events, n_at_risk = _compute_counts(event, time, order)
+        uniq_times, n_events, n_at_risk, _ = _compute_counts(event, time, order)
 
         divisor = numpy.empty(n_at_risk.shape, dtype=numpy.float_)
         value = numpy.sum(risk_score)

diff --git a/sksurv/nonparametric.py b/sksurv/nonparametric.py
@@ -49,7 +49,10 @@ def _compute_counts(event, time, order=None):
         Number of events at each time point.
 
     n_at_risk : array
-        Number of samples that are censored or have an event at each time point.
+        Number of samples that have not been censored or have not had an event at each time point.
+
+    n_censored : array
+        Number of censored samples at each time point.
     """
     n_samples = event.shape[0]
 
@@ -86,12 +89,13 @@ def _compute_counts(event, time, order=None):
     times = numpy.resize(uniq_times, j)
     n_events = numpy.resize(uniq_events, j)
     total_count = numpy.resize(uniq_counts, j)
+    n_censored = total_count - n_events
 
     # offset cumulative sum by one
     total_count = numpy.concatenate(([0], total_count))
     n_at_risk = n_samples - numpy.cumsum(total_count)
 
-    return times, n_events, n_at_risk[:-1]
+    return times, n_events, n_at_risk[:-1], n_censored
 
 
 def _compute_counts_truncated(event, time_enter, time_exit):
@@ -167,7 +171,7 @@ def _compute_counts_truncated(event, time_enter, time_exit):
     return uniq_times, event_counts, total_counts
 
 
-def kaplan_meier_estimator(event, time_exit, time_enter=None, time_min=None):
+def kaplan_meier_estimator(event, time_exit, time_enter=None, time_min=None, reverse=False):
     """Kaplan-Meier estimator of survival function.
 
     See [1]_ for further description.
@@ -188,6 +192,13 @@ def kaplan_meier_estimator(event, time_exit, time_enter=None, time_min=None):
         Compute estimator conditional on survival at least up to
         the specified time.
 
+    reverse : bool, optional, default: False
+        Whether to estimate the censoring distribution.
+        When there are ties between times at which events are observed,
+        then events come first and are subtracted from the denominator.
+        Only available for right-censored data, i.e. `time_enter` must
+        be None.
+
     Returns
     -------
     time : array, shape = (n_times,)
@@ -215,11 +226,19 @@ def kaplan_meier_estimator(event, time_exit, time_enter=None, time_min=None):
     check_consistent_length(event, time_enter, time_exit)
 
     if time_enter is None:
-        uniq_times, n_events, n_at_risk = _compute_counts(event, time_exit)
+        uniq_times, n_events, n_at_risk, n_censored = _compute_counts(event, time_exit)
+
+        if reverse:
+            n_at_risk -= n_events
+            n_events = n_censored
     else:
+        if reverse:
+            raise ValueError("The censoring distribution cannot be estimated from left truncated data")
+
         uniq_times, n_events, n_at_risk = _compute_counts_truncated(event, time_enter, time_exit)
 
     values = 1 - n_events / n_at_risk
+    values[n_events == 0] = 1.0  # in case of 0/0 = nan
 
     if time_min is not None:
         mask = uniq_times >= time_min
@@ -261,7 +280,7 @@ def nelson_aalen_estimator(event, time):
     """
     event, time = check_y_survival(event, time)
     check_consistent_length(event, time)
-    uniq_times, n_events, n_at_risk = _compute_counts(event, time)
+    uniq_times, n_events, n_at_risk, _ = _compute_counts(event, time)
 
     y = numpy.cumsum(n_events / n_at_risk)
 
@@ -287,7 +306,7 @@ def ipc_weights(event, time):
     if event.all():
         return numpy.ones(time.shape[0])
 
-    unique_time, p = kaplan_meier_estimator(~event, time)
+    unique_time, p = kaplan_meier_estimator(event, time, reverse=True)
 
     idx = numpy.searchsorted(unique_time, time[event])
     Ghat = p[idx]
@@ -390,7 +409,7 @@ def fit(self, y):
             self.unique_time_ = numpy.unique(time)
             self.prob_ = numpy.ones(self.unique_time_.shape[0])
         else:
-            unique_time, prob = kaplan_meier_estimator(~event, time)
+            unique_time, prob = kaplan_meier_estimator(event, time, reverse=True)
             self.unique_time_ = numpy.concatenate(([-numpy.infty], unique_time))
             self.prob_ = numpy.concatenate(([1.], prob))
 

diff --git a/tests/test_aft.py b/tests/test_aft.py
@@ -13,10 +13,10 @@ def test_fit(make_whas500):
         model = IPCRidge()
         model.fit(whas500.x, whas500.y)
 
-        assert round(abs(model.intercept_ - 5.8673567124629571), 7) == 0
-        expected = numpy.array([0.168517, -0.249717, 2.18515, 0.536795, -0.514571, 0.091203,
-                                0.613006, 0.480385, -0.055949, 0.238529, -0.127148, -0.144134,
-                                -1.625041, -0.217469])
+        assert round(abs(model.intercept_ - 5.867520370855396), 7) == 0
+        expected = numpy.array([0.168481, -0.24962, 2.185086, 0.53682, -0.514611, 0.09124,
+                                0.613114, 0.480357, -0.055972, 0.238472, -0.127209, -0.144063,
+                                -1.625081, -0.217591])
         assert_array_almost_equal(model.coef_, expected)
 
     @staticmethod

diff --git a/tests/test_metrics.py b/tests/test_metrics.py
@@ -304,17 +304,17 @@ def uno_c_data(request, whas500_pred):
             event=[False, True, False, True, True, False, True, False, False, True],
             time=[1, 5, 6, 11, 11, 34, 45, 45, 50, 55])
         estimate = (5, 8, 11, 19, 34, 12, 3, 9, 12, 18)
-        expected = (0.4401618580, 11, 10, 0, 1)
+        expected = (0.4036321031048623, 11, 10, 0, 1)
     elif p == 'tied_event_and_time':
         y = Surv.from_arrays(
             event=[True, False, False, False, True, False, True, True, False, False, False, True, True],
             time=[34, 11, 11, 5, 1, 89, 13, 45, 7, 13, 9, 13, 90])
         estimate = (1, 19, 13, 13, 15, 14, 19, 23, 11, 10, 11, 1, 18)
-        expected = (0.4722222222, 14, 12, 1, 2)
+        expected = (0.46795357052737824, 14, 12, 1, 2)
     elif p == 'whas500':
         event, time, estimate = whas500_pred
         y = Surv.from_arrays(event, time)
-        expected = (0.7929001679258981, 57849, 17300, 0, 14)
+        expected = (0.7929275009049014, 57849, 17300, 0, 14)
 
     y_train = y if y_train is None else y_train
     y_test = y if y_test is None else y_test

diff --git a/tests/test_nonparametric.py b/tests/test_nonparametric.py
@@ -5,7 +5,11 @@
 import pandas
 import pytest
 
-from sksurv.nonparametric import kaplan_meier_estimator, nelson_aalen_estimator, SurvivalFunctionEstimator
+from sksurv.nonparametric import (
+    CensoringDistributionEstimator,
+    kaplan_meier_estimator,
+    nelson_aalen_estimator,
+    SurvivalFunctionEstimator)
 from sksurv.util import Surv
 
 CHANNING_FILE = join(dirname(__file__), 'data', 'channing.csv')
@@ -447,6 +451,29 @@ def test_right_truncated_adults(make_aids):
         assert_array_almost_equal(-x[::-1], true_x, 2)
         assert_array_almost_equal(y[::-1], true_y, 2)
 
+    @staticmethod
+    def test_censoring_distribution():
+        y = Surv.from_arrays(numpy.array([1, 0, 0, 1, 0, 1, 0, 1, 1, 0], dtype=bool),
+                             numpy.array([1, 2, 3, 3, 3, 4, 5, 5, 6, 7]))
+
+        cens = CensoringDistributionEstimator().fit(y)
+
+        probs = cens.predict_proba(numpy.arange(1, 8))
+        expected = numpy.array([1.0, 0.8888889, 0.6349206, 0.6349206, 0.4232804, 0.4232804, 0.0000000])
+
+        assert_array_almost_equal(expected, probs)
+
+    @staticmethod
+    def test_truncated_reverse_error():
+        rnd = numpy.random.RandomState(2016)
+        time_exit = rnd.uniform(1, 100, size=25)
+        time_enter = time_exit + 1
+        event = rnd.binomial(1, 0.6, size=25).astype(bool)
+
+        with pytest.raises(ValueError,
+                           match="The censoring distribution cannot be estimated from left truncated data"):
+            kaplan_meier_estimator(event, time_exit, time_enter, reverse=True)
+
 
 class TestNelsonAalen(object):