Skip to content

Commit

Permalink
Fix estimation of censoring distribution for tied times with events
Browse files Browse the repository at this point in the history
Adds a reverse argument to kaplan_meier_estimator

When estimating the censoring distribution, we now consider events
to occur before censoring. For tied time points with an event, those
with an event are not considered at risk anymore and subtracted from
the demoniator of the Kaplan-Meier estimator.

The change affects all functions relying on inverse probability
of censoring weights, namely:
- sksurv.nonparametric.CensoringDistributionEstimator
- sksurv.nonparametric.ipc_weights
- sksurv.linear_model.IPCRidge
- sksurv.metrics.cumulative_dynamic_auc
- sksurv.metrics.concordance_index_ipcw
  • Loading branch information
sebp committed Jun 3, 2020
1 parent b71ba66 commit 3bb20a7
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 16 deletions.
2 changes: 1 addition & 1 deletion sksurv/linear_model/coxph.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def fit(self, linear_predictor, event, time):
risk_score = numpy.exp(linear_predictor)
order = numpy.argsort(time, kind="mergesort")
risk_score = risk_score[order]
uniq_times, n_events, n_at_risk = _compute_counts(event, time, order)
uniq_times, n_events, n_at_risk, _ = _compute_counts(event, time, order)

divisor = numpy.empty(n_at_risk.shape, dtype=numpy.float_)
value = numpy.sum(risk_score)
Expand Down
33 changes: 26 additions & 7 deletions sksurv/nonparametric.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ def _compute_counts(event, time, order=None):
Number of events at each time point.
n_at_risk : array
Number of samples that are censored or have an event at each time point.
Number of samples that have not been censored or have not had an event at each time point.
n_censored : array
Number of censored samples at each time point.
"""
n_samples = event.shape[0]

Expand Down Expand Up @@ -86,12 +89,13 @@ def _compute_counts(event, time, order=None):
times = numpy.resize(uniq_times, j)
n_events = numpy.resize(uniq_events, j)
total_count = numpy.resize(uniq_counts, j)
n_censored = total_count - n_events

# offset cumulative sum by one
total_count = numpy.concatenate(([0], total_count))
n_at_risk = n_samples - numpy.cumsum(total_count)

return times, n_events, n_at_risk[:-1]
return times, n_events, n_at_risk[:-1], n_censored


def _compute_counts_truncated(event, time_enter, time_exit):
Expand Down Expand Up @@ -167,7 +171,7 @@ def _compute_counts_truncated(event, time_enter, time_exit):
return uniq_times, event_counts, total_counts


def kaplan_meier_estimator(event, time_exit, time_enter=None, time_min=None):
def kaplan_meier_estimator(event, time_exit, time_enter=None, time_min=None, reverse=False):
"""Kaplan-Meier estimator of survival function.
See [1]_ for further description.
Expand All @@ -188,6 +192,13 @@ def kaplan_meier_estimator(event, time_exit, time_enter=None, time_min=None):
Compute estimator conditional on survival at least up to
the specified time.
reverse : bool, optional, default: False
Whether to estimate the censoring distribution.
When there are ties between times at which events are observed,
then events come first and are subtracted from the denominator.
Only available for right-censored data, i.e. `time_enter` must
be None.
Returns
-------
time : array, shape = (n_times,)
Expand Down Expand Up @@ -215,11 +226,19 @@ def kaplan_meier_estimator(event, time_exit, time_enter=None, time_min=None):
check_consistent_length(event, time_enter, time_exit)

if time_enter is None:
uniq_times, n_events, n_at_risk = _compute_counts(event, time_exit)
uniq_times, n_events, n_at_risk, n_censored = _compute_counts(event, time_exit)

if reverse:
n_at_risk -= n_events
n_events = n_censored
else:
if reverse:
raise ValueError("The censoring distribution cannot be estimated from left truncated data")

uniq_times, n_events, n_at_risk = _compute_counts_truncated(event, time_enter, time_exit)

values = 1 - n_events / n_at_risk
values[n_events == 0] = 1.0 # in case of 0/0 = nan

if time_min is not None:
mask = uniq_times >= time_min
Expand Down Expand Up @@ -261,7 +280,7 @@ def nelson_aalen_estimator(event, time):
"""
event, time = check_y_survival(event, time)
check_consistent_length(event, time)
uniq_times, n_events, n_at_risk = _compute_counts(event, time)
uniq_times, n_events, n_at_risk, _ = _compute_counts(event, time)

y = numpy.cumsum(n_events / n_at_risk)

Expand All @@ -287,7 +306,7 @@ def ipc_weights(event, time):
if event.all():
return numpy.ones(time.shape[0])

unique_time, p = kaplan_meier_estimator(~event, time)
unique_time, p = kaplan_meier_estimator(event, time, reverse=True)

idx = numpy.searchsorted(unique_time, time[event])
Ghat = p[idx]
Expand Down Expand Up @@ -390,7 +409,7 @@ def fit(self, y):
self.unique_time_ = numpy.unique(time)
self.prob_ = numpy.ones(self.unique_time_.shape[0])
else:
unique_time, prob = kaplan_meier_estimator(~event, time)
unique_time, prob = kaplan_meier_estimator(event, time, reverse=True)
self.unique_time_ = numpy.concatenate(([-numpy.infty], unique_time))
self.prob_ = numpy.concatenate(([1.], prob))

Expand Down
8 changes: 4 additions & 4 deletions tests/test_aft.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ def test_fit(make_whas500):
model = IPCRidge()
model.fit(whas500.x, whas500.y)

assert round(abs(model.intercept_ - 5.8673567124629571), 7) == 0
expected = numpy.array([0.168517, -0.249717, 2.18515, 0.536795, -0.514571, 0.091203,
0.613006, 0.480385, -0.055949, 0.238529, -0.127148, -0.144134,
-1.625041, -0.217469])
assert round(abs(model.intercept_ - 5.867520370855396), 7) == 0
expected = numpy.array([0.168481, -0.24962, 2.185086, 0.53682, -0.514611, 0.09124,
0.613114, 0.480357, -0.055972, 0.238472, -0.127209, -0.144063,
-1.625081, -0.217591])
assert_array_almost_equal(model.coef_, expected)

@staticmethod
Expand Down
6 changes: 3 additions & 3 deletions tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,17 +304,17 @@ def uno_c_data(request, whas500_pred):
event=[False, True, False, True, True, False, True, False, False, True],
time=[1, 5, 6, 11, 11, 34, 45, 45, 50, 55])
estimate = (5, 8, 11, 19, 34, 12, 3, 9, 12, 18)
expected = (0.4401618580, 11, 10, 0, 1)
expected = (0.4036321031048623, 11, 10, 0, 1)
elif p == 'tied_event_and_time':
y = Surv.from_arrays(
event=[True, False, False, False, True, False, True, True, False, False, False, True, True],
time=[34, 11, 11, 5, 1, 89, 13, 45, 7, 13, 9, 13, 90])
estimate = (1, 19, 13, 13, 15, 14, 19, 23, 11, 10, 11, 1, 18)
expected = (0.4722222222, 14, 12, 1, 2)
expected = (0.46795357052737824, 14, 12, 1, 2)
elif p == 'whas500':
event, time, estimate = whas500_pred
y = Surv.from_arrays(event, time)
expected = (0.7929001679258981, 57849, 17300, 0, 14)
expected = (0.7929275009049014, 57849, 17300, 0, 14)

y_train = y if y_train is None else y_train
y_test = y if y_test is None else y_test
Expand Down
29 changes: 28 additions & 1 deletion tests/test_nonparametric.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
import pandas
import pytest

from sksurv.nonparametric import kaplan_meier_estimator, nelson_aalen_estimator, SurvivalFunctionEstimator
from sksurv.nonparametric import (
CensoringDistributionEstimator,
kaplan_meier_estimator,
nelson_aalen_estimator,
SurvivalFunctionEstimator)
from sksurv.util import Surv

CHANNING_FILE = join(dirname(__file__), 'data', 'channing.csv')
Expand Down Expand Up @@ -447,6 +451,29 @@ def test_right_truncated_adults(make_aids):
assert_array_almost_equal(-x[::-1], true_x, 2)
assert_array_almost_equal(y[::-1], true_y, 2)

@staticmethod
def test_censoring_distribution():
y = Surv.from_arrays(numpy.array([1, 0, 0, 1, 0, 1, 0, 1, 1, 0], dtype=bool),
numpy.array([1, 2, 3, 3, 3, 4, 5, 5, 6, 7]))

cens = CensoringDistributionEstimator().fit(y)

probs = cens.predict_proba(numpy.arange(1, 8))
expected = numpy.array([1.0, 0.8888889, 0.6349206, 0.6349206, 0.4232804, 0.4232804, 0.0000000])

assert_array_almost_equal(expected, probs)

@staticmethod
def test_truncated_reverse_error():
rnd = numpy.random.RandomState(2016)
time_exit = rnd.uniform(1, 100, size=25)
time_enter = time_exit + 1
event = rnd.binomial(1, 0.6, size=25).astype(bool)

with pytest.raises(ValueError,
match="The censoring distribution cannot be estimated from left truncated data"):
kaplan_meier_estimator(event, time_exit, time_enter, reverse=True)


class TestNelsonAalen(object):

Expand Down

0 comments on commit 3bb20a7

Please sign in to comment.