From f8c800195cf60cda9121b6beb20e7b086d12b284 Mon Sep 17 00:00:00 2001 From: Matt Pawelczyk Date: Tue, 15 Oct 2024 15:37:43 +0000 Subject: [PATCH 1/4] Account that k8s pods may be uncounted yet. --- lib/galaxy/jobs/runners/kubernetes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index ce94e12e5673..6a60065de3d4 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -772,7 +772,10 @@ def check_watched_item(self, job_state): # as probably this means that the k8s API server hasn't # had time to fill in the object status since the # job was created only too recently. - if len(job.obj["status"]) == 0: + # It is possible that k8s didn't account for the status of the pods + # and they are in the uncountedTerminatedPods status. In this + # case we also need to wait a moment + if len(job.obj["status"]) == 0 or 'uncountedTerminatedPods' in job.obj["status"]: return job_state if "succeeded" in job.obj["status"]: succeeded = job.obj["status"]["succeeded"] From 4b8b23a3511a22843ddbf8ecb011109255c74e09 Mon Sep 17 00:00:00 2001 From: Matt Pawelczyk Date: Tue, 15 Oct 2024 17:00:23 +0000 Subject: [PATCH 2/4] Fix linting --- lib/galaxy/jobs/runners/kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index 6a60065de3d4..fcef3b00b2c3 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -775,7 +775,7 @@ def check_watched_item(self, job_state): # It is possible that k8s didn't account for the status of the pods # and they are in the uncountedTerminatedPods status. In this # case we also need to wait a moment - if len(job.obj["status"]) == 0 or 'uncountedTerminatedPods' in job.obj["status"]: + if len(job.obj["status"]) == 0 or "uncountedTerminatedPods" in job.obj["status"]: return job_state if "succeeded" in job.obj["status"]: succeeded = job.obj["status"]["succeeded"] From 2944d57257246c5d24e574711e38c74858ed6909 Mon Sep 17 00:00:00 2001 From: Matt Pawelczyk <125464188+mapk-amazon@users.noreply.github.com> Date: Wed, 16 Oct 2024 20:33:45 +0200 Subject: [PATCH 3/4] Update lib/galaxy/jobs/runners/kubernetes.py Co-authored-by: Marius van den Beek --- lib/galaxy/jobs/runners/kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index fcef3b00b2c3..bc85bd85669c 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -775,7 +775,7 @@ def check_watched_item(self, job_state): # It is possible that k8s didn't account for the status of the pods # and they are in the uncountedTerminatedPods status. In this # case we also need to wait a moment - if len(job.obj["status"]) == 0 or "uncountedTerminatedPods" in job.obj["status"]: + if len(job.obj["status"]) == 0 or in job.obj["status"].get("uncountedTerminatedPods"): return job_state if "succeeded" in job.obj["status"]: succeeded = job.obj["status"]["succeeded"] From 3576afd23209b3f305bf1e8e246c384129c1a5df Mon Sep 17 00:00:00 2001 From: Matt Pawelczyk <125464188+mapk-amazon@users.noreply.github.com> Date: Wed, 16 Oct 2024 21:02:27 +0200 Subject: [PATCH 4/4] Update lib/galaxy/jobs/runners/kubernetes.py Co-authored-by: Nuwan Goonasekera <2070605+nuwang@users.noreply.github.com> --- lib/galaxy/jobs/runners/kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index bc85bd85669c..7d2180d8253d 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -775,7 +775,7 @@ def check_watched_item(self, job_state): # It is possible that k8s didn't account for the status of the pods # and they are in the uncountedTerminatedPods status. In this # case we also need to wait a moment - if len(job.obj["status"]) == 0 or in job.obj["status"].get("uncountedTerminatedPods"): + if len(job.obj["status"]) == 0 or job.obj["status"].get("uncountedTerminatedPods"): return job_state if "succeeded" in job.obj["status"]: succeeded = job.obj["status"]["succeeded"]