From 9b3bc4afbaf722752b151b4cc2244fc3eac92c77 Mon Sep 17 00:00:00 2001 From: Scott Votaw Date: Sat, 4 Nov 2023 14:29:19 -0700 Subject: [PATCH] Improve LightGBM Network logs --- .../synapse/ml/lightgbm/NetworkManager.scala | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/NetworkManager.scala b/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/NetworkManager.scala index 4644d49e2b..89aef07a50 100644 --- a/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/NetworkManager.scala +++ b/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/NetworkManager.scala @@ -125,11 +125,11 @@ object NetworkManager { } private def getNetworkTopologyInfoFromDriver(networkParams: NetworkParams, - taskId: Long, - partitionId: Int, - localListenPort: Int, - log: Logger, - shouldExecuteTraining: Boolean): NetworkTopologyInfo = { + taskId: Long, + partitionId: Int, + localListenPort: Int, + log: Logger, + shouldExecuteTraining: Boolean): NetworkTopologyInfo = { using(new Socket(networkParams.ipAddress, networkParams.port)) { driverSocket => usingMany(Seq(new BufferedReader(new InputStreamReader(driverSocket.getInputStream)), @@ -163,6 +163,12 @@ object NetworkManager { // and a list of partition ids in this executor. val lightGbmMachineList = driverInput.readLine() val partitionsByExecutorStr = driverInput.readLine() + if (partitionsByExecutorStr == null || lightGbmMachineList == null) { + val message = s"Received bad network information. Task $taskId, partition $partitionId received" + + s"partition topology: '$partitionsByExecutorStr', nodes for network init: '$lightGbmMachineList'" + throw new Exception(message) + } + log.info(s"task $taskId, partition $partitionId received partition topology: '$partitionsByExecutorStr'") log.info(s"task $taskId, partition $partitionId received nodes for network init: '$lightGbmMachineList'") val executorPartitionIds: Array[Int] =