-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: multi-node-spark-on-slurm.sh
71 lines (57 loc) · 2.01 KB
/
multi-node-spark-on-slurm.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/bin/bash
# Multi-node standalone Spark cluster on Slurm.
#
# Starts a Spark master on the first allocated node, launches one worker per
# Slurm task across the allocation, submits the bundled pi.py example to the
# cluster, then tears everything down.
#
# Usage:  sbatch multi-node-spark-on-slurm.sh
# Requires: a `spark` environment module providing $SPARK_HOME.

# 1. Request Slurm resources
#SBATCH --nodes=4
#SBATCH --mem-per-cpu=4G
#SBATCH --cpus-per-task=4
#SBATCH --ntasks-per-node=2
#SBATCH --output=sparkjob-%j.output

# 2. Spark environment
module load spark
# spark-daemon.sh names its log files after SPARK_IDENT_STRING (the original
# script exported the misspelled SPARK_ID_STRING, which Spark ignores, so the
# log-file greps below could never match). Tie it to the job id so each job
# gets uniquely named logs.
export SPARK_IDENT_STRING=$SLURM_JOBID
export SPARK_WORKER_DIR=$HOME/.spark/worker
export SPARK_LOG_DIR=$HOME/.spark/logs
export SPARK_LOCAL_DIRS=/tmp/spark
printf 'SPARK_WORKER_DIR=%s\n'  "$SPARK_WORKER_DIR"
printf 'SPARK_LOG_DIR=%s\n'     "$SPARK_LOG_DIR"
printf 'SPARK_LOCAL_DIRS=%s\n'  "$SPARK_LOCAL_DIRS"
mkdir -p "$SPARK_WORKER_DIR" "$SPARK_LOG_DIR" "$SPARK_LOCAL_DIRS"

# 3. Start the Spark master on this (batch) node
echo "start Spark master..."
"$SPARK_HOME/sbin/start-master.sh"
sleep 2
# The master log prints both the spark:// service URL and the http:// web UI
# URL; the glob after $SPARK_LOG_DIR must stay unquoted so it expands.
MASTER_URL=$(grep -oP '(?=spark://).*' "$SPARK_LOG_DIR"/spark-"${SPARK_IDENT_STRING}"-org.*master*.out)
printf 'MASTER_URL=%s\n' "$MASTER_URL"
URL1=$(grep -oP 'http://\K[^ ]+' "$SPARK_LOG_DIR"/spark-"${SPARK_IDENT_STRING}"-org.*master*.out)
# Short hostname of the master (original did `echo URL1`, dropping the `$`,
# so the scontrol lookup below always failed).
HOSTNAME1=$(echo "$URL1" | cut -d. -f1)
IP1=$(scontrol show node "$HOSTNAME1" | grep -oP 'NodeAddr=\K[^ ]+')
PORT1=$(echo "$URL1" | cut -d: -f2)
echo "----------- Master webUI is : --------------"
# Original printed the literal string PORT1 (missing `$`).
echo "http://$IP1:$PORT1"
echo "--------------------------------------------"

# 4. Start worker nodes
export SPARK_WORKER_CORES=${SLURM_CPUS_PER_TASK:-1}
export SPARK_MEM=$(( ${SLURM_MEM_PER_CPU:-4096} * ${SLURM_CPUS_PER_TASK:-1} ))M
export SPARK_DAEMON_MEMORY=$SPARK_MEM
export SPARK_WORKER_MEMORY=$SPARK_MEM
export SPARK_EXECUTOR_MEMORY=$SPARK_MEM
# Keep workers in the foreground so srun keeps the job step alive; launch one
# worker per task across the allocation, backgrounded so we can spark-submit.
export SPARK_NO_DAEMONIZE=1
srun --output="$SPARK_LOG_DIR/spark-%j-workers.out" --label \
  "$SPARK_HOME/sbin/start-slave.sh" "$MASTER_URL" &

# 5. Submit a task to the Spark cluster
"$SPARK_HOME/bin/spark-submit" --master "$MASTER_URL" \
  --total-executor-cores $(( SLURM_NTASKS * SLURM_CPUS_PER_TASK )) \
  "$SPARK_HOME/examples/src/main/python/pi.py" 1000
sleep 2

# 6. Clean up
# Stop the Spark workers (job step 0 is the srun above).
scancel "${SLURM_JOBID}.0"
sleep 2
# Stop the Spark master.
"$SPARK_HOME/sbin/stop-master.sh"

# To submit this script to Slurm:
#   sbatch multi-node-spark-on-slurm.sh
# (The original ran `sbatch muli-nodes.sh` here as a live command — a typo'd
# filename that would also have resubmitted a job from inside the job.)