add bench example for machine learning images
Signed-off-by: Bin Tang <[email protected]>
sctb512 committed Apr 3, 2023
1 parent 2a2e380 commit 677ae05
Showing 7 changed files with 187 additions and 17 deletions.
57 changes: 57 additions & 0 deletions bench_ml.yaml
@@ -0,0 +1,57 @@
CMD_ARG:
- bench_args:
    envs:
    - key: NVIDIA_VISIBLE_DEVICES
      value: 'all'
    arg: ./test.sh
    stdin_sh: sh
    mount:
    - container_path: /pytorch
      host_path: misc/mount/pytorch
    runtime: nvidia-container-runtime
    shm_size: 8gb
    work_dir: /pytorch
  category: machine-learning
  image: pytorch
  repo: ml_platform
- bench_args:
    envs:
    - key: NVIDIA_VISIBLE_DEVICES
      value: 'all'
    arg: python tf_cnn_benchmarks.py --batch_size=32 --model=resnet152_v2 --variable_update=parameter_server
    stdin_sh: sh
    mount:
    - container_path: /tf_cnn_benchmarks
      host_path: misc/mount/tf_cnn_benchmarks
    runtime: nvidia-container-runtime
    shm_size: 8gb
    work_dir: /tf_cnn_benchmarks
  category: machine-learning
  image: tensorflow
  repo: ml_platform
- bench_args:
    envs:
    - key: NVIDIA_VISIBLE_DEVICES
      value: 'all'
    runtime: nvidia-container-runtime
    shm_size: 8gb
  category: machine-learning
  image: chat-bench
  repo: ml_platform

CMD_URL_WAIT:
- bench_args:
    envs:
    - key: NVIDIA_VISIBLE_DEVICES
      value: 'all'
    arg: tritonserver --model-repository=/models
    stdin_sh: sh
    mount:
    - container_path: /models
      host_path: misc/mount/model_repository
    runtime: nvidia-container-runtime
    shm_size: 8gb
    wait_url: http://127.0.0.1:8000/v2/health/ready
  category: machine-learning
  image: tritonserver
  repo: ml_platform
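
For context, a minimal sketch of reading this config; it is not part of the commit and assumes only PyYAML plus the keys visible above (CMD_ARG, CMD_URL_WAIT and the per-entry bench_args, image, repo, category).

# Sketch only: list the benchmark entries defined in bench_ml.yaml.
import yaml  # assumes PyYAML is installed

with open("bench_ml.yaml") as f:
    config = yaml.safe_load(f)

for mode in ("CMD_ARG", "CMD_URL_WAIT"):
    for entry in config.get(mode, []):
        print(mode, f'{entry["repo"]}/{entry["image"]}', entry["category"])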
73 changes: 59 additions & 14 deletions hello.py
@@ -132,7 +132,7 @@ def timer(cmd):

class RunArgs:
def __init__(
self, env={}, arg="", stdin="", stdin_sh="sh", waitline="", mount=[], waitURL=""
self, env={}, arg="", stdin="", stdin_sh="sh", waitline="", mount=[], waitURL="", runtime="", shmSize="", workDir=""
):
self.env = env
self.arg = arg
@@ -141,6 +141,9 @@ def __init__(
self.waitline = waitline
self.mount = mount
self.waitURL = waitURL
self.runtime = runtime
self.shmSize = shmSize
self.workDir = workDir


class Docker:
@@ -281,14 +284,17 @@ def load_bench_config(self):
args = line["bench_args"]
print(f"CMD_ARG_WAIT image: {name}, args: {args}")
cmd_arg_wait_runner[name] = RunArgs(
env=args["envs"] if "envs" in args else {},
env=dict([(item["key"], item["value"]) for item in args["envs"]]) if "envs" in args else {},
waitline=args["wait_line"] if "wait_line" in args else "",
mount=[(m["host_path"], m["container_path"]) for m in args["mount"]]
if "mount" in args
else [],
arg=args["arg"] if "arg" in args else "",
stdin=args["stdin"] if "stdin" in args else "",
stdin_sh=args["stdin_sh"] if "stdin_sh" in args else "",
runtime=args["runtime"] if "runtime" in args else "",
shmSize=args["shm_size"] if "shm_size" in args else "",
workDir=args["work_dir"] if "work_dir" in args else "",
)
cmd_arg_wait[name] = Bench(name, line["category"])

@@ -300,13 +306,16 @@ def load_bench_config(self):
args = line["bench_args"]
print(f"CMD_STDIN image: {name}, args: {args}")
cmd_stdin_runner[name] = RunArgs(
env=args["envs"] if "envs" in args else {},
env=dict([(item["key"], item["value"]) for item in args["envs"]]) if "envs" in args else {},
mount=[(m["host_path"], m["container_path"]) for m in args["mount"]]
if "mount" in args
else [],
arg=args["arg"] if "arg" in args else "",
stdin=args["stdin"] if "stdin" in args else "",
stdin_sh=args["stdin_sh"] if "stdin_sh" in args else "",
runtime=args["runtime"] if "runtime" in args else "",
shmSize=args["shm_size"] if "shm_size" in args else "",
workDir=args["work_dir"] if "work_dir" in args else "",
)
cmd_stdin[name] = Bench(name, line["category"])

@@ -318,13 +327,16 @@ def load_bench_config(self):
args = line["bench_args"]
print(f"CMD_ARG image: {name}, args: {args}")
cmd_arg_runner[name] = RunArgs(
env=args["envs"] if "envs" in args else {},
env=dict([(item["key"], item["value"]) for item in args["envs"]]) if "envs" in args else {},
mount=[(m["host_path"], m["container_path"]) for m in args["mount"]]
if "mount" in args
else [],
arg=args["arg"] if "arg" in args else "",
stdin=args["stdin"] if "stdin" in args else "",
stdin_sh=args["stdin_sh"] if "stdin_sh" in args else "",
runtime=args["runtime"] if "runtime" in args else "",
shmSize=args["shm_size"] if "shm_size" in args else "",
workDir=args["work_dir"] if "work_dir" in args else "",
)
cmd_arg[name] = Bench(name, line["category"])

@@ -336,14 +348,17 @@ def load_bench_config(self):
args = line["bench_args"]
print(f"CMD_URL_WAIT image: {name}, args: {args}")
cmd_url_wait_runner[name] = RunArgs(
env=args["envs"] if "envs" in args else {},
env=dict([(item["key"], item["value"]) for item in args["envs"]]) if "envs" in args else {},
waitURL=args["wait_url"] if "wait_url" in args else "",
mount=[(m["host_path"], m["container_path"]) for m in args["mount"]]
if "mount" in args
else [],
arg=args["arg"] if "arg" in args else "",
stdin=args["stdin"] if "stdin" in args else "",
stdin_sh=args["stdin_sh"] if "stdin_sh" in args else "",
runtime=args["runtime"] if "runtime" in args else "",
shmSize=args["shm_size"] if "shm_size" in args else "",
workDir=args["work_dir"] if "work_dir" in args else "",
)
cmd_url_wait[name] = Bench(name, line["category"])
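
Since the same loading pattern is repeated for each command mode above, here is an illustrative restatement of the shared part as a standalone helper; the helper name to_runargs is hypothetical and the per-mode wait_line / wait_url handling is omitted.

# Sketch of the shared bench_args -> RunArgs mapping used by load_bench_config:
# envs is folded from a list of {key, value} items into a dict, mounts into
# (host_path, container_path) tuples, and the new runtime / shm_size / work_dir
# keys default to "" when absent.
def to_runargs(args: dict) -> RunArgs:
    return RunArgs(
        env={item["key"]: item["value"] for item in args.get("envs", [])},
        mount=[(m["host_path"], m["container_path"]) for m in args.get("mount", [])],
        arg=args.get("arg", ""),
        stdin=args.get("stdin", ""),
        stdin_sh=args.get("stdin_sh", ""),
        runtime=args.get("runtime", ""),
        shmSize=args.get("shm_size", ""),
        workDir=args.get("work_dir", ""),
    )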

@@ -391,8 +406,6 @@ def run_echo_hello(self, repo: str):
return pull_elapsed, create_elapsed, run_elapsed

def run_cmd_arg(self, repo, runargs):
assert len(runargs.mount) == 0

image_ref = self.image_ref(repo)
container_name = repo.replace(":", "-") + random_chars()

@@ -593,7 +606,21 @@ def create_echo_hello_cmd(self, image_ref, container_id):
return f"nerdctl --snapshotter {self.snapshotter} create --net=host --name={container_id} {image_ref} -- echo hello"

def create_cmd_arg_cmd(self, image_ref, container_id, runargs):
cmd = f"nerdctl --snapshotter {self.snapshotter} create --net=host --name={container_id} {image_ref} "
cmd = f"nerdctl --snapshotter {self.snapshotter} create --net=host "
if len(runargs.env) > 0:
env = " ".join(["--env %s=%s" % (k, v) for k, v in runargs.env.items()])
cmd += f" {env} "
for a, b in runargs.mount:
a = os.path.join(os.path.dirname(os.path.abspath(__file__)), a)
a = tmp_copy(a)
cmd += f"--volume {a}:{b} "
if len(runargs.runtime) > 0:
cmd += f"--runtime {runargs.runtime} "
if len(runargs.shmSize) > 0:
cmd += f"--shm-size {runargs.shmSize} "
if len(runargs.workDir) > 0:
cmd += f"-w {runargs.workDir} "
cmd += f"--name={container_id} {image_ref} "
return cmd + runargs.arg
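
For reference, roughly the command this builder assembles for the pytorch entry in bench_ml.yaml above; the snapshotter, the resolved image reference, the temporary volume copy, and the container name are placeholders that depend on runtime settings, not values taken from the commit.

# Illustrative only; <...> parts are placeholders.
expected = (
    "nerdctl --snapshotter nydus create --net=host "
    "--env NVIDIA_VISIBLE_DEVICES=all "
    "--volume /tmp/<copy>/pytorch:/pytorch "
    "--runtime nvidia-container-runtime "
    "--shm-size 8gb "
    "-w /pytorch "
    "--name=<container> <image_ref> "
    "./test.sh"
)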

def create_cmd_arg_wait_cmd(self, image_ref, container_id, runargs):
@@ -605,9 +632,15 @@ def create_cmd_arg_wait_cmd(self, image_ref, container_id, runargs):
a = os.path.join(os.path.dirname(os.path.abspath(__file__)), a)
a = tmp_copy(a)
cmd += f"--volume {a}:{b} "
cmd += f"--name={container_id} {image_ref}"
cmd += f"--name={container_id} {image_ref} "
if len(runargs.runtime) > 0:
cmd += f"--runtime {runargs.runtime} "
if len(runargs.shmSize) > 0:
cmd += f"--shm-size {runargs.shmSize} "
if len(runargs.workDir) > 0:
cmd += f"-w {runargs.workDir} "
if len(runargs.arg) > 0:
cmd += f" -- {runargs.arg} "
cmd += f"{runargs.arg} "

return cmd

@@ -617,9 +650,15 @@ def create_cmd_stdin_cmd(self, image_ref, container_id, runargs):
a = os.path.join(os.path.dirname(os.path.abspath(__file__)), a)
a = tmp_copy(a)
cmd += f"--volume {a}:{b} "
cmd += f"--name={container_id} {image_ref}"
cmd += f"--name={container_id} {image_ref} "
if len(runargs.runtime) > 0:
cmd += f"--runtime {runargs.runtime} "
if len(runargs.shmSize) > 0:
cmd += f"--shm-size {runargs.shmSize} "
if len(runargs.workDir) > 0:
cmd += f"-w {runargs.workDir} "
if runargs.stdin_sh:
cmd += f" -- {runargs.stdin_sh}" # e.g., sh -c
cmd += f"-- {runargs.stdin_sh}" # e.g., sh -c
return cmd

def create_cmd_url_wait_cmd(self, image_ref, container_id, runargs):
@@ -631,9 +670,15 @@ def create_cmd_url_wait_cmd(self, image_ref, container_id, runargs):
if len(runargs.env) > 0:
env = " ".join([f"--env {k}={v}" for k, v in runargs.env.items()])
cmd += f" {env} "
cmd += f"--name={container_id} {image_ref}"
if len(runargs.runtime) > 0:
cmd += f"--runtime {runargs.runtime} "
if len(runargs.shmSize) > 0:
cmd += f"--shm-size {runargs.shmSize} "
if len(runargs.workDir) > 0:
cmd += f"-w {runargs.workDir} "
cmd += f"--name={container_id} {image_ref} "
if len(runargs.arg) > 0:
cmd += f" -- {runargs.arg} "
cmd += f"{runargs.arg} "
return cmd

def task_start_cmd(self, container_id, iteration: bool):
11 changes: 11 additions & 0 deletions misc/ml/chat-bench/Dockerfile
@@ -0,0 +1,11 @@
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel

ADD applications/Chat /application/chat
ADD transformers /application/transformers
WORKDIR /application

RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
RUN cd /application/chat && pip install .
RUN cd /application/transformers && pip install .

ENTRYPOINT [ "/application/chat/benchmarks/benchmark_gpt_dummy.sh" ]
9 changes: 9 additions & 0 deletions misc/ml/chat-bench/build.sh
@@ -0,0 +1,9 @@
#!/bin/bash

DIR=$(dirname "$0")

# clone ColossalAI
git clone https://github.com/hpcaitech/ColossalAI.git

# build chat-bench
sudo docker build -t ml_platform/chat-bench:2.0_cu117 ${DIR}
5 changes: 5 additions & 0 deletions ml_image_list.txt
@@ -0,0 +1,5 @@
tritonserver:23.02-py3
pytorch:1.11_cu115
pytorch:1.11_cu113
tensorflow:2.4
chat-bench:2.0_cu117
44 changes: 44 additions & 0 deletions ml_prepare.sh
@@ -0,0 +1,44 @@
#!/bin/bash

function download_pytorch_benchmark() {
target=$(pwd)/misc/mount/pytorch
if [[ -d ${target} ]];then
echo "${target} already exists"
return
fi
git clone https://github.com/JunhongXu/pytorch-benchmark-volta.git $(pwd)/misc/mount/pytorch-benchmark-volta
rm -rf $(pwd)/misc/mount/pytorch-benchmark-volta/.git
mv $(pwd)/misc/mount/pytorch-benchmark-volta ${target}
}

function download_tf_benchmark() {
target=$(pwd)/misc/mount/tf_cnn_benchmarks
if [[ -d ${target} ]];then
echo "${target} already exists"
return
fi
git clone https://github.com/tensorflow/benchmarks.git $(pwd)/misc/mount/benchmarks
mv $(pwd)/misc/mount/benchmarks/scripts/tf_cnn_benchmarks ${target}
rm -rf $(pwd)/misc/mount/benchmarks
}

function download_model_repository() {
target=$(pwd)/misc/mount/model_repository
if [[ -d ${target} ]];then
echo "${target} already exists"
return
fi
mkdir -p ${target}/inception_graphdef/1
wget -O /tmp/inception_v3_2016_08_28_frozen.pb.tar.gz \
https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz
(cd /tmp && tar xzf inception_v3_2016_08_28_frozen.pb.tar.gz)
mv /tmp/inception_v3_2016_08_28_frozen.pb ${target}/inception_graphdef/1/model.graphdef

mkdir -p ${target}/densenet_onnx/1
wget -O ${target}/densenet_onnx/1/model.onnx \
https://contentmamluswest001.blob.core.windows.net/content/14b2744cf8d6418c87ffddc3f3127242/9502630827244d60a1214f250e3bbca7/08aed7327d694b8dbaee2c97b8d0fcba/densenet121-1.2.onnx
}

download_pytorch_benchmark
download_tf_benchmark
download_model_repository
5 changes: 2 additions & 3 deletions run.sh
@@ -129,15 +129,14 @@ function run() {
sudo nerdctl ps -a | awk 'NR>1 {print $1}' | xargs sudo nerdctl rm >/dev/null 2>&1
sudo nerdctl container prune -f
sudo nerdctl image prune -f --all
sudo systemctl restart nydus-snapshotter
sleep 1

echo "[INFO] Run hello bench in ${image} ..."
sudo nerdctl --snapshotter overlayfs rmi -f ${TARGET_REGISTRY}/${image} >/dev/null 2>&1
result=$(sudo ./hello.py --bench-config=${BENCH_CONFIG} --engine nerdctl --snapshotter overlayfs --op run \
--registry=${TARGET_REGISTRY} \
--images ${image} |
grep "repo")
grep "repo" | grep "bench" | grep "timestamp")
echo ${result}
echo ${result} >>${RESULT_DIR}/${RESULT_FILE}.${CURRENT_ROUND}
echo "[INFO] Remove image ${TARGET_REGISTRY}/${image} ..."
@@ -148,7 +147,7 @@ function run() {
result=$(sudo ./hello.py --bench-config=${BENCH_CONFIG} --engine nerdctl --snapshotter nydus --op run \
--registry=${TARGET_REGISTRY} \
--images ${name}:${tag}-nydusv6 |
grep "repo")
grep "repo" | grep "bench" | grep "timestamp")
echo ${result}
echo ${result} >>${RESULT_DIR}/${RESULT_FILE}.${CURRENT_ROUND}
echo "[INFO] Remove image ${TARGET_REGISTRY}/${name}:${tag}-nydusv6 ..."
