add bench example for machine learning images
Signed-off-by: Bin Tang <[email protected]>
sctb512 committed Apr 3, 2023
1 parent 2a2e380 commit 677ae05
Showing 7 changed files with 187 additions and 17 deletions.
57 changes: 57 additions & 0 deletions bench_ml.yaml
@@ -0,0 +1,57 @@
CMD_ARG:
- bench_args:
    envs:
    - key: NVIDIA_VISIBLE_DEVICES
      value: 'all'
    arg: ./test.sh
    stdin_sh: sh
    mount:
    - container_path: /pytorch
      host_path: misc/mount/pytorch
    runtime: nvidia-container-runtime
    shm_size: 8gb
    work_dir: /pytorch
  category: machine-learning
  image: pytorch
  repo: ml_platform
- bench_args:
    envs:
    - key: NVIDIA_VISIBLE_DEVICES
      value: 'all'
    arg: python tf_cnn_benchmarks.py --batch_size=32 --model=resnet152_v2 --variable_update=parameter_server
    stdin_sh: sh
    mount:
    - container_path: /tf_cnn_benchmarks
      host_path: misc/mount/tf_cnn_benchmarks
    runtime: nvidia-container-runtime
    shm_size: 8gb
    work_dir: /tf_cnn_benchmarks
  category: machine-learning
  image: tensorflow
  repo: ml_platform
- bench_args:
    envs:
    - key: NVIDIA_VISIBLE_DEVICES
      value: 'all'
    runtime: nvidia-container-runtime
    shm_size: 8gb
  category: machine-learning
  image: chat-bench
  repo: ml_platform

CMD_URL_WAIT:
- bench_args:
    envs:
    - key: NVIDIA_VISIBLE_DEVICES
      value: 'all'
    arg: tritonserver --model-repository=/models
    stdin_sh: sh
    mount:
    - container_path: /models
      host_path: misc/mount/model_repository
    runtime: nvidia-container-runtime
    shm_size: 8gb
    wait_url: http://127.0.0.1:8000/v2/health/ready
  category: machine-learning
  image: tritonserver
  repo: ml_platform
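
For context, a minimal sketch of reading this config; it is not part of the commit and assumes only PyYAML plus the keys visible above (CMD_ARG, CMD_URL_WAIT and the per-entry bench_args, image, repo, category).

# Sketch only: list the benchmark entries defined in bench_ml.yaml.
import yaml  # assumes PyYAML is installed

with open("bench_ml.yaml") as f:
    config = yaml.safe_load(f)

for mode in ("CMD_ARG", "CMD_URL_WAIT"):
    for entry in config.get(mode, []):
        print(mode, f'{entry["repo"]}/{entry["image"]}', entry["category"])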
73 changes: 59 additions & 14 deletions hello.py
@@ -132,7 +132,7 @@ def timer(cmd):

class RunArgs:
def __init__(
self, env={}, arg="", stdin="", stdin_sh="sh", waitline="", mount=[], waitURL=""
self, env={}, arg="", stdin="", stdin_sh="sh", waitline="", mount=[], waitURL="", runtime="", shmSize="", workDir=""
):
self.env = env
self.arg = arg
@@ -141,6 +141,9 @@ def __init__(
self.waitline = waitline
self.mount = mount
self.waitURL = waitURL
self.runtime = runtime
self.shmSize = shmSize
self.workDir = workDir


class Docker:
@@ -281,14 +284,17 @@ def load_bench_config(self):
args = line["bench_args"]
print(f"CMD_ARG_WAIT image: {name}, args: {args}")
cmd_arg_wait_runner[name] = RunArgs(
env=args["envs"] if "envs" in args else {},
env=dict([(item["key"], item["value"]) for item in args["envs"]]) if "envs" in args else {},
waitline=args["wait_line"] if "wait_line" in args else "",
mount=[(m["host_path"], m["container_path"]) for m in args["mount"]]
if "mount" in args
else [],
arg=args["arg"] if "arg" in args else "",
stdin=args["stdin"] if "stdin" in args else "",
stdin_sh=args["stdin_sh"] if "stdin_sh" in args else "",
runtime=args["runtime"] if "runtime" in args else "",
shmSize=args["shm_size"] if "shm_size" in args else "",
workDir=args["work_dir"] if "work_dir" in args else "",
)
cmd_arg_wait[name] = Bench(name, line["category"])

@@ -300,13 +306,16 @@ def load_bench_config(self):
args = line["bench_args"]
print(f"CMD_STDIN image: {name}, args: {args}")
cmd_stdin_runner[name] = RunArgs(
env=args["envs"] if "envs" in args else {},
env=dict([(item["key"], item["value"]) for item in args["envs"]]) if "envs" in args else {},
mount=[(m["host_path"], m["container_path"]) for m in args["mount"]]
if "mount" in args
else [],
arg=args["arg"] if "arg" in args else "",
stdin=args["stdin"] if "stdin" in args else "",
stdin_sh=args["stdin_sh"] if "stdin_sh" in args else "",
runtime=args["runtime"] if "runtime" in args else "",
shmSize=args["shm_size"] if "shm_size" in args else "",
workDir=args["work_dir"] if "work_dir" in args else "",
)
cmd_stdin[name] = Bench(name, line["category"])

@@ -318,13 +327,16 @@ def load_bench_config(self):
args = line["bench_args"]
print(f"CMD_ARG image: {name}, args: {args}")
cmd_arg_runner[name] = RunArgs(
env=args["envs"] if "envs" in args else {},
env=dict([(item["key"], item["value"]) for item in args["envs"]]) if "envs" in args else {},
mount=[(m["host_path"], m["container_path"]) for m in args["mount"]]
if "mount" in args
else [],
arg=args["arg"] if "arg" in args else "",
stdin=args["stdin"] if "stdin" in args else "",
stdin_sh=args["stdin_sh"] if "stdin_sh" in args else "",
runtime=args["runtime"] if "runtime" in args else "",
shmSize=args["shm_size"] if "shm_size" in args else "",
workDir=args["work_dir"] if "work_dir" in args else "",
)
cmd_arg[name] = Bench(name, line["category"])

@@ -336,14 +348,17 @@ def load_bench_config(self):
args = line["bench_args"]
print(f"CMD_URL_WAIT image: {name}, args: {args}")
cmd_url_wait_runner[name] = RunArgs(
env=args["envs"] if "envs" in args else {},
env=dict([(item["key"], item["value"]) for item in args["envs"]]) if "envs" in args else {},
waitURL=args["wait_url"] if "wait_url" in args else "",
mount=[(m["host_path"], m["container_path"]) for m in args["mount"]]
if "mount" in args
else [],
arg=args["arg"] if "arg" in args else "",
stdin=args["stdin"] if "stdin" in args else "",
stdin_sh=args["stdin_sh"] if "stdin_sh" in args else "",
runtime=args["runtime"] if "runtime" in args else "",
shmSize=args["shm_size"] if "shm_size" in args else "",
workDir=args["work_dir"] if "work_dir" in args else "",
)
cmd_url_wait[name] = Bench(name, line["category"])
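
Since the same loading pattern is repeated for each command mode above, here is an illustrative restatement of the shared part as a standalone helper; the helper name to_runargs is hypothetical and the per-mode wait_line / wait_url handling is omitted.

# Sketch of the shared bench_args -> RunArgs mapping used by load_bench_config:
# envs is folded from a list of {key, value} items into a dict, mounts into
# (host_path, container_path) tuples, and the new runtime / shm_size / work_dir
# keys default to "" when absent.
def to_runargs(args: dict) -> RunArgs:
    return RunArgs(
        env={item["key"]: item["value"] for item in args.get("envs", [])},
        mount=[(m["host_path"], m["container_path"]) for m in args.get("mount", [])],
        arg=args.get("arg", ""),
        stdin=args.get("stdin", ""),
        stdin_sh=args.get("stdin_sh", ""),
        runtime=args.get("runtime", ""),
        shmSize=args.get("shm_size", ""),
        workDir=args.get("work_dir", ""),
    )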

@@ -391,8 +406,6 @@ def run_echo_hello(self, repo: str):
return pull_elapsed, create_elapsed, run_elapsed

def run_cmd_arg(self, repo, runargs):
assert len(runargs.mount) == 0

image_ref = self.image_ref(repo)
container_name = repo.replace(":", "-") + random_chars()

@@ -593,7 +606,21 @@ def create_echo_hello_cmd(self, image_ref, container_id):
return f"nerdctl --snapshotter {self.snapshotter} create --net=host --name={container_id} {image_ref} -- echo hello"

def create_cmd_arg_cmd(self, image_ref, container_id, runargs):
cmd = f"nerdctl --snapshotter {self.snapshotter} create --net=host --name={container_id} {image_ref} "
cmd = f"nerdctl --snapshotter {self.snapshotter} create --net=host "
if len(runargs.env) > 0:
env = " ".join(["--env %s=%s" % (k, v) for k, v in runargs.env.items()])
cmd += f" {env} "
for a, b in runargs.mount:
a = os.path.join(os.path.dirname(os.path.abspath(__file__)), a)
a = tmp_copy(a)
cmd += f"--volume {a}:{b} "
if len(runargs.runtime) > 0:
cmd += f"--runtime {runargs.runtime} "
if len(runargs.shmSize) > 0:
cmd += f"--shm-size {runargs.shmSize} "
if len(runargs.workDir) > 0:
cmd += f"-w {runargs.workDir} "
cmd += f"--name={container_id} {image_ref} "
return cmd + runargs.arg
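
For reference, roughly the command this builder assembles for the pytorch entry in bench_ml.yaml above; the snapshotter, the resolved image reference, the temporary volume copy, and the container name are placeholders that depend on runtime settings, not values taken from the commit.

# Illustrative only; <...> parts are placeholders.
expected = (
    "nerdctl --snapshotter nydus create --net=host "
    "--env NVIDIA_VISIBLE_DEVICES=all "
    "--volume /tmp/<copy>/pytorch:/pytorch "
    "--runtime nvidia-container-runtime "
    "--shm-size 8gb "
    "-w /pytorch "
    "--name=<container> <image_ref> "
    "./test.sh"
)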

def create_cmd_arg_wait_cmd(self, image_ref, container_id, runargs):
@@ -605,9 +632,15 @@ def create_cmd_arg_wait_cmd(self, image_ref, container_id, runargs):
a = os.path.join(os.path.dirname(os.path.abspath(__file__)), a)
a = tmp_copy(a)
cmd += f"--volume {a}:{b} "
cmd += f"--name={container_id} {image_ref}"
cmd += f"--name={container_id} {image_ref} "
if len(runargs.runtime) > 0:
cmd += f"--runtime {runargs.runtime} "
if len(runargs.shmSize) > 0:
cmd += f"--shm-size {runargs.shmSize} "
if len(runargs.workDir) > 0:
cmd += f"-w {runargs.workDir} "
if len(runargs.arg) > 0:
cmd += f" -- {runargs.arg} "
cmd += f"{runargs.arg} "

return cmd

@@ -617,9 +650,15 @@ def create_cmd_stdin_cmd(self, image_ref, container_id, runargs):
a = os.path.join(os.path.dirname(os.path.abspath(__file__)), a)
a = tmp_copy(a)
cmd += f"--volume {a}:{b} "
cmd += f"--name={container_id} {image_ref}"
cmd += f"--name={container_id} {image_ref} "
if len(runargs.runtime) > 0:
cmd += f"--runtime {runargs.runtime} "
if len(runargs.shmSize) > 0:
cmd += f"--shm-size {runargs.shmSize} "
if len(runargs.workDir) > 0:
cmd += f"-w {runargs.workDir} "
if runargs.stdin_sh:
cmd += f" -- {runargs.stdin_sh}" # e.g., sh -c
cmd += f"-- {runargs.stdin_sh}" # e.g., sh -c
return cmd

def create_cmd_url_wait_cmd(self, image_ref, container_id, runargs):
@@ -631,9 +670,15 @@ def create_cmd_url_wait_cmd(self, image_ref, container_id, runargs):
if len(runargs.env) > 0:
env = " ".join([f"--env {k}={v}" for k, v in runargs.env.items()])
cmd += f" {env} "
cmd += f"--name={container_id} {image_ref}"
if len(runargs.runtime) > 0:
cmd += f"--runtime {runargs.runtime} "
if len(runargs.shmSize) > 0:
cmd += f"--shm-size {runargs.shmSize} "
if len(runargs.workDir) > 0:
cmd += f"-w {runargs.workDir} "
cmd += f"--name={container_id} {image_ref} "
if len(runargs.arg) > 0:
cmd += f" -- {runargs.arg} "
cmd += f"{runargs.arg} "
return cmd

def task_start_cmd(self, container_id, iteration: bool):
11 changes: 11 additions & 0 deletions misc/ml/chat-bench/Dockerfile
@@ -0,0 +1,11 @@
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel

ADD applications/Chat /application/chat
ADD transformers /application/transformers
WORKDIR /application

RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
RUN cd /application/chat && pip install .
RUN cd /application/transformers && pip install .

ENTRYPOINT [ "/application/chat/benchmarks/benchmark_gpt_dummy.sh" ]
9 changes: 9 additions & 0 deletions misc/ml/chat-bench/build.sh
@@ -0,0 +1,9 @@
#!/bin/bash

DIR=$(dirname "$0")

# clone ColossalAI
git clone https://github.com/hpcaitech/ColossalAI.git

# build chat-bench
sudo docker build -t ml_platform/chat-bench:2.0_cu117 ${DIR}
5 changes: 5 additions & 0 deletions ml_image_list.txt
@@ -0,0 +1,5 @@
tritonserver:23.02-py3
pytorch:1.11_cu115
pytorch:1.11_cu113
tensorflow:2.4
chat-bench:2.0_cu117
44 changes: 44 additions & 0 deletions ml_prepare.sh
@@ -0,0 +1,44 @@
#!/bin/bash

function download_pytorch_benchmark() {
target=$(pwd)/misc/mount/pytorch
if [[ -d ${target} ]];then
echo "${target} already exists"
return
fi
git clone https://github.com/JunhongXu/pytorch-benchmark-volta.git $(pwd)/misc/mount/pytorch-benchmark-volta
rm -rf $(pwd)/misc/mount/pytorch-benchmark-volta/.git
mv $(pwd)/misc/mount/pytorch-benchmark-volta ${target}
}

function download_tf_benchmark() {
target=$(pwd)/misc/mount/tf_cnn_benchmarks
if [[ -d ${target} ]];then
echo "${target} already exists"
return
fi
git clone https://github.com/tensorflow/benchmarks.git $(pwd)/misc/mount/benchmarks
mv $(pwd)/misc/mount/benchmarks/scripts/tf_cnn_benchmarks ${target}
rm -rf $(pwd)/misc/mount/benchmarks
}

function download_model_repository() {
target=$(pwd)/misc/mount/model_repository
if [[ -d ${target} ]];then
echo "${target} already exists"
return
fi
mkdir -p ${target}/inception_graphdef/1
wget -O /tmp/inception_v3_2016_08_28_frozen.pb.tar.gz \
https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz
(cd /tmp && tar xzf inception_v3_2016_08_28_frozen.pb.tar.gz)
mv /tmp/inception_v3_2016_08_28_frozen.pb ${target}/inception_graphdef/1/model.graphdef

mkdir -p ${target}/densenet_onnx/1
wget -O ${target}/densenet_onnx/1/model.onnx \
https://contentmamluswest001.blob.core.windows.net/content/14b2744cf8d6418c87ffddc3f3127242/9502630827244d60a1214f250e3bbca7/08aed7327d694b8dbaee2c97b8d0fcba/densenet121-1.2.onnx
}

download_pytorch_benchmark
download_tf_benchmark
download_model_repository
5 changes: 2 additions & 3 deletions run.sh
@@ -129,15 +129,14 @@ function run() {
sudo nerdctl ps -a | awk 'NR>1 {print $1}' | xargs sudo nerdctl rm >/dev/null 2>&1
sudo nerdctl container prune -f
sudo nerdctl image prune -f --all
sudo systemctl restart nydus-snapshotter
sleep 1

echo "[INFO] Run hello bench in ${image} ..."
sudo nerdctl --snapshotter overlayfs rmi -f ${TARGET_REGISTRY}/${image} >/dev/null 2>&1
result=$(sudo ./hello.py --bench-config=${BENCH_CONFIG} --engine nerdctl --snapshotter overlayfs --op run \
--registry=${TARGET_REGISTRY} \
--images ${image} |
grep "repo")
grep "repo" | grep "bench" | grep "timestamp")
echo ${result}
echo ${result} >>${RESULT_DIR}/${RESULT_FILE}.${CURRENT_ROUND}
echo "[INFO] Remove image ${TARGET_REGISTRY}/${image} ..."
@@ -148,7 +147,7 @@ function run() {
result=$(sudo ./hello.py --bench-config=${BENCH_CONFIG} --engine nerdctl --snapshotter nydus --op run \
--registry=${TARGET_REGISTRY} \
--images ${name}:${tag}-nydusv6 |
grep "repo")
grep "repo" | grep "bench" | grep "timestamp")
echo ${result}
echo ${result} >>${RESULT_DIR}/${RESULT_FILE}.${CURRENT_ROUND}
echo "[INFO] Remove image ${TARGET_REGISTRY}/${name}:${tag}-nydusv6 ..."
