Merge branch 'triton-inference-server:main' into main
CGranger-sorenson authored Apr 30, 2024
2 parents a31e5a2 + b7a0690 commit 1e5f32f
Showing 9 changed files with 286 additions and 21 deletions.
48 changes: 48 additions & 0 deletions .devcontainer/Dockerfile
@@ -0,0 +1,48 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

FROM nvcr.io/nvidia/tritonserver:24.03-py3

ARG USERNAME=triton-server

RUN apt-get update \
    && apt-get install -y sudo

RUN pip3 install transformers torch

# Create the user
RUN apt-get update \
    && apt-get install -y sudo \
    && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
    && chmod 0440 /etc/sudoers.d/$USERNAME

RUN pip3 install pre-commit ipdb

RUN mkhomedir_helper triton-server

RUN apt-get install -y cmake rapidjson-dev

USER ${USERNAME}
26 changes: 26 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,26 @@
{
    "name": "Python Backend",

    "build": {
        "dockerfile": "Dockerfile"
    },
    "customizations": {
        "vscode": {
            "extensions": [
                "ms-python.vscode-pylance",
                "ms-python.python",
                "ms-vscode.cpptools-extension-pack",
                "ms-vscode.cmake-tools",
                "github.vscode-pull-request-github"
            ]
        }
    },
    "postCreateCommand": "sudo chown -R triton-server:triton-server ~/.cache",

    "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined", "--gpus=all", "--shm-size=2g", "--ulimit", "stack=67108864" ],
    "mounts": [
        "source=${localEnv:HOME}/.ssh,target=/home/triton-server/.ssh,type=bind,consistency=cached",
        "source=${localEnv:HOME}/.cache/huggingface,target=/home/triton-server/.cache/huggingface,type=bind,consistency=cached"
    ],
    "remoteUser": "triton-server"
}
1 change: 0 additions & 1 deletion .gitignore
@@ -1,5 +1,4 @@
/build
-/.vscode
*.so
builddir

85 changes: 85 additions & 0 deletions .vscode/tasks.json
@@ -0,0 +1,85 @@
{
    "version": "2.0.0",
    "tasks": [
        {
            "label": "Configure",
            "type": "shell",
            "command": "cmake",
            "args": [
                "-DCMAKE_INSTALL_PREFIX:STRING=/opt/tritonserver/",
                "-DTRITON_COMMON_REPO_TAG:STRING=main",
                "-DTRITON_BACKEND_REPO_TAG:STRING=main",
                "-DTRITON_CORE_REPO_TAG:STRING=main",
                "-DTRITON_ENABLE_GPU:STRING=ON",
                "-DTRITON_ENABLE_NVTX:STRING=ON",
                "-DCMAKE_INSTALL_PREFIX:STRING=${workspaceFolder}/build/install",
                "-DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE",
                "-DCMAKE_BUILD_TYPE:STRING=Debug",
                "-DCMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc",
                "-DCMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++",
                "-S${workspaceFolder}",
                "-B${workspaceFolder}/build",
                "-G",
                "Unix Makefiles"
            ],
            "problemMatcher": []
        },
        {
            "label": "Build",
            "type": "shell",
            "command": "cmake",
            "args": [
                "--build",
                "${workspaceFolder}/build",
                "--config",
                "Debug",
                "--target",
                "all",
                "-j",
                "18",
                "--"
            ]
        },
        {
            "label": "Install",
            "type": "shell",
            "command": "cmake",
            "args": [
                "--build",
                "${workspaceFolder}/build",
                "--config",
                "Debug",
                "--target",
                "install",
                "-j",
                "18",
                "--"
            ]
        },
        {
            "label": "Move",
            "type": "shell",
            "command": "sudo",
            "args": [
                "cp",
                "-r",
                "${workspaceFolder}/build/install/backends/python/*",
                "/opt/tritonserver/backends/python"
            ]
        },
        {
            "label": "Build Python Backend",
            "dependsOrder": "sequence",
            "dependsOn": [
                "Configure",
                "Build",
                "Install",
                "Move"
            ],
            "group": {
                "kind": "build",
                "isDefault": true
            }
        }
    ]
}
33 changes: 30 additions & 3 deletions README.md
@@ -49,7 +49,7 @@ any C++ code.
    - [Request Cancellation Handling](#request-cancellation-handling)
    - [Decoupled mode](#decoupled-mode)
      - [Use Cases](#use-cases)
-     - [Known Issues](#known-issues)
+     - [Async Execute](#async-execute)
      - [Request Rescheduling](#request-rescheduling)
  - [`finalize`](#finalize)
- [Model Config File](#model-config-file)
@@ -90,6 +90,7 @@ any C++ code.
- [Custom Metrics](#custom-metrics-1)
- [Running with Inferentia](#running-with-inferentia)
- [Logging](#logging)
+- [Development with VSCode](#development-with-vscode)
- [Reporting problems, asking questions](#reporting-problems-asking-questions)

## Quick Start
@@ -620,9 +621,24 @@ full power of what can be achieved from decoupled API. Read
[Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md)
for more details on how to host a decoupled model.

-##### Known Issues
+##### Async Execute

-* Currently, decoupled Python models can not make async infer requests.
+Starting from 24.04, `async def execute(self, requests):` is supported for
+decoupled Python models. Its coroutine is executed by an AsyncIO event loop
+shared with the requests running on the same model instance, so the next
+request for the model instance can start executing while the current request
+is waiting.
+
+This is useful for minimizing the number of model instances for models that
+spend most of their time waiting, since AsyncIO lets such requests execute
+concurrently. To take full advantage of this concurrency, the async execute
+function must not block the event loop from making progress while it is
+waiting, e.g. while downloading over the network; see the sketch after the
+notes below.
+
+Notes:
+* The model should not modify the running event loop, as this might cause
+  unexpected issues.
+* The server/backend does not control how many requests are added to the
+  event loop by a model instance.
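
For illustration, a decoupled model using async execute might look like the
following minimal sketch. The tensor name `OUTPUT0`, the dummy numpy payload,
and the `asyncio.sleep` call standing in for real async I/O are assumptions
for this example, not part of the commit:

```python
import asyncio

import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    async def execute(self, requests):
        for request in requests:
            # Awaiting yields the shared event loop, so other requests on
            # this model instance can make progress while this one waits.
            await asyncio.sleep(0.1)  # stand-in for real async I/O
            out = pb_utils.Tensor("OUTPUT0", np.array([1.0], dtype=np.float32))
            request.get_response_sender().send(
                pb_utils.InferenceResponse(output_tensors=[out]),
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
            )
```

A blocking call such as `time.sleep` in place of the `await` would stall every
request sharing the loop, defeating the purpose of the async mode.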

#### Request Rescheduling

@@ -1810,6 +1826,17 @@ def initialize(self, args):
# Should print {'custom_key': {'string_value': 'custom_value'}}
```

+# Development with VSCode
+
+The repository includes a `.devcontainer` folder with a `Dockerfile` and a
+`devcontainer.json` file to help you develop the Python backend using
+[Visual Studio Code](https://code.visualstudio.com/docs/devcontainers/containers).
+
+To build the backend, run the "Build Python Backend" task from the
+[VSCode tasks](https://code.visualstudio.com/docs/editor/tasks). This builds
+the Python backend and installs the artifacts in
+`/opt/tritonserver/backends/python`.


# Reporting problems, asking questions
104 changes: 88 additions & 16 deletions src/pb_stub.cc
@@ -104,6 +104,32 @@ PyDefaultArgumentToMutableType(const py::object& argument)
      std::string(py::str(argument.get_type())));
}

+void
+AsyncEventFutureDoneCallback(const py::object& py_future)
+{
+  // TODO: Why does using `py_future.result()` hang on exit when the future
+  // holds an error?
+  try {
+    py::object exception = py_future.attr("exception")();
+    if (!py::isinstance<py::none>(exception)) {
+      std::string err_msg = "";
+      py::object traceback = py::module_::import("traceback")
+                                 .attr("TracebackException")
+                                 .attr("from_exception")(exception)
+                                 .attr("format")();
+      for (py::handle line : traceback) {
+        err_msg += py::str(line);
+      }
+      LOG_ERROR << err_msg;
+    }
+  }
+  catch (const PythonBackendException& pb_exception) {
+    LOG_ERROR << pb_exception.what();
+  }
+  catch (const py::error_already_set& error) {
+    LOG_ERROR << error.what();
+  }
+}

void
Stub::Instantiate(
    int64_t shm_growth_size, int64_t shm_default_size,
@@ -536,6 +562,8 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle)
      c_python_backend_utils.attr("InferenceResponse"));
  c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get());

+  async_event_loop_ = py::none();
+
  py::object TritonPythonModel = sys.attr("TritonPythonModel");
  deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor");
  serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor");
@@ -693,11 +721,18 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr)

    py::object execute_return =
        model_instance_.attr("execute")(py_request_list);
-    if (!py::isinstance<py::none>(execute_return)) {
-      throw PythonBackendException(
-          "Python model '" + name_ +
-          "' is using the decoupled mode and the execute function must "
-          "return None.");
+    bool is_coroutine = py::module::import("asyncio")
+                            .attr("iscoroutine")(execute_return)
+                            .cast<bool>();
+    if (is_coroutine) {
+      RunCoroutine(execute_return);
+    } else {
+      if (!py::isinstance<py::none>(execute_return)) {
+        throw PythonBackendException(
+            "Python model '" + name_ +
+            "' is using the decoupled mode and the execute function must "
+            "return None.");
+      }
    }
  }
@@ -880,6 +915,35 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr)
  }
}

+py::object
+Stub::GetAsyncEventLoop()
+{
+  if (py::isinstance<py::none>(async_event_loop_)) {
+    // Create the event loop if it has not been created already.
+    py::module asyncio = py::module_::import("asyncio");
+    async_event_loop_ = asyncio.attr("new_event_loop")();
+    asyncio.attr("set_event_loop")(async_event_loop_);
+    py::object py_thread =
+        py::module_::import("threading")
+            .attr("Thread")(
+                "target"_a = async_event_loop_.attr("run_forever"),
+                "daemon"_a = true);
+    py_thread.attr("start")();
+  }
+  return async_event_loop_;
+}
+
+void
+Stub::RunCoroutine(py::object coroutine)
+{
+  py::object loop = GetAsyncEventLoop();
+  py::object py_future = py::module_::import("asyncio").attr(
+      "run_coroutine_threadsafe")(coroutine, loop);
+  py_future.attr("add_done_callback")(
+      py::module_::import("c_python_backend_utils")
+          .attr("async_event_future_done_callback"));
+}

void
Stub::UpdateHealth()
{
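
In Python terms, the loop management in `GetAsyncEventLoop` and `RunCoroutine`
above is roughly equivalent to the following sketch; `model_execute` and
`log_future_exception` are hypothetical stand-ins for the model's coroutine
and for `AsyncEventFutureDoneCallback`:

```python
import asyncio
import threading
import traceback

# One event loop per model instance, served by a daemon thread.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
threading.Thread(target=loop.run_forever, daemon=True).start()

async def model_execute():
    await asyncio.sleep(0.1)  # stand-in for the model's async execute

def log_future_exception(future):
    # Mirrors AsyncEventFutureDoneCallback: format and log any exception
    # raised by the coroutine instead of calling future.result().
    exc = future.exception()
    if exc is not None:
        print("".join(traceback.TracebackException.from_exception(exc).format()))

# Schedule the coroutine on the loop's thread and attach the callback.
future = asyncio.run_coroutine_threadsafe(model_execute(), loop)
future.add_done_callback(log_future_exception)
future.result()  # only for this demo; the stub does not block on the future
```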
@@ -891,13 +955,19 @@ void
Stub::Finalize()
{
  finalizing_ = true;
-  // Call finalize if exists.
-  if (initialized_ && py::hasattr(model_instance_, "finalize")) {
-    try {
-      model_instance_.attr("finalize")();
-    }
-    catch (const py::error_already_set& e) {
-      LOG_INFO << e.what();
-    }
+  if (initialized_) {
+    // Stop the async event loop if it was created.
+    if (!py::isinstance<py::none>(async_event_loop_)) {
+      async_event_loop_.attr("stop")();
+    }
+    // Call finalize if it exists.
+    if (py::hasattr(model_instance_, "finalize")) {
+      try {
+        model_instance_.attr("finalize")();
+      }
+      catch (const py::error_already_set& e) {
+        LOG_INFO << e.what();
+      }
+    }
  }
#ifdef TRITON_ENABLE_GPU
@@ -953,6 +1023,7 @@ Stub::~Stub()

  {
    py::gil_scoped_acquire acquire;
+    async_event_loop_ = py::none();
    model_instance_ = py::none();
  }
  stub_instance_.reset();
@@ -1739,11 +1810,6 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
          [](std::shared_ptr<InferRequest>& infer_request,
             const bool decoupled) {
            std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
-           if (stub->IsDecoupled()) {
-             throw PythonBackendException(
-                 "Async BLS request execution is not support in the decoupled "
-                 "API.");
-           }
            py::object loop =
                py::module_::import("asyncio").attr("get_running_loop")();
            py::cpp_function callback = [&stub, infer_request, decoupled]() {
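
With the guard above removed, a decoupled Python model can now issue async BLS
requests. A minimal sketch, assuming a hypothetical composing model
`bls_target` with input `INPUT0` and output `OUTPUT0`:

```python
import numpy as np
import triton_python_backend_utils as pb_utils


async def call_bls(input_array):
    # Build a BLS request against the (hypothetical) composing model.
    infer_request = pb_utils.InferenceRequest(
        model_name="bls_target",
        requested_output_names=["OUTPUT0"],
        inputs=[pb_utils.Tensor("INPUT0", input_array.astype(np.float32))],
    )
    # Awaiting keeps the shared event loop free for other requests.
    infer_response = await infer_request.async_exec()
    return pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0")
```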
@@ -1870,6 +1936,12 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
      "is_model_ready", &IsModelReady, py::arg("model_name").none(false),
      py::arg("model_version").none(false) = "");

+  // This function is not part of the public API for the Python backend. It is
+  // only used for internal callbacks.
+  module.def(
+      "async_event_future_done_callback", &AsyncEventFutureDoneCallback,
+      py::arg("py_future").none(false));
+
  // This class is not part of the public API for Python backend. This is only
  // used for internal testing purposes.
  py::class_<SharedMemoryManager>(module, "SharedMemory")