Merge branch 'triton-inference-server:main' into main
CGranger-sorenson authored Apr 30, 2024
2 parents a31e5a2 + b7a0690 commit 1e5f32f
Showing 9 changed files with 286 additions and 21 deletions.
48 changes: 48 additions & 0 deletions .devcontainer/Dockerfile
@@ -0,0 +1,48 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

FROM nvcr.io/nvidia/tritonserver:24.03-py3

ARG USERNAME=triton-server

RUN apt-get update \
    && apt-get install -y sudo

RUN pip3 install transformers torch

# Create the user
RUN apt-get update \
    && apt-get install -y sudo \
    && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
    && chmod 0440 /etc/sudoers.d/$USERNAME

RUN pip3 install pre-commit ipdb

RUN mkhomedir_helper triton-server

RUN apt-get install -y cmake rapidjson-dev

USER ${USERNAME}
26 changes: 26 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,26 @@
{
    "name": "Python Backend",

    "build": {
        "dockerfile": "Dockerfile"
    },
    "customizations": {
        "vscode": {
            "extensions": [
                "ms-python.vscode-pylance",
                "ms-python.python",
                "ms-vscode.cpptools-extension-pack",
                "ms-vscode.cmake-tools",
                "github.vscode-pull-request-github"
            ]
        }
    },
    "postCreateCommand": "sudo chown -R triton-server:triton-server ~/.cache",

    "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined", "--gpus=all", "--shm-size=2g", "--ulimit", "stack=67108864" ],
    "mounts": [
        "source=${localEnv:HOME}/.ssh,target=/home/triton-server/.ssh,type=bind,consistency=cached",
        "source=${localEnv:HOME}/.cache/huggingface,target=/home/triton-server/.cache/huggingface,type=bind,consistency=cached"
    ],
    "remoteUser": "triton-server"
}
1 change: 0 additions & 1 deletion .gitignore
@@ -1,5 +1,4 @@
/build
-/.vscode
*.so
builddir

85 changes: 85 additions & 0 deletions .vscode/tasks.json
@@ -0,0 +1,85 @@
{
    "version": "2.0.0",
    "tasks": [
        {
            "label": "Configure",
            "type": "shell",
            "command": "cmake",
            "args": [
                "-DCMAKE_INSTALL_PREFIX:STRING=/opt/tritonserver/",
                "-DTRITON_COMMON_REPO_TAG:STRING=main",
                "-DTRITON_BACKEND_REPO_TAG:STRING=main",
                "-DTRITON_CORE_REPO_TAG:STRING=main",
                "-DTRITON_ENABLE_GPU:STRING=ON",
                "-DTRITON_ENABLE_NVTX:STRING=ON",
                "-DCMAKE_INSTALL_PREFIX:STRING=${workspaceFolder}/build/install",
                "-DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE",
                "-DCMAKE_BUILD_TYPE:STRING=Debug",
                "-DCMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc",
                "-DCMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++",
                "-S${workspaceFolder}",
                "-B${workspaceFolder}/build",
                "-G",
                "Unix Makefiles"
            ],
            "problemMatcher": []
        },
        {
            "label": "Build",
            "type": "shell",
            "command": "cmake",
            "args": [
                "--build",
                "${workspaceFolder}/build",
                "--config",
                "Debug",
                "--target",
                "all",
                "-j",
                "18",
                "--"
            ]
        },
        {
            "label": "Install",
            "type": "shell",
            "command": "cmake",
            "args": [
                "--build",
                "${workspaceFolder}/build",
                "--config",
                "Debug",
                "--target",
                "install",
                "-j",
                "18",
                "--"
            ]
        },
        {
            "label": "Move",
            "type": "shell",
            "command": "sudo",
            "args": [
                "cp",
                "-r",
                "${workspaceFolder}/build/install/backends/python/*",
                "/opt/tritonserver/backends/python"
            ]
        },
        {
            "label": "Build Python Backend",
            "dependsOrder": "sequence",
            "dependsOn": [
                "Configure",
                "Build",
                "Install",
                "Move"
            ],
            "group": {
                "kind": "build",
                "isDefault": true
            }
        }
    ]
}
33 changes: 30 additions & 3 deletions README.md
@@ -49,7 +49,7 @@ any C++ code.
    - [Request Cancellation Handling](#request-cancellation-handling)
    - [Decoupled mode](#decoupled-mode)
      - [Use Cases](#use-cases)
-     - [Known Issues](#known-issues)
+     - [Async Execute](#async-execute)
      - [Request Rescheduling](#request-rescheduling)
  - [`finalize`](#finalize)
- [Model Config File](#model-config-file)
@@ -90,6 +90,7 @@ any C++ code.
- [Custom Metrics](#custom-metrics-1)
- [Running with Inferentia](#running-with-inferentia)
- [Logging](#logging)
+- [Development with VSCode](#development-with-vscode)
- [Reporting problems, asking questions](#reporting-problems-asking-questions)

## Quick Start
@@ -620,9 +621,24 @@ full power of what can be achieved from decoupled API. Read
[Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md)
for more details on how to host a decoupled model.

-##### Known Issues
+##### Async Execute

-* Currently, decoupled Python models can not make async infer requests.
+Starting from 24.04, `async def execute(self, requests):` is supported for
+decoupled Python models. Its coroutine is executed by an AsyncIO event loop
+shared with the requests running on the same model instance, so the next
+request for the model instance can start executing while the current request
+is waiting.
+
+This is useful for minimizing the number of model instances for models that
+spend most of their time waiting, since AsyncIO lets such requests execute
+concurrently. To take full advantage of this concurrency, the async execute
+function must not block the event loop from making progress while it is
+waiting, e.g. while downloading over the network; see the sketch after the
+notes below.
+
+Notes:
+* The model should not modify the running event loop, as this might cause
+  unexpected issues.
+* The server/backend does not control how many requests are added to the
+  event loop by a model instance.
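
For illustration, a decoupled model using async execute might look like the
following minimal sketch. The tensor name `OUTPUT0`, the dummy numpy payload,
and the `asyncio.sleep` call standing in for real async I/O are assumptions
for this example, not part of the commit:

```python
import asyncio

import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    async def execute(self, requests):
        for request in requests:
            # Awaiting yields the shared event loop, so other requests on
            # this model instance can make progress while this one waits.
            await asyncio.sleep(0.1)  # stand-in for real async I/O
            out = pb_utils.Tensor("OUTPUT0", np.array([1.0], dtype=np.float32))
            request.get_response_sender().send(
                pb_utils.InferenceResponse(output_tensors=[out]),
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
            )
```

A blocking call such as `time.sleep` in place of the `await` would stall every
request sharing the loop, defeating the purpose of the async mode.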

#### Request Rescheduling

@@ -1810,6 +1826,17 @@ def initialize(self, args):
# Should print {'custom_key': {'string_value': 'custom_value'}}
```

+# Development with VSCode
+
+The repository includes a `.devcontainer` folder with a `Dockerfile` and a
+`devcontainer.json` file to help you develop the Python backend using
+[Visual Studio Code](https://code.visualstudio.com/docs/devcontainers/containers).
+
+To build the backend, run the "Build Python Backend" task from the
+[VSCode tasks](https://code.visualstudio.com/docs/editor/tasks). This builds
+the Python backend and installs the artifacts in
+`/opt/tritonserver/backends/python`.


# Reporting problems, asking questions
104 changes: 88 additions & 16 deletions src/pb_stub.cc
@@ -104,6 +104,32 @@ PyDefaultArgumentToMutableType(const py::object& argument)
      std::string(py::str(argument.get_type())));
}

+void
+AsyncEventFutureDoneCallback(const py::object& py_future)
+{
+  // TODO: Why does using `py_future.result()` hang on exit when the future
+  // holds an error?
+  try {
+    py::object exception = py_future.attr("exception")();
+    if (!py::isinstance<py::none>(exception)) {
+      std::string err_msg = "";
+      py::object traceback = py::module_::import("traceback")
+                                 .attr("TracebackException")
+                                 .attr("from_exception")(exception)
+                                 .attr("format")();
+      for (py::handle line : traceback) {
+        err_msg += py::str(line);
+      }
+      LOG_ERROR << err_msg;
+    }
+  }
+  catch (const PythonBackendException& pb_exception) {
+    LOG_ERROR << pb_exception.what();
+  }
+  catch (const py::error_already_set& error) {
+    LOG_ERROR << error.what();
+  }
+}

void
Stub::Instantiate(
    int64_t shm_growth_size, int64_t shm_default_size,
@@ -536,6 +562,8 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle)
      c_python_backend_utils.attr("InferenceResponse"));
  c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get());

+  async_event_loop_ = py::none();
+
  py::object TritonPythonModel = sys.attr("TritonPythonModel");
  deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor");
  serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor");
@@ -693,11 +721,18 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr)

    py::object execute_return =
        model_instance_.attr("execute")(py_request_list);
-    if (!py::isinstance<py::none>(execute_return)) {
-      throw PythonBackendException(
-          "Python model '" + name_ +
-          "' is using the decoupled mode and the execute function must "
-          "return None.");
+    bool is_coroutine = py::module::import("asyncio")
+                            .attr("iscoroutine")(execute_return)
+                            .cast<bool>();
+    if (is_coroutine) {
+      RunCoroutine(execute_return);
+    } else {
+      if (!py::isinstance<py::none>(execute_return)) {
+        throw PythonBackendException(
+            "Python model '" + name_ +
+            "' is using the decoupled mode and the execute function must "
+            "return None.");
+      }
    }
  }
@@ -880,6 +915,35 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr)
  }
}

+py::object
+Stub::GetAsyncEventLoop()
+{
+  if (py::isinstance<py::none>(async_event_loop_)) {
+    // Create the event loop if it has not been created already.
+    py::module asyncio = py::module_::import("asyncio");
+    async_event_loop_ = asyncio.attr("new_event_loop")();
+    asyncio.attr("set_event_loop")(async_event_loop_);
+    py::object py_thread =
+        py::module_::import("threading")
+            .attr("Thread")(
+                "target"_a = async_event_loop_.attr("run_forever"),
+                "daemon"_a = true);
+    py_thread.attr("start")();
+  }
+  return async_event_loop_;
+}
+
+void
+Stub::RunCoroutine(py::object coroutine)
+{
+  py::object loop = GetAsyncEventLoop();
+  py::object py_future = py::module_::import("asyncio").attr(
+      "run_coroutine_threadsafe")(coroutine, loop);
+  py_future.attr("add_done_callback")(
+      py::module_::import("c_python_backend_utils")
+          .attr("async_event_future_done_callback"));
+}

void
Stub::UpdateHealth()
{
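
In Python terms, the loop management in `GetAsyncEventLoop` and `RunCoroutine`
above is roughly equivalent to the following sketch; `model_execute` and
`log_future_exception` are hypothetical stand-ins for the model's coroutine
and for `AsyncEventFutureDoneCallback`:

```python
import asyncio
import threading
import traceback

# One event loop per model instance, served by a daemon thread.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
threading.Thread(target=loop.run_forever, daemon=True).start()

async def model_execute():
    await asyncio.sleep(0.1)  # stand-in for the model's async execute

def log_future_exception(future):
    # Mirrors AsyncEventFutureDoneCallback: format and log any exception
    # raised by the coroutine instead of calling future.result().
    exc = future.exception()
    if exc is not None:
        print("".join(traceback.TracebackException.from_exception(exc).format()))

# Schedule the coroutine on the loop's thread and attach the callback.
future = asyncio.run_coroutine_threadsafe(model_execute(), loop)
future.add_done_callback(log_future_exception)
future.result()  # only for this demo; the stub does not block on the future
```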
@@ -891,13 +955,19 @@ void
Stub::Finalize()
{
  finalizing_ = true;
-  // Call finalize if exists.
-  if (initialized_ && py::hasattr(model_instance_, "finalize")) {
-    try {
-      model_instance_.attr("finalize")();
-    }
-    catch (const py::error_already_set& e) {
-      LOG_INFO << e.what();
-    }
+  if (initialized_) {
+    // Stop the async event loop if it was created.
+    if (!py::isinstance<py::none>(async_event_loop_)) {
+      async_event_loop_.attr("stop")();
+    }
+    // Call finalize if it exists.
+    if (py::hasattr(model_instance_, "finalize")) {
+      try {
+        model_instance_.attr("finalize")();
+      }
+      catch (const py::error_already_set& e) {
+        LOG_INFO << e.what();
+      }
+    }
  }
#ifdef TRITON_ENABLE_GPU
@@ -953,6 +1023,7 @@ Stub::~Stub()

  {
    py::gil_scoped_acquire acquire;
+    async_event_loop_ = py::none();
    model_instance_ = py::none();
  }
  stub_instance_.reset();
@@ -1739,11 +1810,6 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
          [](std::shared_ptr<InferRequest>& infer_request,
             const bool decoupled) {
            std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
-           if (stub->IsDecoupled()) {
-             throw PythonBackendException(
-                 "Async BLS request execution is not support in the decoupled "
-                 "API.");
-           }
            py::object loop =
                py::module_::import("asyncio").attr("get_running_loop")();
            py::cpp_function callback = [&stub, infer_request, decoupled]() {
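
With the guard above removed, a decoupled Python model can now issue async BLS
requests. A minimal sketch, assuming a hypothetical composing model
`bls_target` with input `INPUT0` and output `OUTPUT0`:

```python
import numpy as np
import triton_python_backend_utils as pb_utils


async def call_bls(input_array):
    # Build a BLS request against the (hypothetical) composing model.
    infer_request = pb_utils.InferenceRequest(
        model_name="bls_target",
        requested_output_names=["OUTPUT0"],
        inputs=[pb_utils.Tensor("INPUT0", input_array.astype(np.float32))],
    )
    # Awaiting keeps the shared event loop free for other requests.
    infer_response = await infer_request.async_exec()
    return pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0")
```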
@@ -1870,6 +1936,12 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
      "is_model_ready", &IsModelReady, py::arg("model_name").none(false),
      py::arg("model_version").none(false) = "");

+  // This function is not part of the public API for the Python backend. It is
+  // only used for internal callbacks.
+  module.def(
+      "async_event_future_done_callback", &AsyncEventFutureDoneCallback,
+      py::arg("py_future").none(false));
+
  // This class is not part of the public API for Python backend. This is only
  // used for internal testing purposes.
  py::class_<SharedMemoryManager>(module, "SharedMemory")