[PyTorch] Adding support for all possible torch.cuda and c10::cuda APIs & Macros #2580

Merged · 2 commits · Jan 8, 2025
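In summary, the PR touches four files: a header-insertion fix in AnalysisInfo.cpp so that replacement headers spelled with angle brackets are no longer re-quoted, a Python lit-test pair (input.py / expected.py) covering the torch.cuda → torch.xpu rewrite, and two CUDA lit tests (ATen.cu, c10.cu) covering the at::cuda / c10::cuda → c10::xpu device, stream, and macro mappings.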
8 changes: 6 additions & 2 deletions clang/lib/DPCT/AnalysisInfo.cpp
@@ -955,8 +955,12 @@ void DpctFileInfo::insertHeader(HeaderType Type, unsigned Offset,
   if (auto Iter = FirstIncludeOffset.find(
           DpctGlobalInfo::getInstance().getMainFile());
       Iter != FirstIncludeOffset.end())
-    insertHeader("#include \"" + File + +"\"" + getNL(), Iter->second,
-                 InsertPosition::IP_Right);
+    if (!File.empty() && File[0] == '<')
+      insertHeader("#include " + File + getNL(), Iter->second,
+                   InsertPosition::IP_Right);
+    else
+      insertHeader("#include \"" + File + "\"" + getNL(), Iter->second,
+                   InsertPosition::IP_Right);
   }
   return;

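The AnalysisInfo.cpp hunk above fixes two problems in the header-insertion path: it removes the stray "+ +" string concatenation, and it stops wrapping headers that are already spelled with angle brackets (such as <c10/xpu/XPUStream.h>) in double quotes. A minimal standalone sketch of the quoting rule — not DPCT's actual helper, the function name is illustrative:

#include <iostream>
#include <string>

// A header spelled "<...>" is emitted as a system include;
// anything else is wrapped in double quotes.
std::string makeIncludeDirective(const std::string &File) {
  if (!File.empty() && File[0] == '<')
    return "#include " + File + "\n";
  return "#include \"" + File + "\"\n";
}

int main() {
  std::cout << makeIncludeDirective("<c10/xpu/XPUStream.h>"); // #include <c10/xpu/XPUStream.h>
  std::cout << makeIncludeDirective("dpct/dpct.hpp");         // #include "dpct/dpct.hpp"
  return 0;
}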
46 changes: 45 additions & 1 deletion clang/test/dpct/python_migration/case_006/expected.py
@@ -1,14 +1,58 @@
from torch import xpu

cuda_ver = torch.version.xpu

#init
torch.xpu.init()
xpu.init()
is_init = torch.xpu.is_initialized()
is_init = xpu.is_initialized()

# device APIs
devs = torch.xpu.device_count()
devs = xpu.device_count()

dev = torch.xpu.current_device()
dev = xpu.current_device()

torch.xpu.set_device(dev)
xpu.set_device(dev)

d_props = torch.xpu.get_device_properties(dev)
d_props = xpu.get_device_properties(dev)

curr_d_name = torch.xpu.get_device_name()
curr_d_name = xpu.get_device_name()
d_name = torch.xpu.get_device_name(dev)
d_name = xpu.get_device_name(dev)

d_cap = torch.xpu.get_device_capability()
d_cap = xpu.get_device_capability()
d0_cap = torch.xpu.get_device_capability(devs[0])
d0_cap = xpu.get_device_capability(devs[0])

dev_of_obj = torch.xpu.device_of(obj)
dev_of_obj = xpu.device_of(obj)

arch_list = ['']
arch_list = ['']

cuda_ver = torch.version.xpu
torch.xpu.synchronize()
xpu.synchronize()
torch.xpu.synchronize(dev)
xpu.synchronize(dev)

# stream APIs
curr_st = torch.xpu.current_stream()
curr_st = xpu.current_stream()
curr_d_st = torch.xpu.current_stream(dev)
curr_d_st = xpu.current_stream(dev)

st = torch.xpu.StreamContext(curr_st)
st = xpu.StreamContext(curr_st)

stS = torch.xpu.stream(st)
stS = xpu.stream(st)

torch.xpu.set_stream(st)
xpu.set_stream(st)
46 changes: 45 additions & 1 deletion clang/test/dpct/python_migration/case_006/input.py
@@ -1,14 +1,58 @@
from torch import cuda

cuda_ver = torch.version.cuda

#init
torch.cuda.init()
cuda.init()
is_init = torch.cuda.is_initialized()
is_init = cuda.is_initialized()

# device APIs
devs = torch.cuda.device_count()
devs = cuda.device_count()

dev = torch.cuda.current_device()
dev = cuda.current_device()

torch.cuda.set_device(dev)
cuda.set_device(dev)

d_props = torch.cuda.get_device_properties(dev)
d_props = cuda.get_device_properties(dev)

curr_d_name = torch.cuda.get_device_name()
curr_d_name = cuda.get_device_name()
d_name = torch.cuda.get_device_name(dev)
d_name = cuda.get_device_name(dev)

d_cap = torch.cuda.get_device_capability()
d_cap = cuda.get_device_capability()
d0_cap = torch.cuda.get_device_capability(devs[0])
d0_cap = cuda.get_device_capability(devs[0])

dev_of_obj = torch.cuda.device_of(obj)
dev_of_obj = cuda.device_of(obj)

arch_list = torch.cuda.get_arch_list()
arch_list = cuda.get_arch_list()

cuda_ver = torch.version.cuda
torch.cuda.synchronize()
cuda.synchronize()
torch.cuda.synchronize(dev)
cuda.synchronize(dev)

# stream APIs
curr_st = torch.cuda.current_stream()
curr_st = cuda.current_stream()
curr_d_st = torch.cuda.current_stream(dev)
curr_d_st = cuda.current_stream(dev)

st = torch.cuda.StreamContext(curr_st)
st = cuda.StreamContext(curr_st)

stS = torch.cuda.stream(st)
stS = cuda.stream(st)

torch.cuda.set_stream(st)
cuda.set_stream(st)
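Read together, input.py and expected.py document the intended rewrite: every torch.cuda.* and cuda.* call maps to its torch.xpu / xpu counterpart, torch.version.cuda becomes torch.version.xpu, and torch.cuda.get_arch_list(), which reports CUDA compute architectures, is migrated here to the placeholder list [''] rather than to an XPU API.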
57 changes: 51 additions & 6 deletions clang/test/dpct/pytorch/ATen.cu
@@ -1,23 +1,68 @@
// RUN: rm -rf %T/pytorch/ATen
// RUN: mkdir -p %T/pytorch/ATen/src
// RUN: cp %S/ATen.cu %T/pytorch/ATen/src/
// RUN: cp %S/user_defined_rule_pytorch.yaml %T/pytorch/ATen/
// RUN: cp -r %S/pytorch_cuda_inc %T/pytorch/ATen/
// RUN: cp -r %S/pytorch_inc %T/pytorch/ATen/
// RUN: cd %T/pytorch/ATen
// RUN: mkdir dpct_out
// RUN: dpct --out-root dpct_out %T/pytorch/ATen/src/ATen.cu --extra-arg="-I%T/pytorch/ATen/pytorch_cuda_inc" --cuda-include-path="%cuda-path/include" --rule-file=%T/pytorch/ATen/user_defined_rule_pytorch.yaml --analysis-scope-path %T/pytorch/ATen/pytorch_cuda_inc --analysis-scope-path %T/pytorch/ATen/src --in-root %T/pytorch/ATen/src
// RUN: dpct --out-root dpct_out %T/pytorch/ATen/src/ATen.cu --extra-arg="-I%T/pytorch/ATen/pytorch_inc" --cuda-include-path="%cuda-path/include" --rule-file=%S/../../../tools/dpct/DpctOptRules/pytorch_api.yaml --analysis-scope-path %T/pytorch/ATen/pytorch_inc --analysis-scope-path %T/pytorch/ATen/src --in-root %T/pytorch/ATen/src
// RUN: FileCheck --input-file %T/pytorch/ATen/dpct_out/ATen.dp.cpp --match-full-lines %T/pytorch/ATen/src/ATen.cu
// RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/pytorch/ATen/dpct_out/ATen.dp.cpp -o %T/pytorch/ATen/dpct_out/ATen.dp.o %}

#ifndef NO_BUILD_TEST
// CHECK: #include <c10/xpu/XPUStream.h>
#include <iostream>
// CHECK: #include <ATen/xpu/XPUContext.h>
#include <ATen/cuda/CUDAContext.h>
// CHECK: #include <ATen/core/Tensor.h>
#include <ATen/core/Tensor.h>

// CHECK: #include <ATen/Tensor.h>
// CHECK-NEXT: #include <c10/util/Half.h>
#include <ATen/cuda/CUDATensorMethods.cuh>

#define AT_CUDA_CHECK(stmt) (stmt)

// CHECK: #define BE_AT_CHECK
#define BE_AT_CHECK AT_CUDA_CHECK


__global__ void kernel() {}

void test_CUDAStream_as_arg() {
dim3 gridSize(2, 2, 1);
dim3 blockSize(8, 8, 1);
void *args[] = {nullptr};

// CHECK: ([&]() {
// CHECK-NEXT: ((sycl::queue *)(c10::xpu::getCurrentXPUStream()))
// CHECK-NEXT: ->parallel_for(sycl::nd_range<3>(gridSize * blockSize, blockSize),
// CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {
// CHECK-NEXT: kernel();
// CHECK-NEXT: });
// CHECK-NEXT: return 0;
// CHECK-NEXT: }());
AT_CUDA_CHECK(cudaLaunchKernel((const void *)kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream()));
}

int main() {
// CHECK: dpct::queue_ptr st =
// CHECK-NEXT: &static_cast<sycl::queue &>(c10::xpu::getCurrentXPUStream());
cudaStream_t st = 0;

// stream APIs
at::DeviceIndex devInd = 1;

// CHECK: auto currentStream = c10::xpu::getCurrentXPUStream();
auto currentStream = at::cuda::getCurrentCUDAStream();
// CHECK: auto deviceStream = c10::xpu::getCurrentXPUStream(devInd);
auto deviceStream = at::cuda::getCurrentCUDAStream(devInd);

// CHECK: dpct::queue_ptr curr_cuda_st =
// CHECK-NEXT: &static_cast<sycl::queue &>(c10::xpu::getCurrentXPUStream().queue());
cudaStream_t curr_cuda_st = at::cuda::getCurrentCUDAStream().stream();
// CHECK: dpct::queue_ptr dev_cuda_st = &static_cast<sycl::queue &>(
// CHECK-NEXT: c10::xpu::getCurrentXPUStream(devInd).queue());
cudaStream_t dev_cuda_st = at::cuda::getCurrentCUDAStream(devInd).stream();

test_CUDAStream_as_arg();

return 0;
}
#endif
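For reference, the migrated launch that the CHECK lines in test_CUDAStream_as_arg expect boils down to the following standalone SYCL sketch (assuming a SYCL 2020 compiler such as icpx; this is an illustration, not the tool's generated code). The dim3(2, 2, 1) / dim3(8, 8, 1) pair becomes a reversed-order sycl::range<3>, and the launch on at::cuda::getCurrentCUDAStream() becomes a parallel_for on the queue behind c10::xpu::getCurrentXPUStream():

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q; // stands in for the queue behind the current XPU stream

  // dim3(2, 2, 1) and dim3(8, 8, 1) map to range<3> with reversed dimensions.
  sycl::range<3> grid(1, 2, 2), block(1, 8, 8);

  // The global range is the elementwise product grid * block.
  q.parallel_for(sycl::nd_range<3>(grid * block, block),
                 [=](sycl::nd_item<3> item) { /* kernel body */ });
  q.wait();
  return 0;
}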
57 changes: 45 additions & 12 deletions clang/test/dpct/pytorch/c10.cu
@@ -1,24 +1,56 @@
// RUN: rm -rf %T/pytorch/c10
// RUN: mkdir -p %T/pytorch/c10/src
// RUN: cp %S/c10.cu %T/pytorch/c10/src/
// RUN: cp %S/user_defined_rule_pytorch.yaml %T/pytorch/c10/
// RUN: cp -r %S/pytorch_cuda_inc %T/pytorch/c10/
// RUN: cp -r %S/pytorch_inc %T/pytorch/c10/
// RUN: cd %T/pytorch/c10
// RUN: mkdir dpct_out
// RUN: dpct -out-root dpct_out %T/pytorch/c10/src/c10.cu --extra-arg="-I%T/pytorch/c10/pytorch_cuda_inc" --cuda-include-path="%cuda-path/include" --rule-file=%T/pytorch/c10/user_defined_rule_pytorch.yaml -- -x cuda --cuda-host-only
// RUN: dpct -out-root dpct_out %T/pytorch/c10/src/c10.cu --extra-arg="-I%T/pytorch/c10/pytorch_inc" --cuda-include-path="%cuda-path/include" --rule-file=%S/../../../tools/dpct/DpctOptRules/pytorch_api.yaml -- -x cuda --cuda-host-only
// RUN: FileCheck --input-file %T/pytorch/c10/dpct_out/c10.dp.cpp --match-full-lines %T/pytorch/c10/src/c10.cu
// RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/pytorch/c10/dpct_out/c10.dp.cpp -o %T/pytorch/c10/dpct_out/c10.dp.o %}

#ifndef NO_BUILD_TEST
#include <iostream>
// CHECK: #include <c10/xpu/XPUStream.h>
#include <c10/cuda/CUDAStream.h>
// CHECK: #include <c10/core/DeviceGuard.h>
#include <c10/cuda/CUDAGuard.h>
// CHECK: #include <c10/xpu/XPUStream.h>
#include <c10/cuda/CUDAStream.h>
// CHECK: #include <c10/xpu/XPUFunctions.h>
#include <c10/cuda/CUDAFunctions.h>
// CHECK: #include <c10/xpu/XPUMacros.h>
#include <c10/cuda/CUDAMacros.h>

// CHECK: #define BE_BUILD_SHARED_LIBS C10_XPU_BUILD_SHARED_LIBS
// CHECK-NEXT: #define BE_EXPORT C10_XPU_EXPORT
// CHECK-NEXT: #define BE_IMPORT C10_XPU_IMPORT
// CHECK-NEXT: #define BE_API C10_XPU_API
// CHECK-NEXT: #define BE_BUILD_MAIN_LIB C10_XPU_BUILD_MAIN_LIB
#define BE_BUILD_SHARED_LIBS C10_CUDA_BUILD_SHARED_LIBS
#define BE_EXPORT C10_CUDA_EXPORT
#define BE_IMPORT C10_CUDA_IMPORT
#define BE_API C10_CUDA_API
#define BE_BUILD_MAIN_LIB C10_CUDA_BUILD_MAIN_LIB

int main() {
std::optional<c10::Device> device;
// device APIs
// CHECK: c10::DeviceIndex num_devices = c10::xpu::device_count();
c10::DeviceIndex num_devices = c10::cuda::device_count();

// CHECK: c10::DeviceIndex num_devices_ensured =
// CHECK-NEXT: c10::xpu::device_count_ensure_non_zero();
c10::DeviceIndex num_devices_ensured = c10::cuda::device_count_ensure_non_zero();

// CHECK: c10::DeviceIndex current_device = c10::xpu::current_device();
c10::DeviceIndex current_device = c10::cuda::current_device();

c10::DeviceIndex new_device = 1;
// CHECK: c10::xpu::set_device(new_device);
c10::cuda::set_device(new_device);

// CHECK: c10::DeviceIndex exchanged_device = c10::xpu::exchange_device(0);
c10::DeviceIndex exchanged_device = c10::cuda::ExchangeDevice(0);

// CHECK: c10::DeviceIndex maybe_exchanged_device = c10::xpu::maybe_exchange_device(1);
c10::DeviceIndex maybe_exchanged_device = c10::cuda::MaybeExchangeDevice(1);

std::optional<c10::Device> device;
try {
// CHECK: c10::OptionalDeviceGuard device_guard(device);
c10::cuda::OptionalCUDAGuard device_guard(device);
@@ -27,18 +59,19 @@ int main() {
return -1;
}

// stream APIs
// CHECK: auto currentStream = c10::xpu::getCurrentXPUStream();
auto currentStream = c10::cuda::getCurrentCUDAStream();

// CHECK: dpct::queue_ptr curr_cuda_st = &(currentStream.queue());
// CHECK-NEXT: curr_cuda_st = &(c10::xpu::getCurrentXPUStream().queue());
// CHECK: dpct::queue_ptr curr_cuda_st =
// CHECK-NEXT: &static_cast<sycl::queue &>(currentStream.queue());
cudaStream_t curr_cuda_st = currentStream.stream();
// CHECK: curr_cuda_st =
// CHECK-NEXT: &static_cast<sycl::queue &>(c10::xpu::getCurrentXPUStream().queue());
curr_cuda_st = c10::cuda::getCurrentCUDAStream().stream();

// CHECK: auto deviceStream = c10::xpu::getCurrentXPUStream(0);
auto deviceStream = c10::cuda::getCurrentCUDAStream(0);

return 0;
}

#endif
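The updated CHECK lines above also tighten the stream-to-queue conversion from &(stream.queue()) to &static_cast<sycl::queue &>(stream.queue()). A small self-contained sketch of that convention, where dpct::queue_ptr is the sycl::queue * alias from SYCLomatic's helper headers and FakeXPUStream is a hypothetical stand-in for c10::xpu::XPUStream:

#include <sycl/sycl.hpp>

namespace dpct { using queue_ptr = sycl::queue *; } // as in the dpct helper headers

// Hypothetical stand-in: the real c10::xpu::XPUStream exposes queue(),
// returning the underlying sycl::queue.
struct FakeXPUStream {
  sycl::queue q;
  sycl::queue &queue() { return q; }
};

int main() {
  FakeXPUStream stream;
  // The migrated pattern: a cudaStream_t variable becomes a
  // dpct::queue_ptr pointing at the stream's SYCL queue.
  dpct::queue_ptr st = &static_cast<sycl::queue &>(stream.queue());
  st->wait();
  return 0;
}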