[PyTorch] Adding support for all possible torch.cuda and c10::cuda APIs & Macros #2580

Merged · 2 commits · Jan 8, 2025
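In summary, the PR touches four files: a header-insertion fix in AnalysisInfo.cpp so that replacement headers spelled with angle brackets are no longer re-quoted, a Python lit-test pair (input.py / expected.py) covering the torch.cuda → torch.xpu rewrite, and two CUDA lit tests (ATen.cu, c10.cu) covering the at::cuda / c10::cuda → c10::xpu device, stream, and macro mappings.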
8 changes: 6 additions & 2 deletions clang/lib/DPCT/AnalysisInfo.cpp
@@ -955,8 +955,12 @@ void DpctFileInfo::insertHeader(HeaderType Type, unsigned Offset,
   if (auto Iter = FirstIncludeOffset.find(
           DpctGlobalInfo::getInstance().getMainFile());
       Iter != FirstIncludeOffset.end())
-    insertHeader("#include \"" + File + +"\"" + getNL(), Iter->second,
-                 InsertPosition::IP_Right);
+    if (!File.empty() && File[0] == '<')
+      insertHeader("#include " + File + getNL(), Iter->second,
+                   InsertPosition::IP_Right);
+    else
+      insertHeader("#include \"" + File + "\"" + getNL(), Iter->second,
+                   InsertPosition::IP_Right);
   }
   return;

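The AnalysisInfo.cpp hunk above fixes two problems in the header-insertion path: it removes the stray "+ +" string concatenation, and it stops wrapping headers that are already spelled with angle brackets (such as <c10/xpu/XPUStream.h>) in double quotes. A minimal standalone sketch of the quoting rule — not DPCT's actual helper, the function name is illustrative:

#include <iostream>
#include <string>

// A header spelled "<...>" is emitted as a system include;
// anything else is wrapped in double quotes.
std::string makeIncludeDirective(const std::string &File) {
  if (!File.empty() && File[0] == '<')
    return "#include " + File + "\n";
  return "#include \"" + File + "\"\n";
}

int main() {
  std::cout << makeIncludeDirective("<c10/xpu/XPUStream.h>"); // #include <c10/xpu/XPUStream.h>
  std::cout << makeIncludeDirective("dpct/dpct.hpp");         // #include "dpct/dpct.hpp"
  return 0;
}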
46 changes: 45 additions & 1 deletion clang/test/dpct/python_migration/case_006/expected.py
@@ -1,14 +1,58 @@
from torch import xpu

cuda_ver = torch.version.xpu

#init
torch.xpu.init()
xpu.init()
is_init = torch.xpu.is_initialized()
is_init = xpu.is_initialized()

# device APIs
devs = torch.xpu.device_count()
devs = xpu.device_count()

dev = torch.xpu.current_device()
dev = xpu.current_device()

torch.xpu.set_device(dev)
xpu.set_device(dev)

d_props = torch.xpu.get_device_properties(dev)
d_props = xpu.get_device_properties(dev)

curr_d_name = torch.xpu.get_device_name()
curr_d_name = xpu.get_device_name()
d_name = torch.xpu.get_device_name(dev)
d_name = xpu.get_device_name(dev)

d_cap = torch.xpu.get_device_capability()
d_cap = xpu.get_device_capability()
d0_cap = torch.xpu.get_device_capability(devs[0])
d0_cap = xpu.get_device_capability(devs[0])

dev_of_obj = torch.xpu.device_of(obj)
dev_of_obj = xpu.device_of(obj)

arch_list = ['']
arch_list = ['']

cuda_ver = torch.version.xpu
torch.xpu.synchronize()
xpu.synchronize()
torch.xpu.synchronize(dev)
xpu.synchronize(dev)

# stream APIs
curr_st = torch.xpu.current_stream()
curr_st = xpu.current_stream()
curr_d_st = torch.xpu.current_stream(dev)
curr_d_st = xpu.current_stream(dev)

st = torch.xpu.StreamContext(curr_st)
st = xpu.StreamContext(curr_st)

stS = torch.xpu.stream(st)
stS = xpu.stream(st)

torch.xpu.set_stream(st)
xpu.set_stream(st)
46 changes: 45 additions & 1 deletion clang/test/dpct/python_migration/case_006/input.py
@@ -1,14 +1,58 @@
from torch import cuda

cuda_ver = torch.version.cuda

#init
torch.cuda.init()
cuda.init()
is_init = torch.cuda.is_initialized()
is_init = cuda.is_initialized()

# device APIs
devs = torch.cuda.device_count()
devs = cuda.device_count()

dev = torch.cuda.current_device()
dev = cuda.current_device()

torch.cuda.set_device(dev)
cuda.set_device(dev)

d_props = torch.cuda.get_device_properties(dev)
d_props = cuda.get_device_properties(dev)

curr_d_name = torch.cuda.get_device_name()
curr_d_name = cuda.get_device_name()
d_name = torch.cuda.get_device_name(dev)
d_name = cuda.get_device_name(dev)

d_cap = torch.cuda.get_device_capability()
d_cap = cuda.get_device_capability()
d0_cap = torch.cuda.get_device_capability(devs[0])
d0_cap = cuda.get_device_capability(devs[0])

dev_of_obj = torch.cuda.device_of(obj)
dev_of_obj = cuda.device_of(obj)

arch_list = torch.cuda.get_arch_list()
arch_list = cuda.get_arch_list()

cuda_ver = torch.version.cuda
torch.cuda.synchronize()
cuda.synchronize()
torch.cuda.synchronize(dev)
cuda.synchronize(dev)

# stream APIs
curr_st = torch.cuda.current_stream()
curr_st = cuda.current_stream()
curr_d_st = torch.cuda.current_stream(dev)
curr_d_st = cuda.current_stream(dev)

st = torch.cuda.StreamContext(curr_st)
st = cuda.StreamContext(curr_st)

stS = torch.cuda.stream(st)
stS = cuda.stream(st)

torch.cuda.set_stream(st)
cuda.set_stream(st)
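Read together, input.py and expected.py document the intended rewrite: every torch.cuda.* and cuda.* call maps to its torch.xpu / xpu counterpart, torch.version.cuda becomes torch.version.xpu, and torch.cuda.get_arch_list(), which reports CUDA compute architectures, is migrated here to the placeholder list [''] rather than to an XPU API.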
57 changes: 51 additions & 6 deletions clang/test/dpct/pytorch/ATen.cu
@@ -1,23 +1,68 @@
// RUN: rm -rf %T/pytorch/ATen
// RUN: mkdir -p %T/pytorch/ATen/src
// RUN: cp %S/ATen.cu %T/pytorch/ATen/src/
// RUN: cp %S/user_defined_rule_pytorch.yaml %T/pytorch/ATen/
// RUN: cp -r %S/pytorch_cuda_inc %T/pytorch/ATen/
// RUN: cp -r %S/pytorch_inc %T/pytorch/ATen/
// RUN: cd %T/pytorch/ATen
// RUN: mkdir dpct_out
// RUN: dpct --out-root dpct_out %T/pytorch/ATen/src/ATen.cu --extra-arg="-I%T/pytorch/ATen/pytorch_cuda_inc" --cuda-include-path="%cuda-path/include" --rule-file=%T/pytorch/ATen/user_defined_rule_pytorch.yaml --analysis-scope-path %T/pytorch/ATen/pytorch_cuda_inc --analysis-scope-path %T/pytorch/ATen/src --in-root %T/pytorch/ATen/src
// RUN: dpct --out-root dpct_out %T/pytorch/ATen/src/ATen.cu --extra-arg="-I%T/pytorch/ATen/pytorch_inc" --cuda-include-path="%cuda-path/include" --rule-file=%S/../../../tools/dpct/DpctOptRules/pytorch_api.yaml --analysis-scope-path %T/pytorch/ATen/pytorch_inc --analysis-scope-path %T/pytorch/ATen/src --in-root %T/pytorch/ATen/src
// RUN: FileCheck --input-file %T/pytorch/ATen/dpct_out/ATen.dp.cpp --match-full-lines %T/pytorch/ATen/src/ATen.cu
// RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/pytorch/ATen/dpct_out/ATen.dp.cpp -o %T/pytorch/ATen/dpct_out/ATen.dp.o %}

#ifndef NO_BUILD_TEST
// CHECK: #include <c10/xpu/XPUStream.h>
#include <iostream>
// CHECK: #include <ATen/xpu/XPUContext.h>
#include <ATen/cuda/CUDAContext.h>
// CHECK: #include <ATen/core/Tensor.h>
#include <ATen/core/Tensor.h>

// CHECK: #include <ATen/Tensor.h>
// CHECK-NEXT: #include <c10/util/Half.h>
#include <ATen/cuda/CUDATensorMethods.cuh>

#define AT_CUDA_CHECK(stmt) (stmt)

// CHECK: #define BE_AT_CHECK
#define BE_AT_CHECK AT_CUDA_CHECK


__global__ void kernel() {}

void test_CUDAStream_as_arg() {
dim3 gridSize(2, 2, 1);
dim3 blockSize(8, 8, 1);
void *args[] = {nullptr};

// CHECK: ([&]() {
// CHECK-NEXT: ((sycl::queue *)(c10::xpu::getCurrentXPUStream()))
// CHECK-NEXT: ->parallel_for(sycl::nd_range<3>(gridSize * blockSize, blockSize),
// CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {
// CHECK-NEXT: kernel();
// CHECK-NEXT: });
// CHECK-NEXT: return 0;
// CHECK-NEXT: }());
AT_CUDA_CHECK(cudaLaunchKernel((const void *)kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream()));
}

int main() {
// CHECK: dpct::queue_ptr st =
// CHECK-NEXT: &static_cast<sycl::queue &>(c10::xpu::getCurrentXPUStream());
cudaStream_t st = 0;

// stream APIs
at::DeviceIndex devInd = 1;

// CHECK: auto currentStream = c10::xpu::getCurrentXPUStream();
auto currentStream = at::cuda::getCurrentCUDAStream();
// CHECK: auto deviceStream = c10::xpu::getCurrentXPUStream(devInd);
auto deviceStream = at::cuda::getCurrentCUDAStream(devInd);

// CHECK: dpct::queue_ptr curr_cuda_st =
// CHECK-NEXT: &static_cast<sycl::queue &>(c10::xpu::getCurrentXPUStream().queue());
cudaStream_t curr_cuda_st = at::cuda::getCurrentCUDAStream().stream();
// CHECK: dpct::queue_ptr dev_cuda_st = &static_cast<sycl::queue &>(
// CHECK-NEXT: c10::xpu::getCurrentXPUStream(devInd).queue());
cudaStream_t dev_cuda_st = at::cuda::getCurrentCUDAStream(devInd).stream();

test_CUDAStream_as_arg();

return 0;
}
#endif
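For reference, the migrated launch that the CHECK lines in test_CUDAStream_as_arg expect boils down to the following standalone SYCL sketch (assuming a SYCL 2020 compiler such as icpx; this is an illustration, not the tool's generated code). The dim3(2, 2, 1) / dim3(8, 8, 1) pair becomes a reversed-order sycl::range<3>, and the launch on at::cuda::getCurrentCUDAStream() becomes a parallel_for on the queue behind c10::xpu::getCurrentXPUStream():

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q; // stands in for the queue behind the current XPU stream

  // dim3(2, 2, 1) and dim3(8, 8, 1) map to range<3> with reversed dimensions.
  sycl::range<3> grid(1, 2, 2), block(1, 8, 8);

  // The global range is the elementwise product grid * block.
  q.parallel_for(sycl::nd_range<3>(grid * block, block),
                 [=](sycl::nd_item<3> item) { /* kernel body */ });
  q.wait();
  return 0;
}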
57 changes: 45 additions & 12 deletions clang/test/dpct/pytorch/c10.cu
@@ -1,24 +1,56 @@
// RUN: rm -rf %T/pytorch/c10
// RUN: mkdir -p %T/pytorch/c10/src
// RUN: cp %S/c10.cu %T/pytorch/c10/src/
// RUN: cp %S/user_defined_rule_pytorch.yaml %T/pytorch/c10/
// RUN: cp -r %S/pytorch_cuda_inc %T/pytorch/c10/
// RUN: cp -r %S/pytorch_inc %T/pytorch/c10/
// RUN: cd %T/pytorch/c10
// RUN: mkdir dpct_out
// RUN: dpct -out-root dpct_out %T/pytorch/c10/src/c10.cu --extra-arg="-I%T/pytorch/c10/pytorch_cuda_inc" --cuda-include-path="%cuda-path/include" --rule-file=%T/pytorch/c10/user_defined_rule_pytorch.yaml -- -x cuda --cuda-host-only
// RUN: dpct -out-root dpct_out %T/pytorch/c10/src/c10.cu --extra-arg="-I%T/pytorch/c10/pytorch_inc" --cuda-include-path="%cuda-path/include" --rule-file=%S/../../../tools/dpct/DpctOptRules/pytorch_api.yaml -- -x cuda --cuda-host-only
// RUN: FileCheck --input-file %T/pytorch/c10/dpct_out/c10.dp.cpp --match-full-lines %T/pytorch/c10/src/c10.cu
// RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/pytorch/c10/dpct_out/c10.dp.cpp -o %T/pytorch/c10/dpct_out/c10.dp.o %}

#ifndef NO_BUILD_TEST
#include <iostream>
// CHECK: #include <c10/xpu/XPUStream.h>
#include <c10/cuda/CUDAStream.h>
// CHECK: #include <c10/core/DeviceGuard.h>
#include <c10/cuda/CUDAGuard.h>
// CHECK: #include <c10/xpu/XPUStream.h>
#include <c10/cuda/CUDAStream.h>
// CHECK: #include <c10/xpu/XPUFunctions.h>
#include <c10/cuda/CUDAFunctions.h>
// CHECK: #include <c10/xpu/XPUMacros.h>
#include <c10/cuda/CUDAMacros.h>

// CHECK: #define BE_BUILD_SHARED_LIBS C10_XPU_BUILD_SHARED_LIBS
// CHECK-NEXT: #define BE_EXPORT C10_XPU_EXPORT
// CHECK-NEXT: #define BE_IMPORT C10_XPU_IMPORT
// CHECK-NEXT: #define BE_API C10_XPU_API
// CHECK-NEXT: #define BE_BUILD_MAIN_LIB C10_XPU_BUILD_MAIN_LIB
#define BE_BUILD_SHARED_LIBS C10_CUDA_BUILD_SHARED_LIBS
#define BE_EXPORT C10_CUDA_EXPORT
#define BE_IMPORT C10_CUDA_IMPORT
#define BE_API C10_CUDA_API
#define BE_BUILD_MAIN_LIB C10_CUDA_BUILD_MAIN_LIB

int main() {
std::optional<c10::Device> device;
// device APIs
// CHECK: c10::DeviceIndex num_devices = c10::xpu::device_count();
c10::DeviceIndex num_devices = c10::cuda::device_count();

// CHECK: c10::DeviceIndex num_devices_ensured =
// CHECK-NEXT: c10::xpu::device_count_ensure_non_zero();
c10::DeviceIndex num_devices_ensured = c10::cuda::device_count_ensure_non_zero();

// CHECK: c10::DeviceIndex current_device = c10::xpu::current_device();
c10::DeviceIndex current_device = c10::cuda::current_device();

c10::DeviceIndex new_device = 1;
// CHECK: c10::xpu::set_device(new_device);
c10::cuda::set_device(new_device);

// CHECK: c10::DeviceIndex exchanged_device = c10::xpu::exchange_device(0);
c10::DeviceIndex exchanged_device = c10::cuda::ExchangeDevice(0);

// CHECK: c10::DeviceIndex maybe_exchanged_device = c10::xpu::maybe_exchange_device(1);
c10::DeviceIndex maybe_exchanged_device = c10::cuda::MaybeExchangeDevice(1);

std::optional<c10::Device> device;
try {
// CHECK: c10::OptionalDeviceGuard device_guard(device);
c10::cuda::OptionalCUDAGuard device_guard(device);
@@ -27,18 +59,19 @@ int main() {
return -1;
}

// stream APIs
// CHECK: auto currentStream = c10::xpu::getCurrentXPUStream();
auto currentStream = c10::cuda::getCurrentCUDAStream();

// CHECK: dpct::queue_ptr curr_cuda_st = &(currentStream.queue());
// CHECK-NEXT: curr_cuda_st = &(c10::xpu::getCurrentXPUStream().queue());
// CHECK: dpct::queue_ptr curr_cuda_st =
// CHECK-NEXT: &static_cast<sycl::queue &>(currentStream.queue());
cudaStream_t curr_cuda_st = currentStream.stream();
// CHECK: curr_cuda_st =
// CHECK-NEXT: &static_cast<sycl::queue &>(c10::xpu::getCurrentXPUStream().queue());
curr_cuda_st = c10::cuda::getCurrentCUDAStream().stream();

// CHECK: auto deviceStream = c10::xpu::getCurrentXPUStream(0);
auto deviceStream = c10::cuda::getCurrentCUDAStream(0);

return 0;
}

#endif
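The updated CHECK lines above also tighten the stream-to-queue conversion from &(stream.queue()) to &static_cast<sycl::queue &>(stream.queue()). A small self-contained sketch of that convention, where dpct::queue_ptr is the sycl::queue * alias from SYCLomatic's helper headers and FakeXPUStream is a hypothetical stand-in for c10::xpu::XPUStream:

#include <sycl/sycl.hpp>

namespace dpct { using queue_ptr = sycl::queue *; } // as in the dpct helper headers

// Hypothetical stand-in: the real c10::xpu::XPUStream exposes queue(),
// returning the underlying sycl::queue.
struct FakeXPUStream {
  sycl::queue q;
  sycl::queue &queue() { return q; }
};

int main() {
  FakeXPUStream stream;
  // The migrated pattern: a cudaStream_t variable becomes a
  // dpct::queue_ptr pointing at the stream's SYCL queue.
  dpct::queue_ptr st = &static_cast<sycl::queue &>(stream.queue());
  st->wait();
  return 0;
}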