Merge pull request #2 from ROCmSoftwarePlatform/amd

Amd RNN
baidu-research · Mar 30, 2018 · e569133 · e569133
2 parents b49d1bb + 3a25b28
commit e569133
Show file tree

Hide file tree

Showing 6 changed files with 413 additions and 78 deletions.
diff --git a/code/amd/Makefile b/code/amd/Makefile
@@ -16,15 +16,20 @@ CONV_PATH?=$
 
 INC_DIR=${SOURCE_DIR}/../kernels
 
-all: conv gemm 
+all: conv rnn gemm 
 
+#OPT=-g -O0 -fsanitize=undefined -fno-omit-frame-pointer
 OPT=-O3
 TARGETS=
 
 conv:
 	$(MKDIR) $(BIN_DIR)
 	$(HIPCC) ${SOURCE_DIR}/conv_bench_rocm.cpp -o $(BIN_DIR)/conv_bench -I$(ROCM_PATH)/include -I$(INC_DIR) -l$(CONV_LIBRARY) $(OPT) -std=c++11 --amdgpu-target=gfx900
 
+rnn:
+	$(MKDIR) $(BIN_DIR)
+	$(HIPCC) ${SOURCE_DIR}/rnn_bench_rocm.cpp -o $(BIN_DIR)/rnn_bench -I$(ROCM_PATH)/include -I$(INC_DIR) -l$(CONV_LIBRARY) $(OPT) -std=c++11 --amdgpu-target=gfx900
+
 gemm:
 	$(MKDIR) $(BIN_DIR)
 	$(HIPCC) ${SOURCE_DIR}/gemm_bench.cpp -o $(BIN_DIR)/gemm_bench -I$(ROCM_PATH)/include -I$(INC_DIR) -l$(ROCBLAS_LIB) $(OPT) -std=c++11 --amdgpu-target=gfx900

diff --git a/code/amd/conv_bench_rocm.cpp b/code/amd/conv_bench_rocm.cpp
@@ -65,7 +65,7 @@ class miopenCNN {
 
         output_dims_ = {out_w, out_h, out_c, out_n};
 
-        h = zeros(output_dims_);
+        h = zeros<float>(output_dims_);
 
 
         // Set fwd workspace size
@@ -79,7 +79,7 @@ class miopenCNN {
 
         std::vector<int> u = std::vector<int>{static_cast<int>(fwd_workspace_size_ / sizeof(float)), 1};
 
-        fwd_workspace_ = zeros(u);
+        fwd_workspace_ = zeros<float>(u);
 
         const int requestAlgoCount = 1;
         int returnedAlgoCount;
@@ -113,7 +113,7 @@ class miopenCNN {
                                                   w_desc_.desc(),
                                                   &bwd_params_workspace_size_));
     u = std::vector<int>{static_cast<int>(bwd_params_workspace_size_ / sizeof(float)), 1};
-    bwd_params_workspace_ = zeros(u);
+    bwd_params_workspace_ = zeros<float>(u);
 
     CHECK_MIOPEN_ERROR(miopenFindConvolutionBackwardWeightsAlgorithm(
       miopen_handle_.handle(),
@@ -143,7 +143,7 @@ class miopenCNN {
                                                   &bwd_inputs_workspace_size_));
 
     u = std::vector<int>{static_cast<int>(bwd_inputs_workspace_size_ / sizeof(float)), 1};
-    bwd_inputs_workspace_ = zeros(u);
+    bwd_inputs_workspace_ = zeros<float>(u);
 
         CHECK_MIOPEN_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
           miopen_handle_.handle(),
@@ -257,10 +257,10 @@ std::tuple<int, int, int, std::string> time_cnn(
 
 
     // Allocate memory for filter
-    auto filter = rand(std::vector<int>{r, s, c, k});
+    auto filter = rand<float>(std::vector<int>{r, s, c, k});
 
     // Allocate memory for input
-    auto input = rand(std::vector<int>{w, h, c, n});
+    auto input = rand<float>(std::vector<int>{w, h, c, n});
     miopenCNN cnn(w, h, c, n, k, r, s, pad_w, pad_h, wstride, hstride, input, filter);
 
     // Allocate memory for output tensor
@@ -283,8 +283,8 @@ std::tuple<int, int, int, std::string> time_cnn(
     int fwd_time = static_cast<int>(std::chrono::duration<double, std::micro>(end - start).count() / num_repeats);
 
     // Allocate memory for backward pass wrt weights
-    auto delta = rand(cnn.get_output_dims());
-    auto dW = zeros(std::vector<int>{r, s, c, k});
+    auto delta = rand<float>(cnn.get_output_dims());
+    auto dW = zeros<float>(std::vector<int>{r, s, c, k});
 
     // Warm up backward
     cnn.backward_params(input, delta, dW);
@@ -303,7 +303,7 @@ std::tuple<int, int, int, std::string> time_cnn(
     int bwd_params_time = static_cast<int>(std::chrono::duration<double, std::micro>(end - start).count() / num_repeats);
 
     //Allocate memory for backward pass wrt inputs
-    auto dX = zeros(std::vector<int>{w, h, c, n});
+    auto dX = zeros<float>(std::vector<int>{w, h, c, n});
 
     //Warm up backward inputs
     cnn.backward_inputs(filter, delta, dX);
@@ -329,7 +329,7 @@ std::tuple<int, int, int, std::string> time_cnn(
 
 int main(int argc, char **argv) {
 
-    int num_repeats = 100;
+    int num_repeats = 300;
 
     hipFree(0);
 
@@ -339,15 +339,15 @@ int main(int argc, char **argv) {
     std::cout << std::setw(30) << "Times" << std::endl;
     std::cout << std::setfill('-') << std::setw(190) << "-" << std::endl;
     std::cout << std::setfill(' ');
-    std::cout << "   w      h      c      n      k      r      s    pad_w  pad_h    stride_w  stride_h    fwd_time (usec)  bwd_inputs_time (usec)  bwd_params_time (usec)  total_time (usec)   fwd_algo " << std::endl;
+    std::cout << "   w      h      c      n      k      f_w      f_h    pad_w  pad_h    stride_w  stride_h    fwd_time (usec)  bwd_inputs_time (usec)  bwd_params_time (usec)  total_time (usec)   fwd_algo " << std::endl;
     std::cout << std::setfill('-') << std::setw(190) << "-" << std::endl;
     std::cout << std::setfill(' ');
 
     int total_fwd_time=0, total_bwd_inputs_time=0, total_bwd_params_time=0;
     for (const auto &problem : training_set) {
 
         // Filter parameters
-        int k, c, r, s;
+        int k, c, r, s; // r - filter_h (f_h), s - filter_w (f_w)
 
         // Input parameters
         int n, w, h;
@@ -371,8 +371,8 @@ int main(int argc, char **argv) {
         std::cout << std::setw(7) << c;
         std::cout << std::setw(7) << n;
         std::cout << std::setw(7) << k;
-        std::cout << std::setw(7) << r;
         std::cout << std::setw(7) << s;
+        std::cout << std::setw(7) << r;
         std::cout << std::setw(7) << pad_w;
         std::cout << std::setw(8) << pad_h;
         std::cout << std::setw(10) << wstride;

diff --git a/code/amd/gemm_bench.cpp b/code/amd/gemm_bench.cpp
@@ -80,9 +80,9 @@ int main(int argc, char **argv) {
         bool a_t, b_t;
         std::tie(m, n, k, a_t, b_t) = problem;
 
-        auto a = rand({a_t ? k : m, a_t ? m : k});
-        auto b = rand({b_t ? n : k, b_t ? k : n});
-        auto c = zeros({m, n});
+        auto a = rand<float>({a_t ? k : m, a_t ? m : k});
+        auto b = rand<float>({b_t ? n : k, b_t ? k : n});
+        auto c = zeros<float>({m, n});
 
         std::cout << std::setw(7) << m;
         std::cout << std::setw(7) << n;

diff --git a/code/amd/miopen_helper.h b/code/amd/miopen_helper.h
@@ -5,6 +5,7 @@
 #include <vector>
 
 #include <miopen/miopen.h>
+#include <half.hpp>
 
 #include "hip_helper.h"
 
@@ -43,43 +44,53 @@ class MIOpenHandle {
 };
 
 template<typename T>
-class TensorDescriptorNd {
+class TensorDescriptor {
     std::shared_ptr<miopenTensorDescriptor_t> desc_;
 
-    struct TensorDescriptorNdDeleter {
+    struct TensorDescriptorDeleter {
         void operator()(miopenTensorDescriptor_t * desc) {
             miopenDestroyTensorDescriptor(*desc);
             delete desc;
         }
     };
 
 public:
+    TensorDescriptor()
+    {
+        miopenTensorDescriptor_t * desc = new miopenTensorDescriptor_t;
+        CHECK_MIOPEN_ERROR(miopenCreateTensorDescriptor(desc));
+
+        desc_.reset(desc, TensorDescriptorDeleter());
+    }
 
-    TensorDescriptorNd(const std::vector<int>& dim,
-                       const std::vector<int>& stride) {
+    TensorDescriptor(std::vector<int> lens,
+                     std::vector<int> strides) {
         miopenDataType_t type;
         if (std::is_same<T, float>::value)
             type = miopenFloat;
+        else if (std::is_same<T, half_float::half>::value)
+            type = miopenHalf;
         else
             throw std::runtime_error("Unknown type");
 
         miopenTensorDescriptor_t * desc = new miopenTensorDescriptor_t;
-
         CHECK_MIOPEN_ERROR(miopenCreateTensorDescriptor(desc));
-        CHECK_MIOPEN_ERROR(miopenSet4dTensorDescriptor(*desc, type, dim[0], dim[1], dim[2], dim[3]));
+        CHECK_MIOPEN_ERROR(miopenSetTensorDescriptor(*desc, type, static_cast<int>(lens.size()), &lens[0], &strides[0]));
 
-        desc_.reset(desc, TensorDescriptorNdDeleter());
+        desc_.reset(desc, TensorDescriptorDeleter());
     }
 
     miopenTensorDescriptor_t desc() const { return *desc_; }
 
 };
 
 template<typename T>
-class TensorDescriptorNdArray {
+class TensorDescriptorArray
+{
     std::shared_ptr<miopenTensorDescriptor_t> desc_array_;
 
-    struct ArrayDeleter {
+    struct ArrayDeleter 
+    {
         int num_;
         ArrayDeleter(int num) : num_(num) {}
 
@@ -92,23 +103,27 @@ class TensorDescriptorNdArray {
         }
     };
 
-    public:
+public:
 
-    TensorDescriptorNdArray(std::vector<int> dim,
-                            std::vector<int> stride,
-                            int num) {
+    TensorDescriptorArray(std::vector<int> lens,
+                            std::vector<int> strides,
+                            int num) 
+    {
         miopenDataType_t type;
         if (std::is_same<T, float>::value)
             type = miopenFloat;
+        else if (std::is_same<T, half_float::half>::value)
+            type = miopenHalf;
         else
             throw std::runtime_error("Unknown type");
 
         miopenTensorDescriptor_t * desc_array = new miopenTensorDescriptor_t[num];
 
-        for (int i = 0; i < num; ++i) {
+        for (int i = 0; i < num; ++i) 
+        {
             CHECK_MIOPEN_ERROR(miopenCreateTensorDescriptor(&desc_array[i]));
-            CHECK_MIOPEN_ERROR(miopenSet4dTensorDescriptor(desc_array[i], type,
-                                          dim[0], dim[1], dim[2], dim[3]));
+            CHECK_MIOPEN_ERROR(miopenSetTensorDescriptor(desc_array[i], type, lens.size(),
+                                          &lens[0], &strides[0]) );
         }
 
         desc_array_.reset(desc_array, ArrayDeleter(num));
@@ -117,38 +132,6 @@ class TensorDescriptorNdArray {
     miopenTensorDescriptor_t * ptr() const { return desc_array_.get(); }
 };
 
-template<typename T>
-class FilterDescriptorNd {
-    std::shared_ptr<miopenTensorDescriptor_t> desc_;
-
-    struct FilterDescriptorNdDeleter {
-        void operator()(miopenTensorDescriptor_t * desc) {
-            miopenDestroyTensorDescriptor(*desc);
-            delete desc;
-        }
-    };
-
-public:
-
-    FilterDescriptorNd() {}
-
-    FilterDescriptorNd(const std::vector<int> dim) {
-        miopenDataType_t type;
-        if (std::is_same<T, float>::value)
-            type = miopenFloat;
-        else
-            throw std::runtime_error("Unknown type");
-
-        miopenTensorDescriptor_t * desc = new miopenTensorDescriptor_t;
-        CHECK_MIOPEN_ERROR(miopenCreateTensorDescriptor(desc));
-        CHECK_MIOPEN_ERROR(miopenSet4dTensorDescriptor(*desc, type, dim[0], dim[1], dim[2], dim[3]));
-
-        desc_.reset(desc, FilterDescriptorNdDeleter());
-    }
-
-    miopenTensorDescriptor_t desc() { return *desc_; }
-};
-
 template<typename T>
 class TensorDescriptor4d {
     std::shared_ptr<miopenTensorDescriptor_t> desc_;
@@ -167,6 +150,8 @@ class TensorDescriptor4d {
         miopenDataType_t type;
         if (std::is_same<T, float>::value)
             type = miopenFloat;
+        else if (std::is_same<T, half_float::half>::value)
+            type = miopenHalf;
         else
             throw std::runtime_error("Unknown type");
 
@@ -202,6 +187,8 @@ class FilterDescriptor4d {
         miopenDataType_t type;
         if (std::is_same<T, float>::value)
             type = miopenFloat;
+        else if (std::is_same<T, half_float::half>::value)
+            type = miopenHalf;
         else
             throw std::runtime_error("Unknown type");
 
@@ -246,3 +233,41 @@ class ConvolutionDescriptor {
 
 };
 
+class RNNDescriptor {
+    std::shared_ptr<miopenRNNDescriptor_t> desc_;
+
+    struct RNNDescriptorDeleter {
+        void operator()(miopenRNNDescriptor_t * desc) {
+            miopenDestroyRNNDescriptor(*desc);
+            delete desc;
+        }
+    };
+public:
+
+    RNNDescriptor() {}
+
+    RNNDescriptor(const int hsize,
+                  const int nlayers,
+                  miopenRNNInputMode_t inMode,
+                  miopenRNNDirectionMode_t direction,
+                  miopenRNNMode_t rnnMode,
+                  miopenRNNBiasMode_t biasMode,
+                  miopenRNNAlgo_t algo,
+                  miopenDataType_t dataType) :
+        desc_(new miopenRNNDescriptor_t, RNNDescriptorDeleter()) 
+    {
+        CHECK_MIOPEN_ERROR(miopenCreateRNNDescriptor(desc_.get()));
+        CHECK_MIOPEN_ERROR(miopenSetRNNDescriptor(*desc_,
+                                                  hsize,
+                                                  nlayers,
+                                                  inMode,
+                                                  direction,
+                                                  rnnMode,
+                                                  biasMode,
+                                                  algo,
+                                                  dataType));
+    }
+
+    miopenRNNDescriptor_t desc() const { return *desc_; };
+};
+