
[AUTOGENERATED] [release/2.4] Cherry-pick PR-1666 #1784

Closed · wants to merge 1 commit
72 changes: 72 additions & 0 deletions aten/src/ATen/native/Normalization.cpp
@@ -61,6 +61,7 @@
#include <c10/core/SymIntArrayRef.h>
#include <utility>
#include <vector>
#include <iostream>

static const int MIOPEN_DIM_MAX = 5;

@@ -484,6 +485,7 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
return std::make_tuple(grad_input, grad_weight, grad_bias);
}

<<<<<<< HEAD
BatchNormBackend _select_batch_norm_backend(
const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean,
const Tensor& running_var, bool training, double eps) {
@@ -531,6 +533,9 @@ BatchNormBackend _select_batch_norm_backend(
return BatchNormBackend::Native;
}

=======
bool PYTORCH_MIOPEN_EXTRA_LOGGING = c10::utils::check_env("PYTORCH_MIOPEN_EXTRA_LOGGING").value_or(false);
>>>>>>> 7870ca9495 ([release/2.3] Enable bf16 with fp32 weights for MIOpen batchnorm (#1666))

// _batch_norm_impl_index(_backward) are used in the JIT to be able to keep the run-time selection
// of backends, while enabling it to keep the information about the used backend, so that it can
@@ -541,6 +546,20 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
const Tensor& input, const std::optional<Tensor>& weight_opt /* optional */, const std::optional<Tensor>& bias_opt /* optional */, const std::optional<Tensor>& running_mean_opt /* optional */, const std::optional<Tensor>& running_var_opt /* optional */,
bool training, double momentum, double eps, bool cudnn_enabled) {
// See [Note: hacky wrapper removal for optional tensor]
if (PYTORCH_MIOPEN_EXTRA_LOGGING)
std::cout
<< "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index"
<< " input=" << input.scalar_type()
<< " weight=" << (weight_opt.has_value() ? weight_opt.value().scalar_type() : at::ScalarType::Undefined)
<< " bias=" << (bias_opt.has_value() ? bias_opt.value().scalar_type() : at::ScalarType::Undefined)
<< " running_mean=" << (running_mean_opt.has_value() ? running_mean_opt.value().scalar_type() : at::ScalarType::Undefined)
<< " running_var=" << (running_var_opt.has_value() ? running_var_opt.value().scalar_type() : at::ScalarType::Undefined)
<< " training=" << training
// << " momentum=" << momentum
// << " eps=" << eps
<< " cudnn_enabled=" << cudnn_enabled
<< std::endl;

c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();});
@@ -600,7 +619,40 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(

Tensor reserve = at::empty({0}, input.options().dtype(kByte));

<<<<<<< HEAD
if (backend == BatchNormBackend::Miopen) {
=======
bool use_miopen = (input.is_cuda()
&& input.dim() <= MIOPEN_DIM_MAX
&& input.scalar_type() != at::kDouble
&& (weight.scalar_type() != at::kHalf)
&& (weight.scalar_type() != at::kBFloat16)
&& weight.defined() && bias.defined()
&& ((running_mean.defined() && running_var.defined())
|| (!running_mean.defined() && !running_var.defined() && training))
&& detail::getCUDAHooks().compiledWithMIOpen()
&& cudnn_enabled
);

if (PYTORCH_MIOPEN_EXTRA_LOGGING)
std::cout
<< "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (use_miopen)"
<< " use_miopen=" << use_miopen
<< " cudnn_enabled=" << cudnn_enabled
<< " dim=" << input.dim()
<< " memory_format=" << input.suggest_memory_format()
<< " input.dtype=" << input.scalar_type()
<< " weight.dtype=" << (weight.defined()?"+":"-") << weight.scalar_type()
<< " bias.dtype=" << (bias.defined()?"+":"-") << bias.scalar_type()
<< " running_mean.dtype=" << (running_mean.defined()?"+":"-") << running_mean.scalar_type()
<< " running_var.dtype=" << (running_mean.defined()?"+":"-") << running_mean.scalar_type()
<< " training=" << training
<< std::endl;

if (use_miopen && input.suggest_memory_format() != MemoryFormat::ChannelsLast && input.suggest_memory_format() != MemoryFormat::ChannelsLast3d) {
if (PYTORCH_MIOPEN_EXTRA_LOGGING)
std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (calling miopen_batch_norm)" << std::endl;
>>>>>>> 7870ca9495 ([release/2.3] Enable bf16 with fp32 weights for MIOpen batchnorm (#1666))
return std::tuple_cat(
at::miopen_batch_norm(
input.contiguous(), weight.contiguous(), bias.contiguous(),
@@ -623,6 +675,8 @@ std::tuple<Tensor, Tensor, Tensor> _batch_norm_impl_index_backward(
const Tensor& input, const Tensor& grad_output, const std::optional<Tensor>& weight_opt /* optional */, const std::optional<Tensor>& running_mean_opt /* optional */, const std::optional<Tensor>& running_var_opt /* optional */, const std::optional<Tensor>& save_mean_opt /* optional */, const std::optional<Tensor>& save_var_transform_opt /* optional */,
bool train, double epsilon, std::array<bool, 3> output_mask, const Tensor &reservedSpace) {
// See [Note: hacky wrapper removal for optional tensor]
if (PYTORCH_MIOPEN_EXTRA_LOGGING)
std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward" << std::endl;
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();});
@@ -653,12 +707,16 @@ std::tuple<Tensor, Tensor, Tensor> _batch_norm_impl_index_backward(

// backward in inference mode is not supported in cudnn, fallback to native
if (impl_index == 0 || (!train)) {
if (PYTORCH_MIOPEN_EXTRA_LOGGING)
std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward (calling native_batch_norm_backward)" << std::endl;
return at::native_batch_norm_backward(grad_output, input, weight, running_mean, running_var, save_mean, save_var_transform, train, epsilon, output_mask);
} else if (impl_index == 1) {
// TODO: _batch_norm_impl_index_backward is only used in JIT. cudnn NHWC
// format conversion is done inside cudnn_batch_norm_backward instead
return at::cudnn_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var_transform, epsilon, reservedSpace);
} else if (impl_index == 2) {
if (PYTORCH_MIOPEN_EXTRA_LOGGING)
std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward (calling miopen_batch_norm_backward)" << std::endl;
return at::miopen_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var_transform, epsilon);
}
TORCH_INTERNAL_ASSERT(false, "Unsupported impl_index in _batch_norm_impl_index_backward: ", impl_index);
@@ -669,6 +727,20 @@ Tensor batch_norm(
const Tensor& input, const std::optional<Tensor>& weight_opt, const std::optional<Tensor>& bias_opt,
const std::optional<Tensor>& running_mean_opt, const std::optional<Tensor>& running_var_opt,
bool training, double momentum, double eps, bool cudnn_enabled) {
if (PYTORCH_MIOPEN_EXTRA_LOGGING)
std::cout
<< "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* batch_norm"
<< " input=" << input.scalar_type()
<< " weight=" << (weight_opt.has_value() ? weight_opt.value().scalar_type() : at::ScalarType::Undefined)
<< " bias=" << (bias_opt.has_value() ? bias_opt.value().scalar_type() : at::ScalarType::Undefined)
<< " running_mean=" << (running_mean_opt.has_value() ? running_mean_opt.value().scalar_type() : at::ScalarType::Undefined)
<< " running_var=" << (running_var_opt.has_value() ? running_var_opt.value().scalar_type() : at::ScalarType::Undefined)
<< " training=" << training
// << " momentum=" << momentum
// << " eps=" << eps
<< " cudnn_enabled=" << cudnn_enabled
<< std::endl;

const Tensor& weight = c10::value_or_else(weight_opt, [] {return Tensor();});
const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();});
const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();});
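Note on the logging added in this file: every new message is gated behind the PYTORCH_MIOPEN_EXTRA_LOGGING environment variable, read once through c10::utils::check_env. Below is a minimal, self-contained sketch of the same pattern, assuming a c10 build where check_env returns std::optional<bool> as in the diff above; the header path and the helper name demo_log are illustrative, not part of the PR.

#include <c10/util/env.h>
#include <iostream>

// Read PYTORCH_MIOPEN_EXTRA_LOGGING once at static-initialization time;
// check_env returns std::optional<bool>, so an unset variable maps to false.
static const bool kMiopenExtraLogging =
    c10::utils::check_env("PYTORCH_MIOPEN_EXTRA_LOGGING").value_or(false);

// Print only when the variable is set in the environment.
void demo_log(bool use_miopen, bool cudnn_enabled) {
  if (kMiopenExtraLogging) {
    std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: use_miopen=" << use_miopen
              << " cudnn_enabled=" << cudnn_enabled << std::endl;
  }
}

With this gate the extra output only appears when the variable is set before launch, e.g. PYTORCH_MIOPEN_EXTRA_LOGGING=1; otherwise the added statements are skipped.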
4 changes: 2 additions & 2 deletions aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
@@ -79,7 +79,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
checkAllDefined(c, {running_mean, running_var});
}
checkAllSameGPU(c, {input, weight, bias, running_mean, running_var});
if (input->scalar_type() != ScalarType::Half) {
if (input->scalar_type() != ScalarType::Half && input->scalar_type() != ScalarType::BFloat16) {
checkAllSameType(c, {input, weight});
}
checkAllSameType(c, {weight, bias, running_mean, running_var});
@@ -186,7 +186,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(

checkAllDefined(c, {input, grad_output, weight, save_mean, save_var});
checkAllSameGPU(c, {input, grad_output, weight, save_mean, save_var});
if (input->scalar_type() == ScalarType::Half) {
if (input->scalar_type() == ScalarType::Half || input->scalar_type() == ScalarType::BFloat16) {
checkScalarType(c, weight, ScalarType::Float);
} else {
checkAllSameType(c, {input, weight});
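The two relaxed checks above carry the substance of the cherry-picked change: BFloat16 activations with Float (fp32) weight, bias, and running statistics are now treated like the existing Half/fp32 mixed-precision path in miopen_batch_norm and its backward. A hedged usage sketch, assuming a ROCm build compiled with MIOpen and a visible GPU; this program is illustrative and not part of the diff.

#include <ATen/ATen.h>

int main() {
  // BFloat16 activations with fp32 scale/shift and running stats,
  // mirroring the Half/fp32 combination that was already accepted.
  auto input = at::randn({2, 3, 8, 8},
                         at::device(at::kCUDA).dtype(at::kBFloat16));
  auto weight = at::ones({3}, at::device(at::kCUDA).dtype(at::kFloat));
  auto bias = at::zeros({3}, at::device(at::kCUDA).dtype(at::kFloat));
  auto running_mean = at::zeros({3}, at::device(at::kCUDA).dtype(at::kFloat));
  auto running_var = at::ones({3}, at::device(at::kCUDA).dtype(at::kFloat));

  // With the relaxed dtype checks, the MIOpen batch-norm path accepts this
  // bf16/fp32 mix instead of failing checkAllSameType on input vs. weight.
  auto out = at::batch_norm(input, weight, bias, running_mean, running_var,
                            /*training=*/true, /*momentum=*/0.1, /*eps=*/1e-5,
                            /*cudnn_enabled=*/true);
  return 0;
}

On such a build, _batch_norm_impl_index can route this combination to at::miopen_batch_norm rather than rejecting it during argument checking.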