diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ad9f74f29..e850327df4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -309,11 +309,12 @@ if( MIOPEN_BACKEND STREQUAL "HIP" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN_B endif() set(MIOPEN_BACKEND_HIP 1) - find_program(HIP_OC_COMPILER amdclang + find_program(HIP_OC_COMPILER NAMES amdclang clang PATH_SUFFIXES bin PATHS /opt/rocm ${CMAKE_INSTALL_PREFIX} + ENV HIP_PATH ) if(HIP_OC_COMPILER) message(STATUS "OpenCL compiler: ${HIP_OC_COMPILER}") diff --git a/LICENSE.txt b/LICENSE.txt index f46cfc569f..157ccd1ec2 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -19,3 +19,96 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +------------------------------------------------------------------------------ + +The following files + - src/include/miopen/kernel_cache.hpp + - src/kernel_cache.cpp + +are licensed using the MIT license described at the top of this file in +addition to an Apache-2.0 license using the following text: + + +Copyright 2015 Vratis, Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ + +------------------------------------------------------------------------------ + +driver/mloSoftmaxHost.hpp is available under a BSD-2-Clause license + +src/include/miopen/mlo_internal.hpp is licensed using the MIT described above +and a BSD-2-Clause license + +Both files use the following license text for their BSD license text: + + +Copyright (c)2017 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and +the following disclaimer. +Redistributions in binary form must reproduce the above copyright notice, this list of conditions +and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------ + +The file src/md5.cpp is derived from a public domain implementation. The +original license text is as follows: + +Author: +Alexander Peslyak, better known as Solar Designer + +This software was written by Alexander Peslyak in 2001. 
No copyright is +claimed, and the software is hereby placed in the public domain. +In case this attempt to disclaim copyright and place the software in the +public domain is deemed null and void, then the software is +Copyright (c) 2001 Alexander Peslyak and it is hereby released to the +general public under the following terms: + +Redistribution and use in source and binary forms, with or without +modification, are permitted. + +There's ABSOLUTELY NO WARRANTY, express or implied. + +(This is a heavily cut-down "BSD license".) + +This differs from Colin Plumb's older public domain implementation in that +no exactly 32-bit integer data type is required (any 32-bit or wider +unsigned integer data type will do), there's no compile-time endianness +configuration, and the function prototypes match OpenSSL's. No code from +Colin Plumb's implementation has been reused; this comment merely compares +the properties of the two independent implementations. + +The primary goals of this implementation are portability and ease of use. +It is meant to be fast, but not as fast as possible. Some known +optimizations are not included to reduce source code size and avoid +compile-time configuration. + diff --git a/docs/conceptual/perfdb.rst b/docs/conceptual/perfdb.rst index 96ce0d74ff..3ecb3c2a5a 100644 --- a/docs/conceptual/perfdb.rst +++ b/docs/conceptual/perfdb.rst @@ -29,7 +29,7 @@ found, they're stored in the User PerfDb. MIOpen then automatically reads and us values. By default, System PerfDb resides within MIOpen's install location, while User PerfDb resides in your -home directory. See :ref:` setting up locations ` for more information. +home directory. See :ref:`setting up locations ` for more information. System PerfDb is not modified during MIOpen installation. 
diff --git a/docs/how-to/debug-log.rst b/docs/how-to/debug-log.rst index cfd28873ce..56b75dbebb 100644 --- a/docs/how-to/debug-log.rst +++ b/docs/how-to/debug-log.rst @@ -288,7 +288,7 @@ To disable using rocBlas entirely, set the `-DMIOPEN_USE_ROCBLAS=Off` configura MIOpen configuration. You can find more information on logging with rocBLAS in the -:doc:`rocBLAS programmer guide `. +:doc:`rocBLAS programmer guide `. Numerical checking ========================================================== diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 02bcb88622..02496548e8 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -31,4 +31,5 @@ The MIOpen API library is structured as follows: * :doc:`Sum <../doxygen/html/group__sum>` (experimental) * :doc:`GroupNorm <../doxygen/html/group__groupnorm>` (experimental) * :doc:`Cat <../doxygen/html/group__cat>` (experimental) - * :doc:`Argmax<./argmax>` (experimental) + * :doc:`SGD <../doxygen/html/group___s_g_d>` (experimental) + * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 7dcbd7f75c..0deba0c9cc 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -59,7 +59,7 @@ importlib-resources==6.1.3 # via # mpire # rocm-docs-core -jinja2==3.1.3 +jinja2==3.1.4 # via # myst-parser # sphinx @@ -159,7 +159,7 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -tqdm==4.66.2 +tqdm==4.66.3 # via mpire typing-extensions==4.10.0 # via diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 224e550fed..8f19a90eb6 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -30,7 +30,8 @@ add_executable(MIOpenDriver InputFlags.cpp conv_common.cpp dm_activ.cpp - dm_argmax.cpp + dm_adam.cpp + dm_addlayernorm.cpp dm_bnorm.cpp dm_cat.cpp dm_conv.cpp @@ -47,9 +48,11 @@ add_executable(MIOpenDriver dm_lrn.cpp dm_pool.cpp 
dm_reduce.cpp + dm_reduceextreme.cpp dm_rnn.cpp dm_softmax.cpp dm_sum.cpp + dm_t5layernorm.cpp dm_tensorop.cpp main.cpp registry_driver_maker.cpp diff --git a/driver/adam_driver.hpp b/driver/adam_driver.hpp new file mode 100644 index 0000000000..6d54d6af0b --- /dev/null +++ b/driver/adam_driver.hpp @@ -0,0 +1,584 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_ADAM_DRIVER_HPP +#define GUARD_MIOPEN_ADAM_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include "../test/verify.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifndef MLO_ADAMHOST_H_ +#define MLO_ADAMHOST_H_ + +/* Host reference for a single Adam update evaluated at iteration `step`. params (and max_exp_avg_sqs when amsgrad is set) are updated in place; exp_avgs and exp_avg_sqs are only read, their updated values are not written back. Returns without modifying anything when is_amp && found_inf. */ +template +void mloAdamRunHost(miopenTensorDescriptor_t paramDesc, + Tref* params, + Tref* grads, + Tref* exp_avgs, + Tref* exp_avg_sqs, + Tref* max_exp_avg_sqs, + int32_t step, + float lr, + float beta1, + float beta2, + float weight_decay, + float eps, + bool amsgrad, + bool maximize, + bool is_amp, + int32_t grad_scale, + bool found_inf) +{ + if(is_amp && found_inf) + return; + + const size_t numel = miopen::deref(paramDesc).GetElementSize(); + + /* The bias corrections depend only on step, so they are computed once instead of per element. */ + const float bias_correction1 = 1 - pow(beta1, step); + const float bias_correction2 = 1 - pow(beta2, step); + + /* size_t index: numel is a size_t and may exceed the range of int. */ + for(size_t i = 0; i < numel; i++) + { + Tref exp_avg = exp_avgs[i]; + Tref exp_avg_sq = exp_avg_sqs[i]; + + Tref param = params[i]; + Tref grad = grads[i]; + if(maximize) + grad *= -1; + if(is_amp) + grad /= grad_scale; + + if(weight_decay != 0) + grad += param * weight_decay; + + exp_avg = exp_avg * beta1 + grad * (1 - beta1); + exp_avg_sq = exp_avg_sq * beta2 + grad * grad * (1 - beta2); + + float denom; + if(amsgrad) + { + /* AMSGrad: keep the running maximum of the second moment and normalise by it. */ + Tref max_exp_avg_sq = max_exp_avg_sqs[i]; + if(exp_avg_sq > max_exp_avg_sq) + { + max_exp_avg_sq = exp_avg_sq; + max_exp_avg_sqs[i] = max_exp_avg_sq; + } + + denom = sqrt(max_exp_avg_sq) / sqrt(bias_correction2) + eps; + } + else + { + denom = sqrt(exp_avg_sq) / sqrt(bias_correction2) + eps; + } + + params[i] = param - (lr / bias_correction1) * exp_avg / denom; + } +} + +#endif + +template +class AdamDriver : public Driver +{ +public: + AdamDriver() : Driver() + { + miopenCreateTensorDescriptor(&paramDesc); + miopenCreateTensorDescriptor(&gradDesc); +
miopenCreateTensorDescriptor(&expAvgDesc); + miopenCreateTensorDescriptor(&expAvgSqDesc); + miopenCreateTensorDescriptor(&paramOutDesc); + miopenCreateTensorDescriptor(&dummyOutDesc); + if(is_amp) + { + miopenCreateTensorDescriptor(&stepDesc); + miopenCreateTensorDescriptor(&gradScaleDesc); + miopenCreateTensorDescriptor(&foundInfDesc); + } + + data_type = miopen_type{}; + grad_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + std::vector GetInputTensorLengthsFromCmdLine(); + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + /* Destroys every tensor descriptor owned by the driver. The optional descriptors start as nullptr and are only destroyed when they were created; each one is guarded by its own handle. */ + ~AdamDriver() override + { + miopenDestroyTensorDescriptor(paramDesc); + miopenDestroyTensorDescriptor(gradDesc); + miopenDestroyTensorDescriptor(expAvgDesc); + miopenDestroyTensorDescriptor(expAvgSqDesc); + miopenDestroyTensorDescriptor(paramOutDesc); + miopenDestroyTensorDescriptor(dummyOutDesc); + if(maxExpAvgSqDesc) + miopenDestroyTensorDescriptor(maxExpAvgSqDesc); + if(stepDesc) + miopenDestroyTensorDescriptor(stepDesc); + if(gradScaleDesc) + miopenDestroyTensorDescriptor(gradScaleDesc); + if(foundInfDesc) + miopenDestroyTensorDescriptor(foundInfDesc); + } + +private: + InputFlags inflags; + + int forw = 1; + + miopenTensorDescriptor_t paramDesc = nullptr; + miopenTensorDescriptor_t gradDesc = nullptr; + miopenTensorDescriptor_t expAvgDesc = nullptr; + miopenTensorDescriptor_t expAvgSqDesc = nullptr; + miopenTensorDescriptor_t maxExpAvgSqDesc = nullptr; + miopenTensorDescriptor_t stepDesc = nullptr; + miopenTensorDescriptor_t gradScaleDesc = nullptr; + miopenTensorDescriptor_t foundInfDesc = nullptr; + miopenTensorDescriptor_t paramOutDesc = nullptr; + miopenTensorDescriptor_t dummyOutDesc = nullptr; + +
std::unique_ptr param_dev; + std::unique_ptr param_out_dev; + std::unique_ptr dummy_out_dev; + std::unique_ptr grad_dev; + std::unique_ptr exp_avg_dev; + std::unique_ptr exp_avg_sq_dev; + std::unique_ptr max_exp_avg_sq_dev; + std::unique_ptr step_dev; + std::unique_ptr scale_dev; + std::unique_ptr found_inf_dev; + + std::vector param; + std::vector grad; + std::vector exp_avg; + std::vector exp_avg_sq; + std::vector max_exp_avg_sq; + + std::vector param_host; + std::vector grad_host; + std::vector exp_avg_host; + std::vector exp_avg_sq_host; + std::vector max_exp_avg_sq_host; + + float lr; + float beta1; + float beta2; + float weight_decay; + float eps; + bool amsgrad = false; + bool maximize = false; + bool found_inf = false; + int grad_scale = 1; + int iter = 0; + + miopenDataType_t grad_type; +}; + +template +int AdamDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +int AdamDriver::GetandSetData() +{ + auto param_len = GetInputTensorLengthsFromCmdLine(); + lr = inflags.GetValueDouble("lr"); + beta1 = inflags.GetValueDouble("beta1"); + beta2 = inflags.GetValueDouble("beta2"); + eps = inflags.GetValueDouble("eps"); + weight_decay = inflags.GetValueDouble("weight_decay"); + amsgrad = inflags.GetValueInt("amsgrad"); + maximize = inflags.GetValueInt("maximize"); + iter = inflags.GetValueInt("iter"); + + if(is_amp) + { + grad_scale = inflags.GetValueInt("scale"); + found_inf = inflags.GetValueInt("found_inf"); + } + + std::vector one_size = {1}; + SetTensorNd(paramDesc, param_len, data_type); + SetTensorNd(paramOutDesc, param_len, data_type); + SetTensorNd(gradDesc, param_len, grad_type); + SetTensorNd(expAvgDesc, param_len, data_type); + SetTensorNd(expAvgSqDesc, param_len, data_type); + SetTensorNd(dummyOutDesc, param_len, data_type); + + if(amsgrad) + { + 
miopenCreateTensorDescriptor(&maxExpAvgSqDesc); + SetTensorNd(maxExpAvgSqDesc, param_len, data_type); + } + + if(is_amp) + { + SetTensorNd(stepDesc, one_size, miopenInt32); + SetTensorNd(gradScaleDesc, one_size, miopenInt32); + SetTensorNd(foundInfDesc, one_size, miopenInt32); + } + + return 0; +} + +/* Registers the Adam driver's command-line flags. The "scale" and "found_inf" flags are only registered when is_amp is set. */ +template +int AdamDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward Adam (Default=1)", "int"); + inflags.AddTensorFlag("dims", 'd', "64x32x128", "params tensor dims (Default=64x32x128)"); + + inflags.AddInputFlag("lr", 'l', "0.001", "learning rate (Default=0.001)", "float"); + inflags.AddInputFlag("beta1", '1', "0.9", "beta1 (Default=0.9)", "float"); + inflags.AddInputFlag("beta2", '2', "0.999", "beta2 (Default=0.999)", "float"); + inflags.AddInputFlag("eps", 'e', "0.00000001", "eps (Default=0.00000001)", "float"); + inflags.AddInputFlag("weight_decay", 'W', "0", "weight decay (Default=0)", "float"); + inflags.AddInputFlag("amsgrad", 'a', "0", "whether to use the AMSGrad (Default=0)", "int"); + inflags.AddInputFlag("maximize", 'm', "0", "whether to use the maximize (Default=0)", "int"); + + if(is_amp) + { + inflags.AddInputFlag("scale", 's', "65536", "grad scale factor (Default=65536)", "int"); + inflags.AddInputFlag("found_inf", 'f', "0", "found inf in grad (Default=0)", "int"); + } + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +/* Returns the lengths parsed from the "dims" flag, or an empty vector when none were given. */ +template +std::vector AdamDriver::GetInputTensorLengthsFromCmdLine() +{ + std::vector ret; + auto tensor = inflags.GetValueTensor("dims"); + if(!tensor.lengths.empty()) + return tensor.lengths; + return ret; +} + +template +int
AdamDriver::AllocateBuffersAndCopy() +{ + size_t param_sz = GetTensorSize(paramDesc); + + uint32_t ctx = 0; + param_dev = std::unique_ptr(new GPUMem(ctx, param_sz, sizeof(Tgpu))); + grad_dev = std::unique_ptr(new GPUMem(ctx, param_sz, sizeof(Tgrad))); + exp_avg_dev = std::unique_ptr(new GPUMem(ctx, param_sz, sizeof(Tgpu))); + exp_avg_sq_dev = std::unique_ptr(new GPUMem(ctx, param_sz, sizeof(Tgpu))); + param_out_dev = std::unique_ptr(new GPUMem(ctx, param_sz, sizeof(Tgpu))); + dummy_out_dev = std::unique_ptr(new GPUMem(ctx, param_sz, sizeof(Tgpu))); + + if(amsgrad) + max_exp_avg_sq_dev = std::unique_ptr(new GPUMem(ctx, param_sz, sizeof(Tgpu))); + + if(is_amp) + { + step_dev = std::unique_ptr(new GPUMem(ctx, 1, sizeof(int))); + scale_dev = std::unique_ptr(new GPUMem(ctx, 1, sizeof(int))); + found_inf_dev = std::unique_ptr(new GPUMem(ctx, 1, sizeof(bool))); + } + + param = std::vector(param_sz, static_cast(0)); + grad = std::vector(param_sz, static_cast(0)); + exp_avg = std::vector(param_sz, static_cast(0)); + exp_avg_sq = std::vector(param_sz, static_cast(0)); + + param_host = std::vector(param_sz, static_cast(0)); + grad_host = std::vector(param_sz, static_cast(0)); + exp_avg_host = std::vector(param_sz, static_cast(0)); + exp_avg_sq_host = std::vector(param_sz, static_cast(0)); + + if(amsgrad) + { + max_exp_avg_sq = std::vector(param_sz, static_cast(0)); + max_exp_avg_sq_host = std::vector(param_sz, static_cast(0)); + } + + for(int i = 0; i < param_sz; i++) + { + param[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + grad[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(0.1)); + exp_avg[i] = prng::gen_A_to_B(static_cast(0), static_cast(0.1)); + exp_avg_sq[i] = prng::gen_A_to_B(static_cast(0), static_cast(0.1)); + param_host[i] = param[i]; + exp_avg_host[i] = exp_avg[i]; + exp_avg_sq_host[i] = exp_avg_sq[i]; + + if(amsgrad) + { + max_exp_avg_sq[i] = + prng::gen_A_to_B(static_cast(0.5), static_cast(1.0)); + max_exp_avg_sq_host[i] = max_exp_avg_sq[i]; + 
} + + if(is_amp) + { + grad[i] *= grad_scale; + if(!found_inf && (std::isnan(grad[i]) || std::isinf(grad[i]))) + { + std::cerr << "Error init (grad), idx: " << i << ", value: " << grad[i] << std::endl; + found_inf = true; + } + } + grad_host[i] = grad[i]; + } + + if(param_dev->ToGPU(GetStream(), param.data()) != 0) + std::cerr << "Error copying (param) to GPU, size: " << param_dev->GetSize() << std::endl; + + if(grad_dev->ToGPU(GetStream(), grad.data()) != 0) + std::cerr << "Error copying (grad) to GPU, size: " << grad_dev->GetSize() << std::endl; + + if(exp_avg_dev->ToGPU(GetStream(), exp_avg.data()) != 0) + std::cerr << "Error copying (exp_avg) to GPU, size: " << exp_avg_dev->GetSize() + << std::endl; + + if(exp_avg_sq_dev->ToGPU(GetStream(), exp_avg_sq.data()) != 0) + std::cerr << "Error copying (exp_avg_sq) to GPU, size: " << exp_avg_sq_dev->GetSize() + << std::endl; + + if(amsgrad) + { + if(max_exp_avg_sq_dev->ToGPU(GetStream(), max_exp_avg_sq.data()) != 0) + std::cerr << "Error copying (max_exp_avg_sq) to GPU, size: " + << max_exp_avg_sq_dev->GetSize() << std::endl; + } + + if(is_amp) + { + int step = 0; + if(step_dev->ToGPU(GetStream(), &step) != 0) + std::cerr << "Error copying (step) to GPU, size: " << step_dev->GetSize() << std::endl; + + if(scale_dev->ToGPU(GetStream(), &grad_scale) != 0) + std::cerr << "Error copying (scale) to GPU, size: " << scale_dev->GetSize() + << std::endl; + if(found_inf_dev->ToGPU(GetStream(), &found_inf) != 0) + std::cerr << "Error copying (found_inf) to GPU, size: " << found_inf_dev->GetSize() + << std::endl; + } + + return miopenStatusSuccess; +} + +template +int AdamDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + void* max_exp_avg_sq_ptr = amsgrad ? max_exp_avg_sq_dev->GetMem() : nullptr; + void* grad_scale_ptr = is_amp ? scale_dev->GetMem() : nullptr; + void* found_inf_ptr = is_amp ? found_inf_dev->GetMem() : nullptr; + void* state_step_ptr = is_amp ? 
step_dev->GetMem() : nullptr; + + Timer t; + START_TIME + + for(int i = 0; i < iter; i++) + { + miopenFusedAdamWithOutput(GetHandle(), + paramDesc, + param_dev->GetMem(), + paramOutDesc, + param_out_dev->GetMem(), + nullptr, + nullptr, + gradDesc, + grad_dev->GetMem(), + expAvgDesc, + exp_avg_dev->GetMem(), + dummyOutDesc, + dummy_out_dev->GetMem(), + expAvgSqDesc, + exp_avg_sq_dev->GetMem(), + dummyOutDesc, + dummy_out_dev->GetMem(), + maxExpAvgSqDesc, + max_exp_avg_sq_ptr, + dummyOutDesc, + dummy_out_dev->GetMem(), + stepDesc, + state_step_ptr, + stepDesc, + state_step_ptr, + i + 1, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + false, // adamw + gradScaleDesc, + grad_scale_ptr, + foundInfDesc, + found_inf_ptr); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + if(WALL_CLOCK) + printf("Wall-clock Time Forward Adam Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward Adam Elapsed: %f ms\n", kernel_average_time); + } + + if(param_out_dev->FromGPU(GetStream(), param.data()) != 0) + std::cerr << "Error copying (param_dev) from GPU, size: " << param_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int AdamDriver::RunForwardCPU() +{ + mloAdamRunHost(paramDesc, + param_host.data(), + grad_host.data(), + exp_avg_host.data(), + exp_avg_sq_host.data(), + max_exp_avg_sq_host.data(), + iter, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + is_amp, + grad_scale, + found_inf); + + return miopenStatusSuccess; +} + +template +int AdamDriver::RunBackwardGPU() +{ + return miopenStatusSuccess; +} + +template +Tref AdamDriver::GetTolerance() +{ + if(data_type == miopenHalf) + { + return 1e-3; + } + else if(data_type == miopenFloat) + { + return 5e-5; + } + else if(data_type == miopenDouble) + { + return 1e-10; + } + else if(data_type == miopenBFloat16) + { + return 5e-3; + } + return 0; +} + +template +int AdamDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(param_host, param); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward Adam FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + + std::cout << "Forward Adam Verifies OK on CPU reference" << std::endl; + + return miopenStatusSuccess; +} + +template +int AdamDriver::VerifyBackward() +{ + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_ADAM_DRIVER_HPP diff --git a/driver/addlayernorm_driver.hpp b/driver/addlayernorm_driver.hpp new file mode 100644 index 0000000000..4741d2d820 --- /dev/null +++ b/driver/addlayernorm_driver.hpp @@ -0,0 +1,511 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_ADDLAYERNORM_DRIVER_HPP +#define GUARD_MIOPEN_ADDLAYERNORM_DRIVER_HPP + +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> +#include "InputFlags.hpp" +#include "driver.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" +#include +#include +#include +#include +#include +#include +#include + +template +int32_t mloAddLayerNormForwardRunHost(miopenTensorDescriptor_t inputDesc, + Tgpu* input, + Tgpu* input2, + Tgpu* weight, + Tgpu* bias, + Tcheck* outputhost, + Tcheck* meanhost, + Tcheck* rstdhost, + float eps, + int32_t normalized_dim, + miopenNormMode_t mode) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + size_t norm_dim = static_cast(normalized_dim); + + for(size_t i = 0ULL; i < dims.size(); ++i) + { + if(i < norm_dim) + outer_size *= dims[i]; + else + inner_size *= dims[i]; + } + + int32_t ret = 0; + + for(int32_t o = 0; o < outer_size; o++) + { + Tcheck pmean = 0.0f; + Tcheck pvar = 0.0f; + for(int32_t i = 0; i < inner_size; i++) + { + Tcheck tmp = static_cast(input[o * inner_size + i]) + + static_cast(input2[o * inner_size + i]); + pmean += tmp; + pvar += tmp * tmp; + } + + pmean = pmean / inner_size; + pvar = pvar / inner_size - pmean * pmean; + Tcheck prstd = 1.0f / sqrt(pvar + eps); + + meanhost[o] = pmean; + rstdhost[o] = prstd; + + for(int32_t i = 0; i < inner_size; i++) + { + Tcheck pweight = + (mode == MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD) ? 1 : static_cast(weight[i]); + Tcheck pbias = + (mode == MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD) ? 
0 : static_cast(bias[i]); + outputhost[o * inner_size + i] = + (static_cast(input[o * inner_size + i]) + + static_cast(input2[o * inner_size + i]) - pmean) * + prstd * pweight + + pbias; + } + } + return ret; +} + +template +class AddLayerNormDriver : public Driver +{ +public: + AddLayerNormDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&input2Desc); + miopenCreateTensorDescriptor(&weightDesc); + miopenCreateTensorDescriptor(&biasDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&meanDesc); + miopenCreateTensorDescriptor(&rstdDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + std::vector GetInputTensorLengthsFromCmdLine(); + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~AddLayerNormDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(input2Desc); + miopenDestroyTensorDescriptor(weightDesc); + miopenDestroyTensorDescriptor(biasDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(meanDesc); + miopenDestroyTensorDescriptor(rstdDesc); + } + +private: + InputFlags inflags; + + int forw; + int dim_size; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t input2Desc; + miopenTensorDescriptor_t weightDesc; + miopenTensorDescriptor_t biasDesc; + miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t meanDesc; + miopenTensorDescriptor_t rstdDesc; + + std::unique_ptr in_dev; + std::unique_ptr in2_dev; + std::unique_ptr weight_dev; + std::unique_ptr bias_dev; + std::unique_ptr out_dev; + std::unique_ptr mean_dev; + std::unique_ptr rstd_dev; + + 
std::vector in; + std::vector in2; + std::vector weight; + std::vector bias; + std::vector out; + std::vector mean; + std::vector rstd; + std::vector outhost; + std::vector meanhost; + std::vector rstdhost; + + float eps; + int dim; + miopenNormMode_t mode; +}; + +template +int AddLayerNormDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +int AddLayerNormDriver::GetandSetData() +{ + auto inTensorParam = inflags.GetValueTensor("input"); + + auto in_len = inTensorParam.lengths; + + dim = inflags.GetValueInt("normalized_dim"); + + MIOPEN_THROW_IF(dim < 0 || static_cast(dim) >= in_len.size(), + "normalized_dim out of range"); + + std::vector inner_len; + if(dim == in_len.size()) + inner_len = {1}; + else + inner_len = {in_len.begin() + dim, in_len.end()}; + + std::vector outer_len; + if(dim == 0) + outer_len = {1}; + else + outer_len = {in_len.begin(), in_len.end() - (in_len.size() - dim)}; + + if(SetTensorNd(inputDesc, in_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input") + "."); + + if(SetTensorNd(input2Desc, in_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input2 tensor: " + inflags.GetValueStr("input") + "."); + + if(SetTensorNd(weightDesc, inner_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting weight tensor."); + + if(SetTensorNd(biasDesc, inner_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting bias tensor."); + + if(SetTensorNd(outputDesc, in_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting doutput tensor."); + + if(SetTensorNd(meanDesc, outer_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting mean tensor."); + + if(SetTensorNd(rstdDesc, outer_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting rstd tensor."); 
+ + eps = static_cast(inflags.GetValueDouble("eps")); + mode = miopenNormMode_t(inflags.GetValueInt("mode")); + + return 0; +} + +/* Registers the AddLayerNorm driver's command-line flags. The "eps" value is read back into the eps member and "normalized_dim" into dim by GetandSetData. */ +template +int AddLayerNormDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AddLayerNorm (Default=1)", "int"); + inflags.AddTensorFlag("input", 'X', "100x3x32x32", "input tensor descriptor"); + + inflags.AddInputFlag("eps", 'e', "0.00001", "Epsilon (Default=0.00001)", "double"); + inflags.AddInputFlag("normalized_dim", 'o', "3", "Normalized Dim (Default=3)", "int"); + inflags.AddInputFlag( + "mode", 'm', "0", "elemwise affine mode (0), weight and bias mode (1) (Default=0)", "int"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int AddLayerNormDriver::AllocateBuffersAndCopy() +{ + const Tgpu Tgpu0val = static_cast(0.0); + const Tgpu Tgpu1val = static_cast(1.0); + const Tref Tref0val = static_cast(0.0); + size_t in_sz = GetTensorSize(inputDesc); + size_t in2_sz = GetTensorSize(input2Desc); + size_t weight_sz = GetTensorSize(weightDesc); + size_t bias_sz = GetTensorSize(biasDesc); + size_t out_sz = GetTensorSize(outputDesc); + size_t mean_sz = GetTensorSize(meanDesc); + size_t rstd_sz = GetTensorSize(rstdDesc); + + uint32_t ctx = 0; + + in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + in2_dev = std::unique_ptr(new GPUMem(ctx, in2_sz, sizeof(Tgpu))); + weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); + bias_dev = std::unique_ptr(new GPUMem(ctx, bias_sz, sizeof(Tgpu))); + out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tref))); +
rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tref))); + + in = std::vector(in_sz, Tgpu0val); + in2 = std::vector(in2_sz, Tgpu0val); + weight = std::vector(weight_sz, Tgpu0val); + bias = std::vector(bias_sz, Tgpu0val); + out = std::vector(out_sz, Tgpu0val); + mean = std::vector(mean_sz, Tref0val); + rstd = std::vector(rstd_sz, Tref0val); + outhost = std::vector(out_sz, Tref0val); + meanhost = std::vector(mean_sz, Tref0val); + rstdhost = std::vector(rstd_sz, Tref0val); + + for(int i = 0; i < in_sz; i++) + { + in[i] = prng::gen_A_to_B(Tgpu0val, Tgpu1val); + } + + for(int i = 0; i < in2_sz; i++) + { + in2[i] = prng::gen_A_to_B(Tgpu0val, Tgpu1val); + } + + if(in_dev->ToGPU(GetStream(), in.data()) != 0) + std::cerr << "Error copying (in) to GPU, size: " << in_dev->GetSize() << std::endl; + if(in2_dev->ToGPU(GetStream(), in2.data()) != 0) + std::cerr << "Error copying (in2) to GPU, size: " << in2_dev->GetSize() << std::endl; + + for(int i = 0; i < weight_sz; i++) + { + if(mode == MIOPEN_ELEMENTWISE_AFFINE) + weight[i] = Tgpu1val; + else + weight[i] = prng::gen_A_to_B(Tgpu0val, Tgpu1val); + } + + if(weight_dev->ToGPU(GetStream(), weight.data()) != 0) + std::cerr << "Error copying (weight) to GPU, size: " << weight_dev->GetSize() << std::endl; + + for(int i = 0; i < bias_sz; i++) + { + if(mode == MIOPEN_ELEMENTWISE_AFFINE) + bias[i] = Tgpu0val; + else + bias[i] = prng::gen_A_to_B(Tgpu0val, Tgpu1val); + } + if(bias_dev->ToGPU(GetStream(), bias.data()) != 0) + std::cerr << "Error copying (bias) to GPU, size: " << bias_dev->GetSize() << std::endl; + + if(out_dev->ToGPU(GetStream(), out.data()) != 0) + std::cerr << "Error copying (out) to GPU, size: " << out_dev->GetSize() << std::endl; + + if(mean_dev->ToGPU(GetStream(), mean.data()) != 0) + std::cerr << "Error copying (mean) to GPU, size: " << mean_dev->GetSize() << std::endl; + + if(rstd_dev->ToGPU(GetStream(), rstd.data()) != 0) + std::cerr << "Error copying (rstd) to GPU, size: " << rstd_dev->GetSize() << 
std::endl; + + return miopenStatusSuccess; +} + +template +int AddLayerNormDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAddLayerNormForward(GetHandle(), + mode, + inputDesc, + in_dev->GetMem(), + input2Desc, + in2_dev->GetMem(), + weightDesc, + weight_dev->GetMem(), + biasDesc, + bias_dev->GetMem(), + eps, + dim, + outputDesc, + out_dev->GetMem(), + meanDesc, + mean_dev->GetMem(), + rstdDesc, + rstd_dev->GetMem()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Forward AddLayerNorm Elapsed: " << t.gettime_ms() / iter + << " ms\n"; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Forward AddLayerNorm Elapsed: " << kernel_average_time + << " ms\n"; + } + + if(out_dev->FromGPU(GetStream(), out.data()) != 0) + std::cerr << "Error copying (out_dev) from GPU, size: " << out_dev->GetSize() << std::endl; + + if(mean_dev->FromGPU(GetStream(), mean.data()) != 0) + std::cerr << "Error copying (mean_dev) from GPU, size: " << mean_dev->GetSize() + << std::endl; + + if(rstd_dev->FromGPU(GetStream(), rstd.data()) != 0) + std::cerr << "Error copying (rstd_dev) from GPU, size: " << rstd_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int AddLayerNormDriver::RunForwardCPU() +{ + mloAddLayerNormForwardRunHost(inputDesc, + in.data(), + in2.data(), + weight.data(), + bias.data(), + outhost.data(), + meanhost.data(), + rstdhost.data(), + eps, + dim, + mode); + + return miopenStatusSuccess; +} + +template +int AddLayerNormDriver::RunBackwardGPU() +{ + return miopenStatusSuccess; +} 
+ +template +Tref AddLayerNormDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int AddLayerNormDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(outhost, out); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward AddLayerNorm FAILED: " << error << " > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward AddLayerNorm Verifies OK on CPU reference (" << error << " < " + << tolerance << ')' << std::endl; + } + + auto meanerror = miopen::rms_range(meanhost, mean); + if(!std::isfinite(meanerror) || meanerror > tolerance) + { + std::cout << "Forward AddLayerNorm mean FAILED: " << meanerror << " > " << tolerance + << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward AddLayerNorm mean Verifies OK on CPU reference (" << meanerror + << " < " << tolerance << ')' << std::endl; + } + + auto rstderror = miopen::rms_range(rstdhost, rstd); + if(!std::isfinite(rstderror) || rstderror > tolerance) + { + std::cout << "Forward AddLayerNorm rstd FAILED: " << rstderror << " > " << tolerance + << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward AddLayerNorm rstd Verifies OK on CPU reference (" << rstderror + << " < " << tolerance << ')' << std::endl; + } + + return miopenStatusSuccess; +} + +template +int AddLayerNormDriver::VerifyBackward() +{ + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_ADDLAYERNORM_DRIVER_HPP diff --git a/driver/dm_adam.cpp b/driver/dm_adam.cpp new file mode 100644 index 0000000000..6b25266613 --- /dev/null +++ b/driver/dm_adam.cpp @@ -0,0 +1,40 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "adam_driver.hpp" +#include "registry_driver_maker.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "adam") + return new AdamDriver(); + else if(base_arg == "adamfp16") + return new AdamDriver(); + else if(base_arg == "ampadam") + return new AdamDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/dm_addlayernorm.cpp b/driver/dm_addlayernorm.cpp new file mode 100644 index 0000000000..af8046dde5 --- /dev/null +++ b/driver/dm_addlayernorm.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "addlayernorm_driver.hpp" +#include "registry_driver_maker.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "addlayernorm") + return new AddLayerNormDriver(); + if(base_arg == "addlayernormfp16") + return new AddLayerNormDriver(); + if(base_arg == "addlayernormbfp16") + return new AddLayerNormDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/dm_reduceextreme.cpp b/driver/dm_reduceextreme.cpp new file mode 100644 index 0000000000..2a94a5b69d --- /dev/null +++ b/driver/dm_reduceextreme.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "reduceextreme_driver.hpp" +#include "registry_driver_maker.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "reduceextreme") + return new ReduceExtremeDriver(); + if(base_arg == "reduceextremefp16") + return new ReduceExtremeDriver(); + if(base_arg == "reduceextremebfp16") + return new ReduceExtremeDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/dm_t5layernorm.cpp b/driver/dm_t5layernorm.cpp new file mode 100644 index 0000000000..7fded668fd --- /dev/null +++ b/driver/dm_t5layernorm.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "t5layernorm_driver.hpp" +#include "registry_driver_maker.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "t5layernorm") + return new T5LayerNormDriver(); + if(base_arg == "t5layernormfp16") + return new T5LayerNormDriver(); + if(base_arg == "t5layernormbfp16") + return new T5LayerNormDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index 58916fa02e..5bb0a29042 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -173,7 +173,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16]\n"); + "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " + "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -196,9 +197,12 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" && arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && arg != "layernorm" && arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && - arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" && - arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" && - arg != "catfp16" && arg != "catbfp16" && arg != "--version") + arg != "sumbfp16" && arg != "groupnorm" && arg != "groupnormfp16" && + arg != "groupnormbfp16" && arg != "cat" && arg != "catfp16" && arg != "catbfp16" && + arg != "addlayernorm" && arg != "addlayernormfp16" && arg != "addlayernormbfp16" && + arg != 
"t5layernorm" && arg != "t5layernormfp16" && arg != "t5layernormbfp16" && + arg != "adam" && arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && + arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/layernorm_driver.hpp b/driver/layernorm_driver.hpp index 59b19f3029..ea5b841c08 100644 --- a/driver/layernorm_driver.hpp +++ b/driver/layernorm_driver.hpp @@ -56,10 +56,11 @@ int32_t mloLayerNormForwardRunHost(miopenTensorDescriptor_t inputDesc, auto dims = miopen::deref(inputDesc).GetLengths(); size_t outer_size = 1; size_t inner_size = 1; + size_t norm_dim = static_cast(normalized_dim); for(size_t i = 0ULL; i < dims.size(); ++i) { - if(i < normalized_dim) + if(i < norm_dim) outer_size *= dims[i]; else inner_size *= dims[i]; @@ -87,8 +88,9 @@ int32_t mloLayerNormForwardRunHost(miopenTensorDescriptor_t inputDesc, for(int32_t i = 0; i < inner_size; i++) { - Tcheck pweight = mode ? static_cast(weight[i]) : 1; - Tcheck pbias = mode ? static_cast(bias[i]) : 0; + Tcheck pweight = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1 : static_cast(weight[i]); + Tcheck pbias = (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 
0 : static_cast(bias[i]); outputhost[o * inner_size + i] = (static_cast(input[o * inner_size + i]) - pmean) * prstd * pweight + pbias; } @@ -190,10 +192,15 @@ int LayerNormDriver::ParseCmdLineArgs(int argc, char* argv[]) template int LayerNormDriver::GetandSetData() { - std::vector in_len = GetInputTensorLengthsFromCmdLine(); + auto inTensorParam = inflags.GetValueTensor("input"); + + auto in_len = inTensorParam.lengths; dim = inflags.GetValueInt("normalized_dim"); + MIOPEN_THROW_IF(dim < 0 || static_cast(dim) >= in_len.size(), + "normalized_dim out of range"); + std::vector inner_len; if(dim == in_len.size()) inner_len = {1}; @@ -206,12 +213,23 @@ int LayerNormDriver::GetandSetData() else outer_len = {in_len.begin(), in_len.end() - (in_len.size() - dim)}; - SetTensorNd(inputDesc, in_len, data_type); - SetTensorNd(weightDesc, inner_len, data_type); - SetTensorNd(biasDesc, inner_len, data_type); - SetTensorNd(outputDesc, in_len, data_type); - SetTensorNd(meanDesc, outer_len, data_type); - SetTensorNd(rstdDesc, outer_len, data_type); + if(SetTensorNd(inputDesc, in_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input") + "."); + + if(SetTensorNd(weightDesc, inner_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting weight tensor."); + + if(SetTensorNd(biasDesc, inner_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting bias tensor."); + + if(SetTensorNd(outputDesc, in_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting doutput tensor."); + + if(SetTensorNd(meanDesc, outer_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting mean tensor."); + + if(SetTensorNd(rstdDesc, outer_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting rstd tensor."); eps = static_cast(inflags.GetValueDouble("eps")); mode = miopenNormMode_t(inflags.GetValueInt("mode")); @@ -223,11 +241,7 @@ template int LayerNormDriver::AddCmdLineArgs() { 
inflags.AddInputFlag("forw", 'F', "1", "Run only Forward LayerNorm (Default=1)", "int"); - inflags.AddInputFlag("batchsize", 'n', "100", "Mini-batch size (Default=100)", "int"); - inflags.AddInputFlag("in_channels", 'c', "3", "Number of Input Channels (Default=3)", "int"); - inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int"); - inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); - inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); + inflags.AddTensorFlag("input", 'X', "100x3x32x32", "input tensor descriptor"); inflags.AddInputFlag("eps", 'e', "0.00001", "Alpha (Default=0.00001)", "double"); inflags.AddInputFlag("normalized_dim", 'o', "3", "Nomalized Dim (Default=3)", "int"); @@ -243,55 +257,18 @@ int LayerNormDriver::AddCmdLineArgs() return miopenStatusSuccess; } -template -std::vector LayerNormDriver::GetInputTensorLengthsFromCmdLine() -{ - int in_n = inflags.GetValueInt("batchsize"); - int in_c = inflags.GetValueInt("in_channels"); - int in_w = inflags.GetValueInt("in_w"); - int in_h = inflags.GetValueInt("in_h"); - int in_d = inflags.GetValueInt("in_d"); - - if((in_n != 0) && (in_c != 0) && (in_d != 0) && (in_h != 0) && (in_w != 0)) - { - dim_size = 5; - return std::vector({in_n, in_c, in_d, in_h, in_w}); - } - else if((in_n != 0) && (in_c != 0) && (in_h != 0) && (in_w != 0)) - { - dim_size = 4; - return std::vector({in_n, in_c, in_h, in_w}); - } - else if((in_n != 0) && (in_c != 0) && (in_w != 0)) - { - dim_size = 3; - return std::vector({in_n, in_c, in_w}); - } - else if((in_n != 0) && (in_w != 0)) - { - dim_size = 2; - return std::vector({in_n, in_w}); - } - else if(in_n != 0) - { - return std::vector({in_n}); - } - else - { - std::cout << "Error Input Tensor Lengths\n" << std::endl; - return std::vector({0}); - } -} - template int LayerNormDriver::AllocateBuffersAndCopy() { - size_t in_sz = GetTensorSize(inputDesc); - size_t weight_sz = GetTensorSize(weightDesc); - size_t bias_sz = 
GetTensorSize(biasDesc); - size_t out_sz = GetTensorSize(outputDesc); - size_t mean_sz = GetTensorSize(meanDesc); - size_t rstd_sz = GetTensorSize(rstdDesc); + const Tgpu Tgpu0val = static_cast(0.0); + const Tgpu Tgpu1val = static_cast(1.0); + const Tref Tref0ref = static_cast(0.0); + size_t in_sz = GetTensorSize(inputDesc); + size_t weight_sz = GetTensorSize(weightDesc); + size_t bias_sz = GetTensorSize(biasDesc); + size_t out_sz = GetTensorSize(outputDesc); + size_t mean_sz = GetTensorSize(meanDesc); + size_t rstd_sz = GetTensorSize(rstdDesc); uint32_t ctx = 0; @@ -302,19 +279,19 @@ int LayerNormDriver::AllocateBuffersAndCopy() mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tref))); rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tref))); - in = std::vector(in_sz, static_cast(0)); - weight = std::vector(weight_sz, static_cast(0)); - bias = std::vector(bias_sz, static_cast(0)); - out = std::vector(out_sz, static_cast(0)); - mean = std::vector(mean_sz, static_cast(0)); - rstd = std::vector(rstd_sz, static_cast(0)); - outhost = std::vector(out_sz, static_cast(0)); - meanhost = std::vector(mean_sz, static_cast(0)); - rstdhost = std::vector(rstd_sz, static_cast(0)); + in = std::vector(in_sz, Tgpu0val); + weight = std::vector(weight_sz, Tgpu0val); + bias = std::vector(bias_sz, Tgpu0val); + out = std::vector(out_sz, Tgpu0val); + mean = std::vector(mean_sz, Tref0ref); + rstd = std::vector(rstd_sz, Tref0ref); + outhost = std::vector(out_sz, Tref0ref); + meanhost = std::vector(mean_sz, Tref0ref); + rstdhost = std::vector(rstd_sz, Tref0ref); for(int i = 0; i < in_sz; i++) { - in[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + in[i] = prng::gen_A_to_B(Tgpu0val, Tgpu1val); } if(in_dev->ToGPU(GetStream(), in.data()) != 0) @@ -325,7 +302,7 @@ int LayerNormDriver::AllocateBuffersAndCopy() if(mode == MIOPEN_ELEMENTWISE_AFFINE) weight[i] = static_cast(1); else - weight[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + weight[i] = 
prng::gen_A_to_B(Tgpu0val, Tgpu1val); } if(weight_dev->ToGPU(GetStream(), weight.data()) != 0) @@ -334,9 +311,9 @@ int LayerNormDriver::AllocateBuffersAndCopy() for(int i = 0; i < bias_sz; i++) { if(mode == MIOPEN_ELEMENTWISE_AFFINE) - bias[i] = static_cast(0); + bias[i] = Tgpu0val; else - bias[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + bias[i] = prng::gen_A_to_B(Tgpu0val, Tgpu1val); } if(bias_dev->ToGPU(GetStream(), bias.data()) != 0) std::cerr << "Error copying (bias) to GPU, size: " << bias_dev->GetSize() << std::endl; @@ -479,7 +456,7 @@ int LayerNormDriver::VerifyForward() } else { - std::cout << "Forward LayerNorm mean Verifies OK on CPU reference (" << error << " < " + std::cout << "Forward LayerNorm mean Verifies OK on CPU reference (" << meanerror << " < " << tolerance << ')' << std::endl; } diff --git a/driver/reduceextreme_driver.hpp b/driver/reduceextreme_driver.hpp new file mode 100644 index 0000000000..7f5fbbc301 --- /dev/null +++ b/driver/reduceextreme_driver.hpp @@ -0,0 +1,461 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_REDUCEEXTREME_DRIVER_HPP +#define GUARD_MIOPEN_REDUCEEXTREME_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" +#include "random.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> +#include "../src/kernels/MIOpenReduceExtreme.hpp" + +template +bool compare_equal(T r1, T r2) +{ + return r1 == r2; +} + +template +int32_t mloReduceExtremeForwardRunHost(miopenTensorDescriptor_t xDesc, + miopenTensorDescriptor_t yDesc, + miopenTensorDescriptor_t indiceDesc, + Tgpu* x, + Tcheck* yhost, + int32_t* indicehost, + int32_t dim) +{ + auto x_dims = miopen::deref(xDesc).GetLengths(); + std::vector indice_dims; + if(yhost) + indice_dims = miopen::deref(yDesc).GetLengths(); + else + indice_dims = miopen::deref(indiceDesc).GetLengths(); + + int32_t reduce_size = static_cast(x_dims[dim]); + auto indice_numel = + std::accumulate(indice_dims.begin(), indice_dims.end(), 1LL, std::multiplies()); + + auto inner_size = + std::accumulate(x_dims.begin() + dim + 1, x_dims.end(), 1ULL, std::multiplies()); + + int32_t ret = miopenStatusSuccess; + + for(size_t o = 0; o < indice_numel; ++o) + { + size_t x_idx = (o / inner_size) * inner_size * reduce_size + o % inner_size; + + int32_t extreme_idx = 0; + Tcheck extreme = 
static_cast(x[x_idx]); + + for(int32_t i = 1; i < reduce_size; ++i) + { + x_idx += inner_size; + Tcheck val = static_cast(x[x_idx]); + reduce_func{}.calculate(extreme, val, extreme_idx, i); + } + indicehost[o] = extreme_idx; + if(yhost) + yhost[o] = extreme; + } + return ret; +} + +template +class ReduceExtremeDriver : public Driver +{ +public: + ReduceExtremeDriver() : Driver() + { + miopenCreateTensorDescriptor(&xDesc); + miopenCreateTensorDescriptor(&yDesc); + miopenCreateTensorDescriptor(&indiceDesc); + + data_type = miopen_type{}; + indice_data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~ReduceExtremeDriver() override + { + miopenDestroyTensorDescriptor(xDesc); + miopenDestroyTensorDescriptor(yDesc); + miopenDestroyTensorDescriptor(indiceDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t xDesc; + miopenTensorDescriptor_t yDesc; + miopenTensorDescriptor_t indiceDesc; + + std::unique_ptr x_dev; + std::unique_ptr indice_dev; + std::unique_ptr y_dev; + + std::vector x; + std::vector y; + std::vector yhost; + std::vector indice; + std::vector indicehost; + + int dim; + miopenReduceExtremeOp_t reduceExtremeOp; + + miopenDataType_t indice_data_type; +}; + +template +int ReduceExtremeDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + + if((static_cast(inflags.GetValueInt("ReduceExtremeOp")) < + ReduceExtremeOp_t::First_) || + (static_cast(inflags.GetValueInt("ReduceExtremeOp")) > + ReduceExtremeOp_t::Last_)) + { + 
std::cerr << "Error ReduceExtremeOp(1-4)" << std::endl; + return miopenStatusBadParm; + } + + auto inTensorParam = inflags.GetValueTensor("input"); + + if((inflags.GetValueInt("DimToReduce") < 0) || + (inflags.GetValueInt("DimToReduce") > inTensorParam.lengths.size() - 1)) + { + std::cerr << "Error DimToReduce(0-" << inTensorParam.lengths.size() - 1 << ")" << std::endl; + return miopenStatusBadParm; + } + + return miopenStatusSuccess; +} + +template +int ReduceExtremeDriver::GetandSetData() +{ + auto inTensorParam = inflags.GetValueTensor("input"); + auto in_len = inTensorParam.lengths; + + dim = inflags.GetValueInt("DimToReduce"); + reduceExtremeOp = static_cast(inflags.GetValueInt("ReduceExtremeOp")); + + std::vector out_len; + + for(int i = 0; i < in_len.size(); ++i) + { + if(i != dim) + { + out_len.push_back(in_len[i]); + } + } + + if(out_len.empty()) + out_len.push_back(1); + + if(SetTensorNd(xDesc, in_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing x tensor: " + inflags.GetValueStr("input") + "."); + + if(SetTensorNd(yDesc, out_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting y tensor."); + + if(SetTensorNd(indiceDesc, out_len, indice_data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting indice tensor."); + + return 0; +} + +template +int ReduceExtremeDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward ReduceExtreme (Default=1)", "int"); + inflags.AddTensorFlag("input", 'X', "21x500x375", "input tensor descriptor"); + inflags.AddInputFlag( + "DimToReduce", 'R', "0", "The indice of the dimensions to be reduced(Default=1)", "int"); + inflags.AddInputFlag("ReduceExtremeOp", + 'O', + "1", + "Reduce Extreme Operation Type (check the enum miopenReduceExtremeOp_t in " + "miopen.h) (Default=1 to Find the the minimum index)", + "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify Each 
Layer (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int ReduceExtremeDriver::AllocateBuffersAndCopy() +{ + size_t in_sz = GetTensorSize(xDesc); + size_t out_sz = GetTensorSize(yDesc); + + uint32_t ctx = 0; + + x_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + indice_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(int32_t))); + + x = std::vector(in_sz, static_cast(0)); + indice = std::vector(out_sz, static_cast(0)); + indicehost = std::vector(out_sz, static_cast(0)); + + for(int32_t i = 0; i < in_sz; ++i) + { + x[i] = prng::gen_A_to_B(static_cast(-1.0), static_cast(1.0)); + } + + if(x_dev->ToGPU(GetStream(), x.data()) != 0) + { + std::cerr << "Error copying (x) to GPU, size: " << x_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + if(indice_dev->ToGPU(GetStream(), indice.data()) != 0) + { + std::cerr << "Error copying (indice) to GPU, size: " << indice_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) || + (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX)) + { + y_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + y = std::vector(out_sz, static_cast(0)); + yhost = std::vector(out_sz, static_cast(0)); + + if(y_dev->ToGPU(GetStream(), y.data()) != 0) + { + std::cerr << "Error copying (y) to GPU, size: " << y_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + } + + return miopenStatusSuccess; +} + +template +int ReduceExtremeDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int32_t i = 0; i < inflags.GetValueInt("iter"); ++i) + { + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) || + (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX)) + { + 
miopenReduceExtremeForward(GetHandle(), + xDesc, + x_dev->GetMem(), + dim, + reduceExtremeOp, + yDesc, + y_dev->GetMem(), + indiceDesc, + indice_dev->GetMem()); + } + else + { + miopenReduceExtremeForward(GetHandle(), + xDesc, + x_dev->GetMem(), + dim, + reduceExtremeOp, + nullptr, + nullptr, + indiceDesc, + indice_dev->GetMem()); + } + + float time = 0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int32_t iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Forward ReduceExtreme Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Forward ReduceExtreme Elapsed: " << kernel_average_time + << " ms" << std::endl; + } + + if(indice_dev->FromGPU(GetStream(), indice.data()) != 0) + { + std::cerr << "Error copying (indice_dev) from GPU, size: " << indice_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) || + (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX)) + { + if(y_dev->FromGPU(GetStream(), y.data()) != 0) + { + std::cerr << "Error copying (y_dev) from GPU, size: " << y_dev->GetSize() << std::endl; + return miopenStatusInternalError; + } + } + + return miopenStatusSuccess; +} + +template +int ReduceExtremeDriver::RunForwardCPU() +{ + if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_ARGMIN) + { + return mloReduceExtremeForwardRunHost( + xDesc, nullptr, indiceDesc, x.data(), nullptr, indicehost.data(), dim); + } + else if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_ARGMAX) + { + return mloReduceExtremeForwardRunHost( + xDesc, nullptr, indiceDesc, x.data(), nullptr, indicehost.data(), dim); + } + else if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) + { + return mloReduceExtremeForwardRunHost( 
+ xDesc, yDesc, indiceDesc, x.data(), yhost.data(), indicehost.data(), dim); + } + else if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX) + { + return mloReduceExtremeForwardRunHost( + xDesc, yDesc, indiceDesc, x.data(), yhost.data(), indicehost.data(), dim); + } + + return miopenStatusInternalError; +} + +template +int ReduceExtremeDriver::RunBackwardGPU() +{ + return miopenStatusSuccess; +} + +template +Tref ReduceExtremeDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int ReduceExtremeDriver::VerifyForward() +{ + RunForwardCPU(); + + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) || + (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX)) + { + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(yhost, y); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward ReduceExtreme FAILED: " << error << " > " << tolerance + << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward ReduceExtreme Verifies on CPU (" << error << " < " << tolerance + << ')' << std::endl; + } + } + auto error_idx = miopen::mismatch_idx(indicehost, indice, compare_equal); + + if(error_idx < miopen::range_distance(indicehost)) + { + std::cout << "Forward ReduceExtreme FAILED: Indice does not equal at " << error_idx + << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward ReduceExtreme Incide Verifies on CPU and GPU" << std::endl; + } + + return miopenStatusSuccess; +} + +template +int ReduceExtremeDriver::VerifyBackward() +{ + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_REDUCEEXTREME_DRIVER_HPP diff --git a/driver/sum_driver.hpp b/driver/sum_driver.hpp index 03589e29e6..830b89c1dd 100644 --- 
a/driver/sum_driver.hpp +++ b/driver/sum_driver.hpp @@ -47,14 +47,14 @@ template int32_t mloSumForwardRunHost(miopenTensorDescriptor_t inputDesc, - miopenTensorDescriptor_t outputDesc, + miopenTensorDescriptor_t yDesc, Tgpu* input, Tcheck* outputhost, int32_t dim, miopenSumNanPropagation_t nanPropagation) { auto input_dims = miopen::deref(inputDesc).GetLengths(); - auto output_dims = miopen::deref(outputDesc).GetLengths(); + auto output_dims = miopen::deref(yDesc).GetLengths(); auto reduce_size = input_dims[dim]; auto output_numel = @@ -96,7 +96,7 @@ class SumDriver : public Driver SumDriver() : Driver() { miopenCreateTensorDescriptor(&inputDesc); - miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&yDesc); data_type = miopen_type{}; } @@ -121,7 +121,7 @@ class SumDriver : public Driver ~SumDriver() override { miopenDestroyTensorDescriptor(inputDesc); - miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(yDesc); } private: @@ -130,7 +130,7 @@ class SumDriver : public Driver int forw; miopenTensorDescriptor_t inputDesc; - miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t yDesc; std::unique_ptr in_dev; std::unique_ptr out_dev; @@ -179,7 +179,7 @@ int SumDriver::GetandSetData() if(out_len.empty()) out_len.push_back(1); - SetTensorNd(outputDesc, out_len, data_type); + SetTensorNd(yDesc, out_len, data_type); nanPropagation = static_cast(inflags.GetValueInt("NanPropagation")); @@ -253,9 +253,9 @@ template int SumDriver::AllocateBuffersAndCopy() { size_t in_sz = GetTensorSize(inputDesc); - size_t out_sz = GetTensorSize(outputDesc); + size_t out_sz = GetTensorSize(yDesc); - miopenGetSumWorkspaceSize(GetHandle(), inputDesc, dim, outputDesc, &ws_sizeInBytes); + miopenGetSumWorkspaceSize(GetHandle(), inputDesc, dim, yDesc, &ws_sizeInBytes); if(ws_sizeInBytes == static_cast(-1)) return miopenStatusAllocFailed; @@ -301,7 +301,7 @@ int SumDriver::RunForwardGPU() inputDesc, in_dev->GetMem(), dim, - outputDesc, + yDesc, 
out_dev->GetMem()); float time = 0.0; @@ -334,7 +334,7 @@ template int SumDriver::RunForwardCPU() { mloSumForwardRunHost( - inputDesc, outputDesc, in.data(), outhost.data(), dim, nanPropagation); + inputDesc, yDesc, in.data(), outhost.data(), dim, nanPropagation); return miopenStatusSuccess; } diff --git a/driver/t5layernorm_driver.hpp b/driver/t5layernorm_driver.hpp new file mode 100644 index 0000000000..94a4f6b934 --- /dev/null +++ b/driver/t5layernorm_driver.hpp @@ -0,0 +1,633 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_T5LAYERNORM_DRIVER_HPP +#define GUARD_MIOPEN_T5LAYERNORM_DRIVER_HPP + +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> +#include "InputFlags.hpp" +#include "driver.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" +#include +#include +#include +#include +#include +#include +#include + +template +int32_t mloT5LayerNormForwardRunHost(miopenTensorDescriptor_t xDesc, + Tgpu* x, + Tgpu* weight, + Tcheck* yhost, + Tcheck* rstdhost, + float eps, + miopenNormMode_t mode) +{ + auto dims = miopen::deref(xDesc).GetLengths(); + size_t outer_size = 1; + size_t inner_size = dims[dims.size() - 1]; + + for(size_t i = 0ULL; i < dims.size() - 1; ++i) + { + outer_size *= dims[i]; + } + + int32_t ret = 0; + + for(int32_t o = 0; o < outer_size; o++) + { + Tcheck pvar = static_cast(0); + for(int32_t i = 0; i < inner_size; i++) + { + Tcheck tmp = static_cast(x[o * inner_size + i]); + pvar += tmp * tmp; + } + + pvar = pvar / inner_size; + Tcheck prstd = static_cast(1.0) / sqrt(pvar + eps); + + rstdhost[o] = prstd; + + for(int32_t i = 0; i < inner_size; i++) + { + Tcheck pweight = (mode == MIOPEN_ELEMENTWISE_AFFINE_T5) + ? 
static_cast(1) + : static_cast(weight[i]); + yhost[o * inner_size + i] = + (static_cast(x[o * inner_size + i])) * prstd * pweight; + } + } + return ret; +} + +template +int32_t mloT5LayerNormBackwardRunHost(miopenTensorDescriptor_t dyDesc, + Tgpu* dy, + Tgpu* x, + Tgpu* weight, + Tcheck* rstdhost, + Tcheck* dxhost, + miopenNormMode_t mode) +{ + auto dims = miopen::deref(dyDesc).GetLengths(); + size_t outer_size = 1; + size_t inner_size = dims[dims.size() - 1]; + + for(size_t i = 0ULL; i < dims.size() - 1; ++i) + { + outer_size *= dims[i]; + } + + int32_t ret = 0; + + for(int32_t o = 0; o < outer_size; o++) + { + Tcheck sum = static_cast(0); + for(int32_t i = 0; i < inner_size; i++) + { + Tcheck pweight = (mode == MIOPEN_ELEMENTWISE_AFFINE_T5) + ? static_cast(1) + : static_cast(weight[i]); + Tcheck pdy = dy ? static_cast(dy[o * inner_size + i]) : static_cast(0); + Tcheck px = static_cast(x[o * inner_size + i]); + sum += pdy * px * pweight; + } + + Tcheck ds = sum; + Tcheck s = static_cast(1) / inner_size; + Tcheck prstd = rstdhost[o]; + Tcheck a = ds * prstd * prstd * prstd * s; + + for(int32_t i = 0; i < inner_size; i++) + { + Tcheck pweight = (mode == MIOPEN_ELEMENTWISE_AFFINE_T5) + ? static_cast(1) + : static_cast(weight[i]); + Tcheck pdy = dy ? 
static_cast(dy[o * inner_size + i]) : static_cast(0); + + Tcheck val = prstd * pdy * pweight - a * static_cast(x[o * inner_size + i]); + dxhost[o * inner_size + i] = static_cast(val); + } + } + return ret; +} + +template +int32_t mloT5LayerNormBackckwardweightRunHost( + miopenTensorDescriptor_t dyDesc, Tgpu* dy, Tgpu* x, Tcheck* rstdhost, Tcheck* dwhost) +{ + auto dims = miopen::deref(dyDesc).GetLengths(); + size_t outer_size = 1; + size_t inner_size = dims[dims.size() - 1]; + + for(size_t i = 0ULL; i < dims.size() - 1; ++i) + { + outer_size *= dims[i]; + } + + int32_t ret = 0; + + for(int32_t o = 0; o < inner_size; o++) + { + Tcheck sum = static_cast(0); + for(uint64_t i = 0; i < outer_size; ++i) + { + Tcheck prstd = static_cast(rstdhost[i]); + Tcheck pdy = dy ? static_cast(dy[i * inner_size + o]) : 0; + Tcheck px = static_cast(x[i * inner_size + o]); + + sum += pdy * px * prstd; + } + + dwhost[o] = sum; + } + return ret; +} + +template +class T5LayerNormDriver : public Driver +{ +public: + T5LayerNormDriver() : Driver() + { + miopenCreateTensorDescriptor(&xDesc); + miopenCreateTensorDescriptor(&weightDesc); + miopenCreateTensorDescriptor(&yDesc); + miopenCreateTensorDescriptor(&rstdDesc); + miopenCreateTensorDescriptor(&dyDesc); + miopenCreateTensorDescriptor(&dxDesc); + miopenCreateTensorDescriptor(&dwDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + std::vector GetInputTensorLengthsFromCmdLine(); + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~T5LayerNormDriver() override + { + + miopenDestroyTensorDescriptor(xDesc); + miopenDestroyTensorDescriptor(weightDesc); + 
miopenDestroyTensorDescriptor(yDesc); + miopenDestroyTensorDescriptor(rstdDesc); + miopenDestroyTensorDescriptor(dyDesc); + miopenDestroyTensorDescriptor(dxDesc); + miopenDestroyTensorDescriptor(dwDesc); + } + +private: + InputFlags inflags; + + int forw; + int dim_size; + + miopenTensorDescriptor_t xDesc; + miopenTensorDescriptor_t weightDesc; + miopenTensorDescriptor_t yDesc; + miopenTensorDescriptor_t rstdDesc; + miopenTensorDescriptor_t dyDesc; + miopenTensorDescriptor_t dxDesc; + miopenTensorDescriptor_t dwDesc; + + std::unique_ptr x_dev; + std::unique_ptr weight_dev; + std::unique_ptr y_dev; + std::unique_ptr rstd_dev; + std::unique_ptr dy_dev; + std::unique_ptr dx_dev; + std::unique_ptr dw_dev; + std::unique_ptr workspace_dev; + + std::vector x; + std::vector weight; + std::vector y; + std::vector rstd; + std::vector yhost; + std::vector rstdhost; + std::vector dy; + std::vector dx; + std::vector dw; + std::vector dxhost; + std::vector dwhost; + + size_t ws_sizeInBytes; + + float eps; + miopenNormMode_t mode; +}; + +template +int T5LayerNormDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +int T5LayerNormDriver::GetandSetData() +{ + auto inTensorParam = inflags.GetValueTensor("input"); + + auto in_len = inTensorParam.lengths; + + std::vector inner_len; + + inner_len = {in_len[in_len.size() - 1]}; + + MIOPEN_THROW_IF(inner_len[0] == 0, "Final dimension must be nonzero"); + + std::vector outer_len; + + outer_len = {in_len.begin(), in_len.end() - 1}; + + if(SetTensorNd(xDesc, in_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input") + "."); + + if(SetTensorNd(weightDesc, inner_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting weight tensor."); + + if(SetTensorNd(yDesc, in_len, data_type) != miopenStatusSuccess) + 
MIOPEN_THROW("Error setting doutput tensor."); + + if(SetTensorNd(rstdDesc, outer_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting rstd tensor."); + + if(SetTensorNd(dyDesc, in_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting dy tensor."); + + if(SetTensorNd(dxDesc, in_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting dx tensor."); + + if(SetTensorNd(dwDesc, inner_len, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error setting dw tensor."); + + eps = static_cast(inflags.GetValueDouble("eps")); + mode = miopenNormMode_t(inflags.GetValueInt("mode")); + + return 0; +} + +template +int T5LayerNormDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward T5LayerNorm (Default=1)", "int"); + inflags.AddTensorFlag("input", 'X', "100x3x32x32", "input tensor descriptor"); + + inflags.AddInputFlag("eps", 'e', "0.00001", "Alpha (Default=0.00001)", "double"); + inflags.AddInputFlag( + "mode", 'm', "0", "elemwise affine mode (0), weight mode (1) (Default=0)", "int"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int T5LayerNormDriver::AllocateBuffersAndCopy() +{ + const Tgpu Tgpu0val = static_cast(0.0); + const Tgpu Tgpu1val = static_cast(1.0); + const Tgpu Tgpuminus1val = static_cast(-1.0); + const Tref Tref0ref = static_cast(0.0); + size_t x_sz = GetTensorSize(xDesc); + size_t weight_sz = GetTensorSize(weightDesc); + size_t y_sz = GetTensorSize(yDesc); + size_t rstd_sz = GetTensorSize(rstdDesc); + size_t dy_sz = GetTensorSize(dyDesc); + size_t dx_sz = GetTensorSize(dxDesc); + size_t dw_sz = GetTensorSize(dwDesc); + + 
miopenGetT5LayerNormBackwardWorkspaceSize( + GetHandle(), mode, dyDesc, xDesc, weightDesc, rstdDesc, dxDesc, dwDesc, &ws_sizeInBytes); + if(ws_sizeInBytes == static_cast(-1)) + return miopenStatusAllocFailed; + + uint32_t ctx = 0; + + x_dev = std::unique_ptr(new GPUMem(ctx, x_sz, sizeof(Tgpu))); + weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); + y_dev = std::unique_ptr(new GPUMem(ctx, y_sz, sizeof(Tgpu))); + rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tgpu))); + dy_dev = std::unique_ptr(new GPUMem(ctx, dy_sz, sizeof(Tgpu))); + dx_dev = std::unique_ptr(new GPUMem(ctx, dx_sz, sizeof(Tgpu))); + dw_dev = std::unique_ptr(new GPUMem(ctx, dw_sz, sizeof(Tgpu))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + + x = std::vector(x_sz, Tgpu0val); + weight = std::vector(weight_sz, Tgpu0val); + y = std::vector(y_sz, Tgpu0val); + rstd = std::vector(rstd_sz, Tgpu0val); + dy = std::vector(dy_sz, Tgpu0val); + dx = std::vector(dx_sz, Tgpu0val); + dw = std::vector(dw_sz, Tgpu0val); + yhost = std::vector(y_sz, Tref0ref); + rstdhost = std::vector(rstd_sz, Tref0ref); + dxhost = std::vector(dx_sz, Tref0ref); + dwhost = std::vector(dw_sz, Tref0ref); + + for(int i = 0; i < x_sz; i++) + { + x[i] = prng::gen_A_to_B(Tgpuminus1val, Tgpu1val); + dy[i] = prng::gen_A_to_B(Tgpuminus1val, Tgpu1val); + } + + if(x_dev->ToGPU(GetStream(), x.data()) != 0) + std::cerr << "Error copying (x) to GPU, size: " << x_dev->GetSize() << std::endl; + if(dy_dev->ToGPU(GetStream(), dy.data()) != 0) + std::cerr << "Error copying (dy) to GPU, size: " << x_dev->GetSize() << std::endl; + + for(int i = 0; i < weight_sz; i++) + { + if(mode == MIOPEN_ELEMENTWISE_AFFINE) + weight[i] = Tgpu1val; + else + weight[i] = prng::gen_A_to_B(Tgpuminus1val, Tgpu1val); + } + + if(weight_dev->ToGPU(GetStream(), weight.data()) != 0) + std::cerr << "Error copying (weight) to GPU, size: " << weight_dev->GetSize() << std::endl; + + if(y_dev->ToGPU(GetStream(), 
y.data()) != 0) + std::cerr << "Error copying (y) to GPU, size: " << y_dev->GetSize() << std::endl; + + if(rstd_dev->ToGPU(GetStream(), rstd.data()) != 0) + std::cerr << "Error copying (rstd) to GPU, size: " << rstd_dev->GetSize() << std::endl; + + if(dx_dev->ToGPU(GetStream(), dx.data()) != 0) + std::cerr << "Error copying (dx) to GPU, size: " << dx_dev->GetSize() << std::endl; + + if(dw_dev->ToGPU(GetStream(), dw.data()) != 0) + std::cerr << "Error copying (dw) to GPU, size: " << dw_dev->GetSize() << std::endl; + + return miopenStatusSuccess; +} + +template +int T5LayerNormDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenT5LayerNormForward(GetHandle(), + mode, + xDesc, + x_dev->GetMem(), + weightDesc, + weight_dev->GetMem(), + eps, + yDesc, + y_dev->GetMem(), + rstdDesc, + rstd_dev->GetMem()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Forward T5LayerNorm Elapsed: " << t.gettime_ms() / iter + << " ms\n"; + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Forward T5LayerNorm Elapsed: " << kernel_average_time + << " ms\n"; + } + + if(y_dev->FromGPU(GetStream(), y.data()) != 0) + std::cerr << "Error copying (y_dev) from GPU, size: " << y_dev->GetSize() << std::endl; + + if(rstd_dev->FromGPU(GetStream(), rstd.data()) != 0) + std::cerr << "Error copying (rstd_dev) from GPU, size: " << rstd_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int T5LayerNormDriver::RunForwardCPU() +{ + mloT5LayerNormForwardRunHost( + xDesc, x.data(), weight.data(), yhost.data(), rstdhost.data(), eps, mode); + + return miopenStatusSuccess; +} + +template +int T5LayerNormDriver::RunBackwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenT5LayerNormBackward(GetHandle(), + mode, + workspace_dev->GetMem(), + ws_sizeInBytes, + dyDesc, + dy_dev->GetMem(), + xDesc, + x_dev->GetMem(), + weightDesc, + weight_dev->GetMem(), + rstdDesc, + rstd_dev->GetMem(), + dxDesc, + dx_dev->GetMem(), + dwDesc, + dw_dev->GetMem()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Backward T5LayerNorm Elapsed: " << t.gettime_ms() / iter + << " ms\n"; + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Backward T5LayerNorm Elapsed: " << kernel_average_time + << " ms\n"; + } + + if(dx_dev->FromGPU(GetStream(), dx.data()) != 0) + std::cerr << "Error copying (dx_dev) from GPU, size: " << dx_dev->GetSize() << std::endl; + + if(dw_dev->FromGPU(GetStream(), dw.data()) != 0) + std::cerr << "Error copying (dw_dev) from GPU, size: " << dw_dev->GetSize() << std::endl; + + return miopenStatusSuccess; +} + +template +int T5LayerNormDriver::RunBackwardCPU() +{ + mloT5LayerNormBackwardRunHost( + dyDesc, dy.data(), x.data(), weight.data(), rstdhost.data(), dxhost.data(), mode); + + mloT5LayerNormBackckwardweightRunHost( + dyDesc, dy.data(), x.data(), rstdhost.data(), dwhost.data()); + + return miopenStatusSuccess; +} + +template +Tref T5LayerNormDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int T5LayerNormDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + + auto error = miopen::rms_range(yhost, y); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward T5LayerNorm FAILED: " << error << " > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward T5LayerNorm Verifies OK on CPU reference (" << error << " < " + << tolerance << ')' << std::endl; + } + + auto rstderror = miopen::rms_range(rstdhost, rstd); + if(!std::isfinite(rstderror) || rstderror > tolerance) + { + std::cout << "Forward T5LayerNorm rstd FAILED: " << rstderror << " > " << tolerance + << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward T5LayerNorm rstd Verifies OK on CPU reference (" << rstderror << " < " + << tolerance << ')' << std::endl; + } + + return miopenStatusSuccess; +} + +template +int T5LayerNormDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + + auto error = miopen::rms_range(dxhost, dx); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Backward T5LayerNorm FAILED: " << error << " > " << tolerance << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward T5LayerNorm Verifies OK on CPU reference (" << error << " < " + << tolerance << ')' << std::endl; + } + + auto dwerror = miopen::rms_range(dwhost, dw); + if(!std::isfinite(dwerror) || dwerror > tolerance) + { + std::cout << "Backward T5LayerNorm dw FAILED: " << dwerror << " > " << tolerance + << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward T5LayerNorm dw Verifies OK on CPU reference (" << dwerror << " < " + << tolerance << ')' << std::endl; + } + + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_T5LAYERNORM_DRIVER_HPP diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 89a3060c9e..eedcab2e12 
100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -65,9 +65,10 @@ * @defgroup TensorReduce * @defgroup find2 * @defgroup sum - * @defgroup argmax + * @defgroup ReduceExtreme * @defgroup groupnorm * @defgroup cat + * @defgroup SGD * */ @@ -486,6 +487,14 @@ typedef enum MIOPEN_ELEMENTWISE_AFFINE = 0, /*!< initialized to ones for weights and zeros for biases */ MIOPEN_WEIGHT_BIAS = 1, /*!< learnable weights and biases of the module of shape normalized_shape */ + MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD = + 2, /*!< initialized to ones for weights and zeros for biases in addlayernorm */ + MIOPEN_WEIGHT_BIAS_FUSED_ADD = 3, /*!< learnable weights and biases of the module of shape + normalized_shape in addlayernorm */ + MIOPEN_ELEMENTWISE_AFFINE_T5 = + 4, /*!< initialized to ones for weights and zeros for biases in t5layernorm */ + MIOPEN_WEIGHT_BIAS_T5 = 5, /*!< learnable weights and biases of the module of shape + normalized_shape in t5layernorm */ } miopenNormMode_t; #endif /*! @ingroup batchnorm @@ -5780,7 +5789,7 @@ typedef enum * * @param handle MIOpen Handle (input) * @param xDesc Tensor descriptor for data input tensor x (input) - * @param dim Dimensions to sum. (input) + * @param dim Dimension to sum. (input) * @param yDesc Tensor descriptor for output data tensor y (input) * @param sizeInBytes Pointer to data to return the minimum workspace size * @return miopenStatus_t @@ -5799,7 +5808,7 @@ MIOPEN_EXPORT miopenStatus_t miopenGetSumWorkspaceSize(miopenHandle_t handle, * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) * @param xDesc Tensor descriptor for data input tensor x (input) * @param x Data tensor x (input) - * @param dim Dimensions to sum. (input) + * @param dim Dimension to sum. 
(input) * @param yDesc Tensor descriptor for output data tensor y (input) * @param y Data tensor y (output) * @return miopenStatus_t @@ -5820,24 +5829,55 @@ MIOPEN_EXPORT miopenStatus_t miopenSumForward(miopenHandle_t handle, #ifdef MIOPEN_BETA_API -/*! @ingroup argmax - * @brief Find the index of the maximum value of a tensor across dimensions. +/*! @ingroup ReduceExtreme + * @enum miopenReduceExtremeOp_t + * Reduction Extreme operation types + */ +typedef enum +{ + MIOPEN_REDUCE_EXTREME_ARGMIN = + 1, /*!< the operation is getting the minimum index of the reduced elements */ + MIOPEN_REDUCE_EXTREME_ARGMAX = + 2, /*!< the operation is getting the maximum index of the reduced elements */ + MIOPEN_REDUCE_EXTREME_MIN = + 3, /*!< the operation is getting the minimum value and index of the reduced elements */ + MIOPEN_REDUCE_EXTREME_MAX = + 4, /*!< the operation is getting the maximum value and index of the reduced elements */ +} miopenReduceExtremeOp_t; + +// ReduceExtreme APIs +/** @addtogroup ReduceExtreme + * + * @{ + */ + +/*! @brief Find the extreme (minimum, maximum) value and index of a tensor across Dimension. * * @param handle MIOpen handle (input) * @param xDesc Tensor descriptor for data input tensor x (input) * @param x Data tensor x (input) - * @param dim Dimensions to reduce argmax. (input) - * @param yDesc Tensor descriptor for output indice data tensor y (input) + * @param dim Dimension to reduce argmax.
(input) + * @param reduceExtremeOp Enumerant specifying the operation used by ReduceExtreme (input) + * @param yDesc Tensor descriptor for reduce data tensor y (input) * @param y Data tensor y (output) + * @param indiceDesc Tensor descriptor for reduce data tensor indice only int32_t + * (input) + * @param indice Data tensor indice (output) * @return miopenStatus_t */ -MIOPEN_EXPORT miopenStatus_t miopenArgmaxForward(miopenHandle_t handle, - const miopenTensorDescriptor_t xDesc, - const void* x, - const int32_t dim, - const miopenTensorDescriptor_t yDesc, - void* y); +MIOPEN_EXPORT miopenStatus_t +miopenReduceExtremeForward(miopenHandle_t handle, + const miopenTensorDescriptor_t xDesc, + const void* x, + const int32_t dim, + const miopenReduceExtremeOp_t reduceExtremeOp, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t indiceDesc, + void* indice); +/** @} */ +// CLOSEOUT REDUCEEXTREME DOXYGEN GROUP #endif #ifdef MIOPEN_BETA_API @@ -5887,6 +5927,155 @@ MIOPEN_EXPORT miopenStatus_t miopenGroupNormForward(miopenHandle_t handle, // CLOSEOUT groupnorm DOXYGEN GROUP #endif +#ifdef MIOPEN_BETA_API +// LayerNorm APIs +/** @addtogroup layernorm + * + * @{ + */ +/*! 
@brief Execute an add and layernorm forward layer + * + * @param handle MIOpen handle (input) + * @param mode LayerNorm mode (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param x2Desc Tensor descriptor for data input tensor x2 (input) + * @param x2 Data tensor x2 (input) + * @param weightDesc Tensor descriptor for data input tensor weight (input) + * @param weight Data tensor weight (input) + * @param biasDesc Tensor descriptor for data input tensor bias (input) + * @param bias Data tensor bias (input) + * @param epsilon Value to stabilize inverse variance calculation (input) + * @param normalized_dim Normalized dimensions in the input array (input) + * @param yDesc Tensor descriptor for output data tensor y (input) + * @param y Data tensor y (output) + * @param meanDesc Tensor descriptor for output data tensor mean (input) + * @param mean Data tensor mean (output) + * @param rstdDesc Tensor descriptor for output data tensor rstd (input) + * @param rstd Data tensor rstd (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAddLayerNormForward(miopenHandle_t handle, + miopenNormMode_t mode, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t x2Desc, + const void* x2, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const miopenTensorDescriptor_t biasDesc, + const void* bias, + const float epsilon, + const int32_t normalized_dim, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t meanDesc, + void* mean, + const miopenTensorDescriptor_t rstdDesc, + void* rstd); + +/** @} */ +// CLOSEOUT LAYERNORM DOXYGEN GROUP +#endif + +#ifdef MIOPEN_BETA_API +// LayerNorm APIs +/** @addtogroup layernorm + * + * @{ + */ +/*!
@brief Execute a T5layernorm forward layer + * + * @param handle MIOpen handle (input) + * @param mode LayerNorm mode (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param weightDesc Tensor descriptor for data input tensor weight (input) + * @param weight Data tensor weight (input) + * @param epsilon Value to stabilize inverse variance calculation (input) + * @param yDesc Tensor descriptor for output data tensor y (input) + * @param y Data tensor y (output) + * @param rstdDesc Tensor descriptor for output data tensor rstd (input) + * @param rstd Data tensor rstd (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenT5LayerNormForward(miopenHandle_t handle, + miopenNormMode_t mode, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const float epsilon, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t rstdDesc, + void* rstd); + +/*!
@brief Helper function to query the minimum workspace size required by the T5layernorm backward + * call + * + * @param handle MIOpen Handle (input) + * @param mode LayerNorm mode (input) + * @param dyDesc Tensor descriptor for data input tensor dy (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param weightDesc Tensor descriptor for data input tensor weight (input) + * @param rstdDesc Tensor descriptor for data input tensor rstd (input) + * @param dxDesc Tensor descriptor for output data tensor dx (input) + * @param dwDesc Tensor descriptor for output data tensor dw (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetT5LayerNormBackwardWorkspaceSize(miopenHandle_t handle, + miopenNormMode_t mode, + const miopenTensorDescriptor_t dyDesc, + const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t weightDesc, + const miopenTensorDescriptor_t rstdDesc, + const miopenTensorDescriptor_t dxDesc, + const miopenTensorDescriptor_t dwDesc, + size_t* sizeInBytes); + +/*!
@brief Execute a T5layernorm backward layer + * + * @param handle MIOpen handle (input) + * @param mode LayerNorm mode (input) + * @param workspace Address of the allocated workspace data (input) + * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) + * @param dyDesc Tensor descriptor for data input tensor dy (input) + * @param dy Data tensor dy (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param weightDesc Tensor descriptor for data input tensor weight (input) + * @param weight Data tensor weight (input) + * @param rstdDesc Tensor descriptor for data input tensor rstd (input) + * @param rstd Data tensor rstd (input) + * @param dxDesc Tensor descriptor for output data tensor dx (input) + * @param dx Data tensor dx (output) + * @param dwDesc Tensor descriptor for output data tensor dw (input) + * @param dw Data tensor dw (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenT5LayerNormBackward(miopenHandle_t handle, + miopenNormMode_t mode, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const miopenTensorDescriptor_t rstdDesc, + const void* rstd, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t dwDesc, + void* dw); +/** @} */ +// CLOSEOUT LAYERNORM DOXYGEN GROUP +#endif + #ifdef MIOPEN_BETA_API // Graph API /** @addtogroup GraphAPI @@ -5977,6 +6166,7 @@ typedef enum MIOPEN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402, MIOPEN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403, MIOPEN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404, + MIOPEN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405, MIOPEN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500, MIOPEN_ATTR_INTERMEDIATE_INFO_SIZE = 501, @@ -6417,6 +6607,19 @@ typedef enum 
MIOPEN_RNG_DISTRIBUTION_NORMAL, } miopenRngDistribution_t; +/*! @brief Operation mode of MIOPEN_BACKEND_ENGINEHEUR_DESCRIPTOR + * + * An enumerated type to indicate the operation mode of a MIOPEN_BACKEND_ENGINEHEUR_DESCRIPTOR + */ +typedef enum +{ + MIOPEN_HEUR_MODE_INSTANT = 0, + MIOPEN_HEUR_MODE_B = 1, + MIOPEN_HEUR_MODE_FALLBACK = 2, + MIOPEN_HEUR_MODE_A = 3, + MIOPEN_HEUR_MODES_COUNT = 4, +} miopenBackendHeurMode_t; + /*! @brief Backend descriptor * * A typedef void pointer to one of many opaque descriptor structures. @@ -6607,6 +6810,325 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d // CLOSEOUT BackendAPI DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +// FusedAdam APIs +/** @addtogroup SGD + * + * @{ + */ +/*! @brief Perform Fused Adam optimization for a single tensor (Adaptive Moment Estimation). + * + * This function implements the Fused Adam optimization algorithm. Adam, short for Adaptive Moment + * Estimation, extends the RMSProp optimizer. It combines the advantages of AdaGrad and RMSProp by + * adaptively adjusting learning rates for each parameter using the first and second moments of + * gradients. Fused Adam optimization efficiently combines multiple operations into a single kernel, + * reducing memory access overhead and improving performance. 
+ * + * @code + * // Execute Adam + * miopenFusedAdam(handle, + * paramDesc, + * param, + * gradDesc, + * grad, + * expAvgDesc, + * expAvg, + * expAvgSqDesc, + * expAvgSq, + * NULL, // Unused maxExpAvgSqDesc because amsgrad is false + * NULL, + * NULL, // Unused stateStep Tensor because use step integer argument + * NULL, + * step, + * lr, + * beta1, + * beta2, + * weight_decay, + * eps, + * false, // amsgrad + * false, // maximize + * false, // adamw + * NULL, // Unused gradScale Tensor because not amp + * NULL, + * NULL, // Unused foundInf Tensor because not amp + * NULL); + * + * // Execute AMP Adam + * miopenFusedAdam(handle, + * paramDesc, + * param, + * gradDesc, + * grad, + * expAvgDesc, + * expAvg, + * expAvgSqDesc, + * expAvgSq, + * NULL, // Unused maxExpAvgSqDesc because amsgrad is false + * NULL, + * stateStepDesc, + * stateStep, + * -1, // Ignore step value because stateStep Tensor is used + * lr, + * beta1, + * beta2, + * weight_decay, + * eps, + * false, // amsgrad + * false, // maximize + * false, // adamw + * gradScaleDesc, + * gradScale, + * foundInfDesc, + * foundInf); + * @endcode + * + * @param handle MIOpen handle (input) + * @param paramDesc Tensor descriptor for the input parameter tensor (input) + * @param param Parameter tensor (input, output) + * @param gradDesc Tensor descriptor for the input gradient tensor (input) + * @param grad Input gradient tensor (input) + * @param expAvgDesc Tensor descriptor for the input exponential moving average tensor + * (input) + * @param expAvg Exponential moving average tensor (input, output) + * @param expAvgSqDesc Tensor descriptor for the input exponential moving average squared + * tensor (input) + * @param expAvgSq Exponential moving average squared tensor (input, output) + * @param maxExpAvgSqDesc Tensor descriptor for the input maximum exponential moving average + * squared tensor. Used when amsgrad is true (input, optional) + * @param maxExpAvgSq Maximum exponential moving average squared tensor. 
Used when + * amsgrad is true (input, output, optional) + * @param stateStepDesc Tensor descriptor for the state step tensor (input, optional) + * @param stateStep State step tensor (input, output, optional) + * @param state_step Input state step. Used when the step tensor is null (input) + * @param lr Learning rate (input) + * @param beta1 Coefficient used for computing the first moment running average of + * gradient (input) + * @param beta2 Coefficient used for computing the second moment running average of + * gradient (input) + * @param weight_decay Weight decay (input) + * @param eps Term added to the denominator to improve numerical stability (input) + * @param amsgrad Flag indicating whether to use the AMSGrad variant of Adam (input) + * @param maximize Flag indicating whether to maximize the objective with respect to the + * parameters (input) + * @param adamw If true, the operation becomes AdamW (input) (not supported) + * @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional) + * @param gradScale Input grad scale tensor (input, optional) + * @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional) + * @param foundInf Tensor indicating the presence of inf or NaN in gradients. 
If true, + * skips operation and step update (input, optional) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenFusedAdam(miopenHandle_t handle, + const miopenTensorDescriptor_t paramDesc, + void* param, + const miopenTensorDescriptor_t gradDesc, + const void* grad, + const miopenTensorDescriptor_t expAvgDesc, + void* expAvg, + const miopenTensorDescriptor_t expAvgSqDesc, + void* expAvgSq, + const miopenTensorDescriptor_t maxExpAvgSqDesc, + void* maxExpAvgSq, + const miopenTensorDescriptor_t stateStepDesc, + void* stateStep, + const unsigned int state_step, + const float lr, + const float beta1, + const float beta2, + const float weight_decay, + const float eps, + const bool amsgrad, + const bool maximize, + const bool adamw, + const miopenTensorDescriptor_t gradScaleDesc, + const void* gradScale, + const miopenTensorDescriptor_t foundInfDesc, + const void* foundInf); + +/*! @brief Execute single tensor Adam optimization and receive the result in a separate output + * tensor. + * + * This function is equivalent to miopenFusedAdam but receives the result in a separate output + * tensor. 
+ * @see miopenFusedAdam + * + * @code + * // Execute Adam + * miopenFusedAdamWithOutput(handle, + * paramInDesc, + * paramIn, + * paramOutDesc, + * paramOut, + * NULL, // Unused paramOutFloat16 tensor because it is not amp + * NULL, + * gradInDesc, + * gradIn, + * expAvgInDesc, + * expAvgIn, + * expAvgOutDesc, + * expAvgOut, + * expAvgSqInDesc, + * expAvgSqIn, + * expAvgSqOutDesc, + * expAvgSqOut, + * NULL, // Unused maxExpAvgSqIn tensor because amsgrad is false + * NULL, + * NULL, // Unused maxExpAvgSqOut tensor because amsgrad is false + * NULL, + * NULL, // Unused stateStepIn tensor because use step integer argument + * NULL, + * NULL, // Unused stateStepOut tensor because use step integer argument + * NULL, + * step, + * lr, + * beta1, + * beta2, + * weight_decay, + * eps, + * false, // amsgrad + * false, // maximize + * false, // adamw + * NULL, // Unused gradScale Tensor because not amp + * NULL, + * NULL, // Unused foundInf Tensor because not amp + * NULL); + * + * // Execute Amp Adam + * miopenFusedAdamWithOutput(handle, + * paramInDesc, + * paramIn, + * paramOutDesc, + * paramOut, + * paramOutFloat16Desc, // paramOutFloat16 tensor is optional in amp + * paramOutFloat16, + * gradInDesc, + * gradIn, + * expAvgInDesc, + * expAvgIn, + * expAvgOutDesc, + * expAvgOut, + * expAvgSqInDesc, + * expAvgSqIn, + * expAvgSqOutDesc, + * expAvgSqOut, + * NULL, // Unused maxExpAvgSqIn tensor because amsgrad is false + * NULL, + * NULL, // Unused maxExpAvgSqOut tensor because amsgrad is false + * NULL, + * stateStepInDesc, + * stateStepIn, + * stateStepOutDesc, + * stateStepOut, + * -1, // Ignore step value because stateStep Tensor is used + * lr, beta1, beta2, + * weight_decay, eps, + * false, // amsgrad + * false, // maximize + * false, // adamw + * gradScaleDesc, + * gradScale, + * foundInfDesc, + * foundInf); + * @endcode + * + * @param handle MIOpen handle (input) + * @param paramInDesc Tensor descriptor for the input parameter tensor (input) + * @param 
paramIn Input parameter tensor (input) + * @param paramOutDesc Tensor descriptor for the output parameter tensor (input) + * @param paramOut Output parameter tensor (output) + * @param paramOutFloat16Desc Tensor descriptor for the output parameter tensor float16 (input, + * optional) + * @param paramOutFloat16 Output parameter tensor (output, optional) + * @param gradInDesc Tensor descriptor for the input gradient tensor (input) + * @param gradIn Input gradient tensor (input) + * @param expAvgInDesc Tensor descriptor for the input exponential moving average tensor + * (input) + * @param expAvgIn Input exponential moving average tensor (input) + * @param expAvgOutDesc Tensor descriptor for the output exponential moving average tensor + * (input) + * @param expAvgOut Output exponential moving average tensor (output) + * @param expAvgSqInDesc Tensor descriptor for the input exponential moving average squared + * tensor (input) + * @param expAvgSqIn Input exponential moving average squared tensor (input) + * @param expAvgSqOutDesc Tensor descriptor for the output exponential moving average squared + * tensor (input) + * @param expAvgSqOut Output exponential moving average squared tensor (output) + * @param maxExpAvgSqInDesc Tensor descriptor for the input maximum exponential moving average + * squared tensor. Used when amsgrad is true (input, optional) + * @param maxExpAvgSqIn Input maximum exponential moving average squared tensor. Used when + * amsgrad is true (input, optional) + * @param maxExpAvgSqOutDesc Tensor descriptor for the output maximum exponential moving average + * squared tensor. Used when amsgrad is true (input, optional) + * @param maxExpAvgSqOut Output maximum exponential moving average squared tensor. 
Used when + * amsgrad is true (output, optional) + * @param stateStepInDesc Tensor descriptor for the input state step tensor (input, optional) + * @param stateStepIn Input state step tensor (input, optional) + * @param stateStepOutDesc Tensor descriptor for the output state step tensor (input, optional) + * @param stateStepOut Output state step tensor that stores the updated step value. (output, + * optional) + * @param state_step Input state step, It is used when the step tensor is null. (input) + * @param lr Learning rate (input) + * @param beta1 Coefficient used for computing the first moment running average of + * gradient (input) + * @param beta2 Coefficient used for computing the second moment running average of + * gradient (input) + * @param weight_decay Weight decay (input) + * @param eps Term added to the denominator to improve numerical stability (input) + * @param amsgrad Flag indicating whether to use the AMSGrad variant of Adam (input) + * @param maximize Flag indicating whether to maximize the objective with respect to the + * parameters (input) + * @param adamw If it is true, the operation becomes AdamW (input) (not supported) + * @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional) + * @param gradScale Input grad scale tensor (input, optional) + * @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional) + * @param foundInf Tensor indicating presence of inf or nan in gradients. If true, skips + * operation and step update. 
(input, optional) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenFusedAdamWithOutput(miopenHandle_t handle, + const miopenTensorDescriptor_t paramInDesc, + void* paramIn, + const miopenTensorDescriptor_t paramOutDesc, + void* paramOut, + const miopenTensorDescriptor_t paramOutFloat16Desc, + void* paramOutFloat16, + const miopenTensorDescriptor_t gradInDesc, + const void* gradIn, + const miopenTensorDescriptor_t expAvgInDesc, + void* expAvgIn, + const miopenTensorDescriptor_t expAvgOutDesc, + void* expAvgOut, + const miopenTensorDescriptor_t expAvgSqInDesc, + void* expAvgSqIn, + const miopenTensorDescriptor_t expAvgSqOutDesc, + void* expAvgSqOut, + const miopenTensorDescriptor_t maxExpAvgSqInDesc, + void* maxExpAvgSqIn, + const miopenTensorDescriptor_t maxExpAvgSqOutDesc, + void* maxExpAvgSqOut, + const miopenTensorDescriptor_t stateStepInDesc, + void* stateStepIn, + const miopenTensorDescriptor_t stateStepOutDesc, + void* stateStepOut, + const unsigned int state_step, + const float lr, + const float beta1, + const float beta2, + const float weight_decay, + const float eps, + const bool amsgrad, + const bool maximize, + const bool adamw, + const miopenTensorDescriptor_t gradScaleDesc, + const void* gradScale, + const miopenTensorDescriptor_t foundInfDesc, + const void* foundInf); + +/** @} */ +// CLOSEOUT SGD DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/requirements.txt b/requirements.txt index b2a5f64172..719acd5b19 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@f0759faff4a1c3ba5f739dfed468530e0ee9f28b -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@1d784873eec6d4c41454b8733272aa5da073fbc6 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON diff --git 
a/src/CMakeLists.txt b/src/CMakeLists.txt index 2c38414952..89ae1da574 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -85,8 +85,10 @@ endfunction() set( MIOpen_Source activ/problem_description.cpp activ_api.cpp + adam/problem_description.cpp + adam_api.cpp + addlayernorm_api.cpp api/find2_0_commons.cpp - argmax_api.cpp batch_norm.cpp batch_norm_api.cpp batchnorm/problem_description.cpp @@ -97,10 +99,12 @@ set( MIOpen_Source conv/invokers/gcn_asm_1x1u.cpp conv/invokers/gcn_asm_1x1u_ss.cpp conv/invokers/gcn_asm_1x1u_us.cpp + conv/invokers/gcn_asm_wino.cpp conv/invokers/gen_x_w_y_pad.cpp conv/invokers/impl_gemm.cpp conv/invokers/impl_gemm_dynamic.cpp conv/invokers/ocl_wrw_rdc.cpp + conv/kernel_interface/winograd_kernel_interface.cpp conv/problem_description.cpp conv/solver_finders.cpp conv_algo_name.cpp @@ -122,9 +126,13 @@ set( MIOpen_Source fusion/problem_description.cpp generic_search.cpp graphapi/convolution.cpp + graphapi/engine.cpp + graphapi/enginecfg.cpp + graphapi/engineheur.cpp + graphapi/execution_plan.cpp graphapi/graphapi.cpp - graphapi/opgraph.cpp graphapi/matmul.cpp + graphapi/opgraph.cpp graphapi/pointwise.cpp graphapi/reduction.cpp graphapi/rng.cpp @@ -153,6 +161,7 @@ set( MIOpen_Source process.cpp ramdb.cpp readonlyramdb.cpp + reduceextreme_api.cpp reducetensor.cpp reducetensor_api.cpp reduce/problem_description.cpp @@ -169,6 +178,7 @@ set( MIOpen_Source solver/activ/bwd_1.cpp solver/activ/fwd_0.cpp solver/activ/fwd_1.cpp + solver/adam/adam.cpp solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp @@ -183,6 +193,7 @@ set( MIOpen_Source solver/batchnorm/forward_spatial_single.cpp solver/batchnorm/forward_training_ck.cpp solver/cat/forward_cat.cpp + solver/conv/conv_wino_fury_RxS.cpp solver/conv_asm_1x1u.cpp solver/conv_asm_1x1u_bias_activ_fused.cpp solver/conv_asm_1x1u_stride2.cpp @@ -254,7 +265,6 @@ set( MIOpen_Source solver/conv_ocl_dir2Dfwd_exhaustive_search.cpp 
solver/conv_ocl_dir2Dfwd1x1.cpp solver/conv_ocl_dir2Dfwdgen.cpp - solver/conv_wino_fury_RxS.cpp solver/conv_winoRxS.cpp solver/conv_winoRxS_fused.cpp solver/fft.cpp @@ -263,9 +273,12 @@ set( MIOpen_Source solver/gemm_common.cpp solver/gemm_wrw.cpp solver/groupnorm/forward_groupnorm.cpp + solver/layernorm/backward_t5layernorm.cpp + solver/layernorm/forward_addlayernorm.cpp solver/layernorm/forward_layernorm.cpp solver/layernorm/forward_layernorm2d_ck.cpp solver/layernorm/forward_layernorm4d_ck.cpp + solver/layernorm/forward_t5layernorm.cpp solver/mha/mha_solver_backward.cpp solver/mha/mha_solver_forward.cpp solver/pooling/forward2d.cpp @@ -274,11 +287,15 @@ set( MIOpen_Source solver/pooling/backward2d.cpp solver/pooling/backwardNd.cpp solver/reduce/forward_argmax.cpp + solver/reduce/forward_argmin.cpp + solver/reduce/forward_max.cpp + solver/reduce/forward_min.cpp solver/reduce/forward_sum.cpp solver/softmax/attn_softmax.cpp solver/softmax/softmax.cpp subbuffers.cpp sum_api.cpp + t5layernorm_api.cpp target_properties.cpp temp_file.cpp tensor.cpp @@ -409,6 +426,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/Conv_Winograd_v30_3_1_gfx11_fp32_f3x2_stride1.inc kernels/Conv_Winograd_v30_3_1_gfx11_fp32_f3x2_stride2.inc kernels/Conv_Winograd_v30_3_1_metadata.inc + kernels/MIOpenReduceExtreme.hpp kernels/bfloat16_dev.hpp kernels/conv_common.inc kernels/conv_sizes.inc @@ -426,6 +444,10 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/rocm_version.inc kernels/stride_array.hpp kernels/utilities.inc + kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc + kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc + kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc + kernels/winograd/Conv_Winograd_Fury_v2_4_1_metadata.inc kernels/workaround_issue_1431.hpp 
kernels/xform_bidirect_winograd_code.inc kernels/xform_data_filter.inc @@ -442,7 +464,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ${GPU_REFERENCE_KERNEL_ASM} ${GPU_BATCHED_TRANSPOSE_KERNEL_HIP} ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_SOURCE} - kernels/MIOpenArgmax.cpp + kernels/MIOpenAdam.cpp kernels/MIOpenCat.cpp kernels/MIOpenCheckNumerics.cpp kernels/MIOpenBatchNormActivBwdPerAct.cl @@ -471,6 +493,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenConv1x1S.cl kernels/MIOpenConv1x1J1.cl kernels/MIOpenConv1x1J1_stride.cl + kernels/MIOpenReduceExtreme.cpp kernels/MIOpenSoftmax.cl kernels/MIOpenSoftmaxAttn.cpp kernels/MIOpenSum.cpp @@ -543,11 +566,12 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/Conv_Winograd_v30_3_1_fp32_f3x2_dilation2.s kernels/Conv_Winograd_v30_3_1_fp32_f3x2_stride1.s kernels/Conv_Winograd_v30_3_1_fp32_f3x2_stride2.s - kernels/Conv_Winograd_Fury_v1_1_1_fp16_fp16acc_f2x3_stride1.s kernels/MIOpenConvBwdBias.cl kernels/MIOpenBatchNormActivInfer.cl kernels/MIOpenCTCLoss.cl kernels/MIOpenDropout.cl + kernels/winograd/Conv_Winograd_Fury_v2_4_1_fp16_fp16acc_f2x3_c16_stride1.s + kernels/winograd/Conv_Winograd_Fury_v2_4_1_fp16_fp16acc_f2x3_c32_stride1.s kernels/xform_data.s kernels/xform_filter.s kernels/xform_bidirect_winograd_data.s @@ -580,11 +604,12 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN configure_file(db_path.cpp.in ${PROJECT_BINARY_DIR}/db_path.cpp) list(APPEND MIOpen_Source activ.cpp - argmax.cpp + adam.cpp + addlayernorm.cpp cat.cpp groupnorm.cpp kernel_cache.cpp - layer_norm.cpp + layernorm.cpp lrn.cpp mlo_dir_conv.cpp exec_utils.cpp @@ -605,8 +630,10 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN hip/batched_transpose_sol.cpp hip/general_tensor_reorder_sol.cpp pooling.cpp + t5layernorm.cpp ocl/fusionopconvocl.cpp 
ocl/fusionopbiasbnactivocl.cpp + reduceextreme.cpp sum.cpp ${PROJECT_BINARY_DIR}/db_path.cpp ) @@ -783,7 +810,7 @@ if(MIOPEN_ENABLE_AI_KERNEL_TUNING OR MIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK) endif() foreach(MODEL_FILE ${MODEL_FILES}) get_filename_component(MODEL_FILE_FILENAME "${MODEL_FILE}" NAME) - configure_file("${MODEL_FILE}" "${PROJECT_BINARY_DIR}/${DATABASE_INSTALL_DIR}/${MODEL_FILE_FILENAME}" COPYONLY) + configure_file("${MODEL_FILE}" "${PROJECT_BINARY_DIR}/${DATABASE_INSTALL_DIR}/${MODEL_FILE_FILENAME}" COPYONLY) endforeach() endif() diff --git a/src/adam.cpp b/src/adam.cpp new file mode 100644 index 0000000000..c21deeebb7 --- /dev/null +++ b/src/adam.cpp @@ -0,0 +1,144 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t Adam(Handle& handle, + const TensorDescriptor& paramInDesc, + ConstData_t paramIn, + const TensorDescriptor& paramOutDesc, + Data_t paramOut, + const TensorDescriptor& paramOutFloat16Desc, + Data_t paramOutFloat16, + const TensorDescriptor& gradInDesc, + ConstData_t gradIn, + const TensorDescriptor& expAvgInDesc, + ConstData_t expAvgIn, + const TensorDescriptor& expAvgOutDesc, + Data_t expAvgOut, + const TensorDescriptor& expAvgSqInDesc, + ConstData_t expAvgSqIn, + const TensorDescriptor& expAvgSqOutDesc, + Data_t expAvgSqOut, + const TensorDescriptor& maxExpAvgSqInDesc, + ConstData_t maxExpAvgSqIn, + const TensorDescriptor& maxExpAvgSqOutDesc, + Data_t maxExpAvgSqOut, + const TensorDescriptor& gradScaleDesc, + ConstData_t gradScale, + const TensorDescriptor& foundInfDesc, + ConstData_t foundInf, + const TensorDescriptor& stepInDesc, + ConstData_t stepIn, + const TensorDescriptor& stepOutDesc, + Data_t stepOut, + const uint32_t step, + const float lr, + const float beta1, + const float beta2, + const float weight_decay, + const float eps, + const bool amsgrad, + const bool maximize, + const bool adamw, + const bool is_amp) +{ + const auto problem = adam::ProblemDescription{paramInDesc, /* descriptors and scalar hyperparameters only; no data pointers */ + paramOutDesc, + paramOutFloat16Desc, + gradInDesc, + expAvgInDesc, + expAvgOutDesc, + expAvgSqInDesc, + expAvgSqOutDesc, + maxExpAvgSqInDesc, + maxExpAvgSqOutDesc, + gradScaleDesc, + foundInfDesc, + stepInDesc, + stepOutDesc, + step, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + adamw, + is_amp}; + + const auto invoke_params = [&]() { /* immediately-invoked lambda so invoke_params can be const */ + auto tmp = adam::InvokeParams{}; /* run-time arguments: data pointers plus hyperparameters */ + tmp.type = InvokeType::Run; + + tmp.paramDesc = &paramInDesc; + tmp.gradDesc = &gradInDesc; + tmp.paramIn = paramIn; + tmp.paramOut = paramOut; + tmp.paramOutFloat16 = 
paramOutFloat16; + tmp.gradIn = gradIn; + tmp.expAvgIn = expAvgIn; + tmp.expAvgOut = expAvgOut; + tmp.expAvgSqIn = expAvgSqIn; + tmp.expAvgSqOut = expAvgSqOut; + tmp.maxExpAvgSqIn = maxExpAvgSqIn; + tmp.maxExpAvgSqOut = maxExpAvgSqOut; + tmp.gradScale = gradScale; + tmp.foundInf = foundInf; + tmp.stepIn = stepIn; + tmp.stepOut = stepOut; + + tmp.step = step; + tmp.lr = lr; + tmp.beta1 = beta1; + tmp.beta2 = beta2; + tmp.weight_decay = weight_decay; + tmp.eps = eps; + tmp.amsgrad = amsgrad; + tmp.maximize = maximize; + tmp.adamw = adamw; + + return tmp; + }(); + + const auto algo = AlgorithmName{"Adam"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/adam/problem_description.cpp b/src/adam/problem_description.cpp new file mode 100644 index 0000000000..710b3594ff --- /dev/null +++ b/src/adam/problem_description.cpp @@ -0,0 +1,63 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include + +#include + +namespace miopen { + +namespace adam { + +NetworkConfig ProblemDescription::MakeNetworkConfig() const /* builds the string key that identifies this problem configuration */ +{ + auto dtype = paramInDesc.GetType(); + auto kernel = IsAmp() ? "ampadam" : "adam"; + auto step_ind = ExistStepTensor() ? "device" : "host"; + + std::ostringstream ss; /* key layout: <kernel>[w][packed]step<device|host>dtype<T>[grad_dtype<G>] */ + + ss << kernel; + if(IsAdamW()) + ss << "w"; + if(IsAllPacked()) + ss << "packed"; + ss << "step" << step_ind; + ss << "dtype" << dtype; + if(IsAmp()) /* the gradient dtype is part of the key only on the AMP path */ + { + auto grad_dtype = gradInDesc.GetType(); + ss << "grad_dtype" << grad_dtype; + } + + return NetworkConfig{ss.str()}; +} + +} // namespace adam + +} // namespace miopen diff --git a/src/adam_api.cpp b/src/adam_api.cpp new file mode 100644 index 0000000000..3911b9fc9d --- /dev/null +++ b/src/adam_api.cpp @@ -0,0 +1,306 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include + +static void LogCmdAdam(const miopenTensorDescriptor_t paramDesc, + const float lr, + const float beta1, + const float beta2, + const float weight_decay, + const float eps, + const bool amsgrad, + const bool maximize, + const bool adamw, + const bool is_amp) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(paramDesc).GetType(); + if(is_amp) + { + ss << "ampadam"; + } + else + { + ss << "adam"; + } + if(adamw) + { + ss << "w"; + } + + if(dtype == miopenHalf) + { + ss << "fp16"; + } + + std::string batch_sz; + auto dims = miopen::deref(paramDesc).GetLengths(); + for(auto dim : dims) + { + batch_sz += std::to_string(dim); + batch_sz += "x"; + } + batch_sz.pop_back(); + ss << " -d " << batch_sz << " -l " << lr << " -1 " << beta1 << " -2 " << beta2 << " -e " + << eps << " -W " << weight_decay << " -a " << amsgrad << " -m " << maximize; + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +#define CHECK_DESC_EXIST(desc) (((desc) != nullptr) ? 
miopen::deref(desc) : dummyDesc) + +extern "C" miopenStatus_t miopenFusedAdam(miopenHandle_t handle, + const miopenTensorDescriptor_t paramDesc, + void* param, + const miopenTensorDescriptor_t gradDesc, + const void* grad, + const miopenTensorDescriptor_t expAvgDesc, + void* expAvg, + const miopenTensorDescriptor_t expAvgSqDesc, + void* expAvgSq, + const miopenTensorDescriptor_t maxExpAvgSqDesc, + void* maxExpAvgSq, + const miopenTensorDescriptor_t stateStepDesc, + void* stateStep, + const unsigned int state_step, + const float lr, + const float beta1, + const float beta2, + const float weight_decay, + const float eps, + const bool amsgrad, + const bool maximize, + const bool adamw, + const miopenTensorDescriptor_t gradScaleDesc, + const void* gradScale, + const miopenTensorDescriptor_t foundInfDesc, + const void* foundInf) +{ + MIOPEN_LOG_FUNCTION(handle, + paramDesc, + param, + gradDesc, + grad, + expAvgDesc, + expAvg, + expAvgSqDesc, + expAvgSq, + maxExpAvgSqDesc, + maxExpAvgSq, + stateStepDesc, + stateStep, + state_step, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + adamw, + gradScaleDesc, + gradScale, + foundInfDesc, + foundInf); + + const miopen::TensorDescriptor dummyDesc; /* substituted by CHECK_DESC_EXIST when an optional descriptor is nullptr */ + bool is_amp = (foundInfDesc != nullptr || gradScaleDesc != nullptr); /* set when either a foundInf or a gradScale descriptor is supplied */ + + LogCmdAdam(paramDesc, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, adamw, is_amp); + + return miopen::try_([&] { + miopen::Adam(miopen::deref(handle), /* in-place variant: each state tensor is passed as both the input and the output argument */ + miopen::deref(paramDesc), + DataCast(param), + miopen::deref(paramDesc), + DataCast(param), + dummyDesc, /* paramOutFloat16Desc / paramOutFloat16: not exposed by this entry point */ + nullptr, + miopen::deref(gradDesc), + DataCast(grad), + miopen::deref(expAvgDesc), + DataCast(expAvg), + miopen::deref(expAvgDesc), + DataCast(expAvg), + miopen::deref(expAvgSqDesc), + DataCast(expAvgSq), + miopen::deref(expAvgSqDesc), + DataCast(expAvgSq), + CHECK_DESC_EXIST(maxExpAvgSqDesc), + DataCast(maxExpAvgSq), + CHECK_DESC_EXIST(maxExpAvgSqDesc), + DataCast(maxExpAvgSq), + CHECK_DESC_EXIST(gradScaleDesc), + 
DataCast(gradScale), + CHECK_DESC_EXIST(foundInfDesc), + DataCast(foundInf), + CHECK_DESC_EXIST(stateStepDesc), + DataCast(stateStep), + CHECK_DESC_EXIST(stateStepDesc), + DataCast(stateStep), + state_step, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + adamw, + is_amp); + }); +} + +extern "C" miopenStatus_t +miopenFusedAdamWithOutput(miopenHandle_t handle, + const miopenTensorDescriptor_t paramInDesc, + void* paramIn, + const miopenTensorDescriptor_t paramOutDesc, + void* paramOut, + const miopenTensorDescriptor_t paramOutFloat16Desc, + void* paramOutFloat16, + const miopenTensorDescriptor_t gradInDesc, + const void* gradIn, + const miopenTensorDescriptor_t expAvgInDesc, + void* expAvgIn, + const miopenTensorDescriptor_t expAvgOutDesc, + void* expAvgOut, + const miopenTensorDescriptor_t expAvgSqInDesc, + void* expAvgSqIn, + const miopenTensorDescriptor_t expAvgSqOutDesc, + void* expAvgSqOut, + const miopenTensorDescriptor_t maxExpAvgSqInDesc, + void* maxExpAvgSqIn, + const miopenTensorDescriptor_t maxExpAvgSqOutDesc, + void* maxExpAvgSqOut, + const miopenTensorDescriptor_t stateStepInDesc, + void* stateStepIn, + const miopenTensorDescriptor_t stateStepOutDesc, + void* stateStepOut, + const unsigned int state_step, + const float lr, + const float beta1, + const float beta2, + const float weight_decay, + const float eps, + const bool amsgrad, + const bool maximize, + const bool adamw, + const miopenTensorDescriptor_t gradScaleDesc, + const void* gradScale, + const miopenTensorDescriptor_t foundInfDesc, + const void* foundInf) +{ + MIOPEN_LOG_FUNCTION(handle, + paramInDesc, + paramIn, + paramOutDesc, + paramOut, + gradInDesc, + gradIn, + expAvgInDesc, + expAvgIn, + expAvgOutDesc, + expAvgOut, + expAvgSqInDesc, + expAvgSqIn, + expAvgSqOutDesc, + expAvgSqOut, + maxExpAvgSqInDesc, + maxExpAvgSqIn, + maxExpAvgSqOutDesc, + maxExpAvgSqOut, + stateStepInDesc, + stateStepIn, + stateStepOutDesc, + stateStepOut, + state_step, + lr, + beta1, + beta2, + 
weight_decay, + eps, + amsgrad, + maximize, + adamw, + gradScaleDesc, + gradScale, + foundInfDesc, + foundInf); + + const miopen::TensorDescriptor dummyDesc; + bool is_amp = (foundInfDesc != nullptr || gradScaleDesc != nullptr); + + LogCmdAdam(paramInDesc, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, adamw, is_amp); + + return miopen::try_([&] { + miopen::Adam(miopen::deref(handle), + miopen::deref(paramInDesc), + DataCast(paramIn), + miopen::deref(paramOutDesc), + DataCast(paramOut), + CHECK_DESC_EXIST(paramOutFloat16Desc), + DataCast(paramOutFloat16), + miopen::deref(gradInDesc), + DataCast(gradIn), + miopen::deref(expAvgInDesc), + DataCast(expAvgIn), + miopen::deref(expAvgOutDesc), + DataCast(expAvgOut), + miopen::deref(expAvgSqInDesc), + DataCast(expAvgSqIn), + miopen::deref(expAvgSqOutDesc), + DataCast(expAvgSqOut), + CHECK_DESC_EXIST(maxExpAvgSqInDesc), + DataCast(maxExpAvgSqIn), + CHECK_DESC_EXIST(maxExpAvgSqOutDesc), + DataCast(maxExpAvgSqOut), + CHECK_DESC_EXIST(gradScaleDesc), + DataCast(gradScale), + CHECK_DESC_EXIST(foundInfDesc), + DataCast(foundInf), + CHECK_DESC_EXIST(stateStepInDesc), + DataCast(stateStepIn), + CHECK_DESC_EXIST(stateStepOutDesc), + DataCast(stateStepOut), + state_step, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + adamw, + is_amp); + }); +} diff --git a/src/addlayernorm.cpp b/src/addlayernorm.cpp new file mode 100644 index 0000000000..271c81ca60 --- /dev/null +++ b/src/addlayernorm.cpp @@ -0,0 +1,92 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t AddLayerNormForward(Handle& handle, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& x2Desc, + ConstData_t x2, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& biasDesc, + ConstData_t bias, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& meanDesc, + Data_t mean, + const TensorDescriptor& rstdDesc, + Data_t rstd, + miopenNormMode_t mode, + float epsilon, + int32_t normalized_dim) +{ + const auto problem = layernorm::ProblemDescription{mode, + xDesc, + x2Desc, + weightDesc, + biasDesc, + yDesc, + meanDesc, + rstdDesc, + epsilon, + normalized_dim}; + + const auto invoke_params = [&]() { + auto tmp = layernorm::AddInvokeParams{}; + tmp.type = InvokeType::Run; + tmp.xDesc = &xDesc; + tmp.x = x; + tmp.x2 = x2; + tmp.weight = weight; + tmp.bias = bias; + tmp.y = y; + tmp.mean = mean; + tmp.rstd = rstd; + tmp.epsilon = epsilon; + tmp.normalized_dim = normalized_dim; + tmp.mode = mode; + return tmp; + }(); + + const auto algo = AlgorithmName{"AddLayerNormForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/addlayernorm_api.cpp b/src/addlayernorm_api.cpp new file mode 100644 index 0000000000..8b9ed7e969 --- /dev/null +++ b/src/addlayernorm_api.cpp @@ -0,0 +1,138 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +static void +LogCmdAddLayerNorm(const miopenTensorDescriptor_t xDesc, const miopenNormMode_t mode, bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "addlayernormfp16"; + } + else if(dtype == miopenFloat) + { + ss << "addlayernormfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "addlayernormbfp16"; + } + + int32_t size = {0}; + miopenGetTensorDescriptorSize(xDesc, &size); + ss << " -n " << miopen::deref(xDesc).GetLengths()[0] << " -c " + << miopen::deref(xDesc).GetLengths()[1]; + if(size == 5) + { + ss << " -D " << miopen::deref(xDesc).GetLengths()[2] << " -H " + << miopen::deref(xDesc).GetLengths()[3] << " -W " + << miopen::deref(xDesc).GetLengths()[4]; + } + else if(size == 4) + { + ss << " -H " << miopen::deref(xDesc).GetLengths()[2] << " -W " + << miopen::deref(xDesc).GetLengths()[3]; + } + else if(size == 3) + { + ss << " -W " << miopen::deref(xDesc).GetLengths()[2]; + } + + ss << " -F " << ((is_fwd) ? 
"1" : "2") << " -m " << mode; + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenAddLayerNormForward(miopenHandle_t handle, + miopenNormMode_t mode, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t x2Desc, + const void* x2, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const miopenTensorDescriptor_t biasDesc, + const void* bias, + const float epsilon, + const int32_t normalized_dim, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t meanDesc, + void* mean, + const miopenTensorDescriptor_t rstdDesc, + void* rstd) +{ + MIOPEN_LOG_FUNCTION(handle, + mode, + xDesc, + x, + x2Desc, + x2, + weightDesc, + weight, + biasDesc, + bias, + epsilon, + normalized_dim, + yDesc, + y, + meanDesc, + mean, + rstdDesc, + rstd); + + LogCmdAddLayerNorm(xDesc, mode, true); + return miopen::try_([&] { + miopen::AddLayerNormForward(miopen::deref(handle), + miopen::deref(xDesc), + DataCast(x), + miopen::deref(x2Desc), + DataCast(x2), + miopen::deref(weightDesc), + DataCast(weight), + miopen::deref(biasDesc), + DataCast(bias), + miopen::deref(yDesc), + DataCast(y), + miopen::deref(meanDesc), + DataCast(mean), + miopen::deref(rstdDesc), + DataCast(rstd), + mode, + epsilon, + normalized_dim); + }); +} diff --git a/src/conv/invokers/gcn_asm_wino.cpp b/src/conv/invokers/gcn_asm_wino.cpp new file mode 100644 index 0000000000..8a574cf292 --- /dev/null +++ b/src/conv/invokers/gcn_asm_wino.cpp @@ -0,0 +1,162 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace miopen {
+namespace conv {
+
+// Builds the invoker factory for the Winograd V2 assembly shader.
+//
+// The returned factory takes the compiled kernels and yields an invoker that:
+//  1. picks the data/filter/output addresses from the invoke params, using the
+//     x/dy/dw tensors for Direction::BackwardWeights and in/w/out otherwise;
+//  2. when args.sync_period is non-zero, runs the kernel through handle.Run()
+//     with the second argument set and zeroes the sync buffer (taken from the
+//     workspace pointer) for sync_buffer_size bytes before the dispatch;
+//  3. launches kernels[0] with the argument list below.
+//
+// args, direction-derived flags and sync_buffer_size are captured by value.
+// NOTE(review): the template arguments of std::vector, CastTo and static_cast
+// are missing in this copy of the file - restore them from the original source.
+InvokerFactory MakeGcnAsmWinoV2InvokerFactory(const WinoShaderArgsV2& args,
+                                              Direction direction,
+                                              std::size_t sync_buffer_size)
+{
+    const bool is_backWrW = (direction == Direction::BackwardWeights);
+    // A non-zero sync period selects the launch mode that needs a sync buffer.
+    const bool coop_launch = (args.sync_period != 0);
+
+    return [=](const std::vector& kernels) {
+        return [=](const Handle& handle, const AnyInvokeParams& primitive_params) {
+            const auto k = handle.Run(kernels[0], coop_launch);
+
+            // Tensor roles differ between the WrW direction and the other two.
+            ConstData_t data_addr = !is_backWrW
+                                        ? primitive_params.CastTo().tensors.in
+                                        : primitive_params.CastTo().tensors.x;
+            ConstData_t filter_addr = !is_backWrW
+                                          ? primitive_params.CastTo().tensors.w
+                                          : primitive_params.CastTo().tensors.dy;
+            Data_t output_addr = !is_backWrW
+                                     ? primitive_params.CastTo().tensors.out
+                                     : primitive_params.CastTo().tensors.dw;
+            // The workspace doubles as the sync buffer; left null when unused.
+            Data_t sync_addr = nullptr;
+            if(coop_launch)
+            {
+                sync_addr = !is_backWrW ? primitive_params.CastTo().workSpace
+                                        : primitive_params.CastTo().workSpace;
+            }
+
+            // Bias and accumulation buffers are not used here: addresses and
+            // all byte offsets are passed to the shader as zero.
+            uint64_t bias_addr = 0;
+            uint64_t acc_addr = 0;
+
+            uint64_t d_offset = 0;
+            uint64_t f_offset = 0;
+            uint64_t o_offset = 0;
+
+            uint64_t b_offset = 0;
+            uint64_t a_offset = 0;
+
+            // clang-format off
+            MIOPEN_LOG_I2(" N=" << args.N << " C=" << args.C << " H=" << args.H << " W=" << args.W
+                << " K=" << args.K << " R=" << args.R << " S=" << args.S
+                << " pad_H=" << args.pad_h << " pad_W=" << args.pad_w
+                << " out_H=" << args.out_h << " out_W=" << args.out_w
+                << " G=" << args.G
+                << " alpha=" << args.alpha << " beta=" << args.beta << " act_mode=" << args.activation_mode
+                << " d_offset=" << d_offset << " f_offset=" << f_offset
+                << " o_offset=" << o_offset << " b_offset=" << b_offset
+                << " d_N_stride=" << args.d_N_stride << " d_C_stride=" << args.d_C_stride
+                << " d_H_stride=" << args.d_H_stride << " d_G_stride=" << args.d_G_stride
+                << " f_K_stride=" << args.f_K_stride << " f_C_stride=" << args.f_C_stride
+                << " f_R_stride=" << args.f_R_stride << " f_G_stride=" << args.f_G_stride
+                << " o_N_stride=" << args.o_N_stride << " o_K_stride=" << args.o_K_stride
+                << " o_H_stride=" << args.o_H_stride << " o_G_stride=" << args.o_G_stride
+                << " n_groups=" << args.n_groups << " flags64=" << args.flags64
+                << " sync_limit=" << static_cast(args.sync_limit)
+                << " sync_period=" << static_cast(args.sync_period));
+            // clang-format on
+
+            if(coop_launch)
+            {
+                // Sync buffer that has to be zeroed before each shader dispatch
+#if MIOPEN_BACKEND_HIP
+                auto status = hipMemsetAsync(sync_addr, 0, sync_buffer_size, handle.GetStream());
+                if(status != hipSuccess)
+                    MIOPEN_THROW_HIP_STATUS(status, "hipMemsetAsync() failed");
+#else
+#error "Unsupported backend"
+#endif
+            }
+
+            // The argument order below is the shader's kernel-argument layout;
+            // do not reorder.
+            // clang-format off
+            // Any reserved fields should be set to 0
+            k(args.N, // uint32_t, batch size
+              args.C, // uint32_t, number of input channels in each filter group
+              args.H, // uint32_t, input height
+              args.W, // uint32_t, input width
+              args.K, // uint32_t, number of output channels in each filter group
+              args.n_groups, // uint32_t, number of shader groups
+              args.flags64, // uint64_t, shader flags
+              data_addr, // uint64_t, address of input tensor
+              filter_addr, // uint64_t, address of filter tensor
+              output_addr, // uint64_t, address of output tensor
+              static_cast(0), // uint64_t, not used, for backward compatibility only
+              args.R, // uint32_t, filter height
+              args.S, // uint32_t, filter width
+              args.pad_h, // int32_t, padding in h dimension
+              args.pad_w, // int32_t, padding in w dimension
+              args.out_h, // uint32_t, output height
+              args.out_w, // uint32_t, output width
+              bias_addr, // uint64_t, address of bias buffer
+              args.alpha, // fp32, activation parameter alpha
+              args.beta, // fp32, activation parameter beta
+              d_offset, // uint64_t, byte offset for buffer referenced by data_addr
+              f_offset, // uint64_t, byte offset for buffer referenced by filter_addr
+              o_offset, // uint64_t, byte offset for buffer referenced by output_addr
+              b_offset, // uint64_t, byte offset for buffer referenced by bias_addr
+              args.d_N_stride, // uint32_t, stride in number of elements of the N dimension of the input data buffer
+              args.d_C_stride, // uint32_t, stride in number of elements of the C dimension of the input data buffer
+              args.d_H_stride, // uint32_t, stride in number of elements of the H dimension of the input data buffer
+              static_cast(0), // uint32_t, reserved
+              args.f_K_stride, // uint32_t, stride in number of elements of the K dimension of the filter buffer
+              args.f_C_stride, // uint32_t, stride in number of elements of the C dimension of the filter buffer
+              args.f_R_stride, // uint32_t, stride in number of elements of the R dimension of the filter buffer
+              static_cast(0), // uint32_t, reserved
+              args.o_N_stride, // uint32_t, stride in number of elements of the N dimension of the output buffer
+              args.o_K_stride, // uint32_t, stride in number of elements of the K dimension of the output buffer
+              args.o_H_stride, // uint32_t, stride in number of elements of the H dimension of the output buffer
+              static_cast(0), // uint32_t, reserved
+              args.G, // uint32_t, number of filter groups
+              args.d_G_stride, // uint32_t, stride in number of elements of the G dimension of the input data buffer
+              args.f_G_stride, // uint32_t, stride in number of elements of the G dimension of the filter buffer
+              args.o_G_stride, // uint32_t, stride in number of elements of the G dimension of the output buffer
+              args.activation_mode, // uint8_t, activation mode
+              args.sync_limit, // uint8_t, maximum number of sync attempts
+              args.sync_period, // uint8_t, synchronization period
+              static_cast(0), // uint8_t, reserved
+              static_cast(0), // uint32_t, reserved
+              sync_addr, // uint64_t, address of sync buffer
+              acc_addr, // uint64_t, address of accumulation buffer
+              a_offset); // uint64_t, byte offset for buffer referenced by acc_addr
+            // clang-format on
+        };
+    };
+}
+
+} // namespace conv
+} // namespace miopen
diff --git a/src/conv/kernel_interface/winograd_kernel_interface.cpp b/src/conv/kernel_interface/winograd_kernel_interface.cpp
new file mode 100644
index 0000000000..975d159d43
--- /dev/null
+++ b/src/conv/kernel_interface/winograd_kernel_interface.cpp
@@ -0,0 +1,226 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#include
+
+#include
+#include
+
+namespace miopen {
+namespace conv {
+
+namespace {
+
+// Assigns src_v to dst_v and reports whether the value survived the conversion.
+//
+// Returns false when the assigned value no longer compares equal to the source,
+// or when one of the sign checks below fails; returns true otherwise. Both types
+// must be integral. dst_v is written even when the function returns false.
+// NOTE(review): the template parameter list and the arguments of
+// std::is_integral_v / std::numeric_limits are missing in this copy, so which
+// type each signedness test refers to cannot be read from here - restore them
+// from the original source.
+template
+bool AssignAndCheck(Tdst& dst_v, Tsrc src_v) noexcept
+{
+    static_assert(std::is_integral_v);
+    static_assert(std::is_integral_v);
+
+    dst_v = src_v;
+
+    // Round-trip comparison catches truncation.
+    if(dst_v != src_v)
+        return false;
+
+    // The equality above can still hold after a sign change (mixed
+    // signed/unsigned comparison), hence the explicit sign checks.
+    if constexpr(std::numeric_limits::is_signed)
+    {
+        if constexpr(std::numeric_limits::is_signed)
+            return (dst_v >= 0 && src_v >= 0) || (dst_v < 0 && src_v < 0);
+        else
+            return src_v >= 0;
+    }
+    else if constexpr(std::numeric_limits::is_signed)
+    {
+        return dst_v >= 0;
+    }
+
+    return true;
+}
+
+} // namespace
+
+// Fills the convolution geometry fields (G, N, C, H, W, K, R, S, out_h, out_w,
+// pad_h, pad_w) from the problem description.
+//
+// Returns false - possibly after some fields were already written - when the
+// problem is not 2D, has a bias, has a non-unit W stride on any tensor, when a
+// value does not fit its destination field, or when one of the batch/channel/H
+// strides exceeds the numeric limit checked at the end. Returns true otherwise.
+bool WinoShaderArgsV2::SetConvParams(const ProblemDescription& problem)
+{
+    if(!problem.Is2d())
+        return false;
+    if(problem.GetBias() != 0)
+        return false;
+    if(!(problem.GetInStrideW() == 1 && problem.GetWeightsStrideW() == 1 &&
+         problem.GetOutStrideW() == 1))
+    {
+        return false;
+    }
+
+    if(!AssignAndCheck(G, problem.GetGroupCount()))
+        return false;
+
+    // NOTE(review): divides by G - assumes the group count is never zero.
+    const auto in_c_per_group = problem.GetInChannels() / G;
+    const auto out_c_per_group = problem.GetOutChannels() / G;
+
+    if(!problem.IsDirectionBackwardWrW())
+    {
+        if(!AssignAndCheck(N, problem.GetBatchSize()))
+            return false;
+        if(!AssignAndCheck(C, in_c_per_group))
+            return false;
+        if(!AssignAndCheck(H, problem.GetInHeight()))
+            return false;
+        if(!AssignAndCheck(W, problem.GetInWidth()))
+            return false;
+        if(!AssignAndCheck(K, out_c_per_group))
+            return false;
+        if(!AssignAndCheck(R, problem.GetWeightsHeight()))
+            return false;
+        if(!AssignAndCheck(S, problem.GetWeightsWidth()))
+            return false;
+        if(!AssignAndCheck(out_h, problem.GetOutHeight()))
+            return false;
+        if(!AssignAndCheck(out_w, problem.GetOutWidth()))
+            return false;
+    }
+    else
+    {
+        // Backward-weights: the problem's dimensions are mapped onto the shader
+        // fields in a different arrangement (e.g. N takes the per-group output
+        // channels and C takes the batch size).
+        if(!AssignAndCheck(N, out_c_per_group))
+            return false;
+        if(!AssignAndCheck(C, problem.GetBatchSize()))
+            return false;
+        if(!AssignAndCheck(H, problem.GetOutHeight()))
+            return false;
+        if(!AssignAndCheck(W, problem.GetOutWidth()))
+            return false;
+        if(!AssignAndCheck(K, in_c_per_group))
+            return false;
+        if(!AssignAndCheck(R, problem.GetInHeight()))
+            return false;
+        if(!AssignAndCheck(S, problem.GetInWidth()))
+            return false;
+        if(!AssignAndCheck(out_h, problem.GetWeightsHeight()))
+            return false;
+        if(!AssignAndCheck(out_w, problem.GetWeightsWidth()))
+            return false;
+    }
+
+    if(!problem.IsDirectionBackwardData())
+    {
+        if(!AssignAndCheck(pad_h, problem.GetPadH()))
+            return false;
+        if(!AssignAndCheck(pad_w, problem.GetPadW()))
+            return false;
+    }
+    else
+    {
+        if(!AssignAndCheck(pad_h, problem.GetBackwardPadH()))
+            return false;
+        if(!AssignAndCheck(pad_w, problem.GetBackwardPadW()))
+            return false;
+    }
+
+    // Range checks on the tensor strides.
+    // NOTE(review): the std::numeric_limits template argument is missing in this
+    // copy; the limit type cannot be determined from here.
+    if(problem.GetInBatchStride() > std::numeric_limits::max() ||
+       problem.GetInChannelStride() > std::numeric_limits::max() ||
+       problem.GetInStrideH() > std::numeric_limits::max())
+        return false;
+    if(problem.GetWeightsStrideK() > std::numeric_limits::max() ||
+       problem.GetWeightsStrideC() > std::numeric_limits::max() ||
+       problem.GetWeightsStrideH() > std::numeric_limits::max())
+        return false;
+    if(problem.GetOutBatchStride() > std::numeric_limits::max() ||
+       problem.GetOutChannelStride() > std::numeric_limits::max() ||
+       problem.GetOutStrideH() > std::numeric_limits::max())
+        return false;
+
+    return true;
+}
+
+// Computes the d_/f_/o_ stride fields from the memory layouts of the problem.
+//
+// Reads N, C, H, W, K, R, S, out_h, out_w and G from this object, so
+// SetConvParams() has to be called first. The layouts are derived from the
+// problem's in/out layouts (with N and C swapped for backward-weights) and the
+// strides are taken from BuffInfo objects built for those layouts.
+void WinoShaderArgsV2::SetStrides(const ProblemDescription& problem)
+{
+    MemLayout_t d_layout, o_layout, f_layout;
+
+    if(!problem.IsDirectionBackwardWrW())
+    {
+        d_layout = GetGroupConvLayout(GetMemLayout_t(problem.GetInLayout()), true);
+        o_layout = GetGroupConvLayout(GetMemLayout_t(problem.GetOutLayout()), true);
+        // clang-format off
+        f_layout = GetGroupConvLayout(problem.IsDirectionForward() ? MemLayout_t::NCHW
+                                                                   : GetSwappedNCLayout(MemLayout_t::NCHW), false);
+        // clang-format on
+    }
+    else
+    {
+        d_layout =
+            GetGroupConvLayout(GetSwappedNCLayout(GetMemLayout_t(problem.GetInLayout())), true);
+        o_layout =
+            GetGroupConvLayout(GetSwappedNCLayout(GetMemLayout_t(problem.GetOutLayout())), false);
+        f_layout = GetGroupConvLayout(GetSwappedNCLayout(MemLayout_t::NCHW), true);
+    }
+
+    // TODO Make a constructor that takes unsigned int
+    BuffInfo d_buf(d_layout, N, C, H, W, G, GetTypeSize(problem.GetInDataType()));
+    BuffInfo o_buf(o_layout, N, K, out_h, out_w, G, GetTypeSize(problem.GetOutDataType()));
+    BuffInfo f_buf(f_layout, K, C, R, S, G, GetTypeSize(problem.GetWeightsDataType()));
+
+    const auto& d_strides = d_buf.stride;
+    const auto& f_strides = f_buf.stride;
+    const auto& o_strides = o_buf.stride;
+
+    d_N_stride = d_strides.nk;
+    d_C_stride = d_strides.c;
+    d_H_stride = d_strides.h;
+    d_G_stride = d_strides.g;
+
+    f_K_stride = f_strides.nk;
+    f_C_stride = f_strides.c;
+    f_R_stride = f_strides.h;
+    f_G_stride = f_strides.g;
+
+    o_N_stride = o_strides.nk;
+    o_K_stride = o_strides.c;
+    o_H_stride = o_strides.h;
+    o_G_stride = o_strides.g;
+}
+
+// Stores the fused activation mode and its alpha/beta parameters.
+void WinoShaderArgsV2::SetActivParams(WinoShaderActivationModeV2_t mode,
+                                      float alpha_,
+                                      float beta_) noexcept
+{
+    // Fused activation parameters
+    activation_mode = mode;
+    alpha = alpha_;
+    beta = beta_;
+}
+
+// Stores the shader launch parameters: number of groups, flags and the
+// synchronization limit/period.
+void WinoShaderArgsV2::SetShaderParams(uint32_t n_groups_,
+                                       WinoShaderFlagsV2 flags_,
+                                       uint8_t sync_limit_,
+                                       uint8_t sync_period_) noexcept
+{
+    n_groups = n_groups_;
+    flags64 = flags_;
+    sync_limit = sync_limit_;
+    sync_period = sync_period_;
+}
+
+} // namespace conv
+} // namespace miopen
diff --git a/src/graphapi/engine.cpp b/src/graphapi/engine.cpp
new file mode 100644
index 0000000000..dd629eee6b
--- /dev/null
+++ b/src/graphapi/engine.cpp
@@ -0,0 +1,215 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced
Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include + +namespace miopen { + +namespace graphapi { + +EngineBuilder& EngineBuilder::setOpGraph(const OpGraph* opGraph) +{ + mOpGraph = checkPtr(opGraph); + return *this; +} + +EngineBuilder& EngineBuilder::setGlobalIndex(int64_t globalIndex) +{ + if(globalIndex >= 0) + { + mGlobalIndex = globalIndex; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + return *this; +} + +EngineBuilder& EngineBuilder::setSmCount(int32_t smCount) +{ + if(smCount >= 0) + { + mSmCount = smCount; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + return *this; +} + +Engine EngineBuilder::build() +{ + if(mOpGraph != nullptr && mGlobalIndexSet && mGlobalIndex < mOpGraph->getEngines().size()) + { + // TODO: validate mSmCount + Engine engine = mOpGraph->getEngines()[mGlobalIndex]; + engine.mGlobalIndex = mGlobalIndex; + engine.mSmCount = mSmCount; + return engine; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } +} + +void BackendEngineDescriptor::setAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t elementCount, + void* arrayOfElements) +{ + if(mFinalized) + { + MIOPEN_THROW(miopenStatusNotInitialized); + } + + switch(attributeName) + { + case MIOPEN_ATTR_ENGINE_OPERATION_GRAPH: + if(attributeType == MIOPEN_TYPE_BACKEND_DESCRIPTOR && elementCount == 1) + { + miopenBackendDescriptor_t& apiDescriptor = + deref(static_cast(arrayOfElements)); + BackendDescriptor& backendDescriptor = deref(apiDescriptor); + + if(!backendDescriptor.isFinalized()) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + BackendOperationGraphDescriptor& operationGraphDescriptor = + dynamic_cast(backendDescriptor); + mBuilder.setOpGraph(operationGraphDescriptor.getOperationGraph()); + mOpGraphDescriptor = apiDescriptor; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_ENGINE_GLOBAL_INDEX: + 
if(attributeType == MIOPEN_TYPE_INT64 && elementCount == 1) + { + mBuilder.setGlobalIndex(*static_cast(arrayOfElements)); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_ENGINE_SM_COUNT_TARGET: + if(attributeType == MIOPEN_TYPE_INT32 && elementCount == 1) + { + mBuilder.setSmCount(*static_cast(arrayOfElements)); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + default: MIOPEN_THROW(miopenStatusBadParm); + } +} + +void BackendEngineDescriptor::finalize() +{ + if(mFinalized) + { + MIOPEN_THROW(miopenStatusNotInitialized); + } +} + +void BackendEngineDescriptor::getAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t* elementCount, + void* arrayOfElements) +{ + if(!mFinalized) + { + MIOPEN_THROW(miopenStatusNotInitialized); + } + + switch(attributeName) + { + case MIOPEN_ATTR_ENGINE_OPERATION_GRAPH: + if(attributeType == MIOPEN_TYPE_BACKEND_DESCRIPTOR && requestedElementCount == 1) + { + *elementCount = 1; + *static_cast(arrayOfElements) = mOpGraphDescriptor; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_ENGINE_GLOBAL_INDEX: + if(attributeType == MIOPEN_TYPE_INT64 && requestedElementCount == 1) + { + *elementCount = 1; + *static_cast(arrayOfElements) = mEngine.getGlobalIndex(); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_ENGINE_SM_COUNT_TARGET: + if(attributeType == MIOPEN_TYPE_INT32 && requestedElementCount == 1) + { + *elementCount = 1; + *static_cast(arrayOfElements) = mEngine.getSmCount(); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_ENGINE_BEHAVIOR_NOTE: + case MIOPEN_ATTR_ENGINE_KNOB_INFO: + case MIOPEN_ATTR_ENGINE_LAYOUT_INFO: + case MIOPEN_ATTR_ENGINE_NUMERICAL_NOTE: + // TODO: figure out what we can return here + *elementCount = 0; + break; + + default: MIOPEN_THROW(miopenStatusBadParm); + } 
+} + +} // namespace graphapi + +} // namespace miopen diff --git a/src/graphapi/enginecfg.cpp b/src/graphapi/enginecfg.cpp new file mode 100644 index 0000000000..44a583e64a --- /dev/null +++ b/src/graphapi/enginecfg.cpp @@ -0,0 +1,158 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include + +namespace miopen { + +namespace graphapi { +/* Lvalue overload: returns a copy of mEngineCfg; throws miopenStatusBadParm unless mEngineSet. */ +EngineCfg EngineCfgBuilder::build() & +{ + if(mEngineSet) + { + return mEngineCfg; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } +} +/* Rvalue overload: moves mEngineCfg out of the builder under the same mEngineSet check. */ +EngineCfg EngineCfgBuilder::build() && +{ + if(mEngineSet) + { + return std::move(mEngineCfg); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } +} +/* Sets one attribute; throws miopenStatusNotInitialized once the descriptor is finalized. */ +void BackendEngineCfgDescriptor::setAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t elementCount, + void* arrayOfElements) +{ + if(mFinalized) + { + MIOPEN_THROW(miopenStatusNotInitialized); + } + + switch(attributeName) + { + case MIOPEN_ATTR_ENGINECFG_ENGINE: + if(attributeType == MIOPEN_TYPE_BACKEND_DESCRIPTOR && elementCount == 1) + { + miopenBackendDescriptor_t apiDescriptor = + deref(static_cast(arrayOfElements)); + BackendDescriptor& backendDescriptor = deref(apiDescriptor); + + if(!backendDescriptor.isFinalized()) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + BackendEngineDescriptor& engineDescriptor = + dynamic_cast(backendDescriptor); + mBuilder.setEngine(engineDescriptor.getEngine()); + mEngineDescriptor = apiDescriptor; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; +/* Knob choices are only validated here; nothing is stored for them. */ + case MIOPEN_ATTR_ENGINECFG_KNOB_CHOICES: + if(attributeType != MIOPEN_TYPE_BACKEND_DESCRIPTOR || elementCount < 0) + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + default: MIOPEN_THROW(miopenStatusBadParm); + } +} +/* Builds mEngineCfg from the builder and marks the descriptor finalized; a second call throws. */ +void BackendEngineCfgDescriptor::finalize() +{ + if(mFinalized) + { + MIOPEN_THROW(miopenStatusNotInitialized); + } + mEngineCfg = mBuilder.build(); + mFinalized = true; +} +/* Reads an attribute back; throws miopenStatusNotInitialized until finalize() has run. */ +void BackendEngineCfgDescriptor::getAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t* elementCount, + void* arrayOfElements) +{ + if(!mFinalized) + { +
MIOPEN_THROW(miopenStatusNotInitialized); + } + + switch(attributeName) + { + case MIOPEN_ATTR_ENGINECFG_ENGINE: + if(attributeType == MIOPEN_TYPE_BACKEND_DESCRIPTOR && requestedElementCount == 1) + { + *elementCount = 1; + *static_cast(arrayOfElements) = mEngineDescriptor; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; +/* Nothing is stored for knob choices, so report zero; type AND count must both be valid. */ + case MIOPEN_ATTR_ENGINECFG_KNOB_CHOICES: + if(attributeType == MIOPEN_TYPE_BACKEND_DESCRIPTOR && requestedElementCount >= 0) + { + *elementCount = 0; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_ENGINECFG_INTERMEDIATE_INFO: MIOPEN_THROW(miopenStatusUnsupportedOp); + + default: MIOPEN_THROW(miopenStatusBadParm); + } +} + +} // namespace graphapi + +} // namespace miopen diff --git a/src/graphapi/engineheur.cpp b/src/graphapi/engineheur.cpp new file mode 100644 index 0000000000..dc858a134d --- /dev/null +++ b/src/graphapi/engineheur.cpp @@ -0,0 +1,275 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace graphapi { +/* Stores the operation graph pointer after passing it through checkPtr(). */ +EngineHeurBuilder& EngineHeurBuilder::setOpGraph(OpGraph* opGraph) +{ + mEngineHeur.mOpGraph = checkPtr(opGraph); + return *this; +} +/* Stores the heuristic mode and records in mModeSet that it was provided. */ +EngineHeurBuilder& EngineHeurBuilder::setMode(miopenBackendHeurMode_t mode) +{ + mEngineHeur.mMode = mode; + mModeSet = true; + return *this; +} +/* Stores the SM-count target; unlike the mode, no "was set" flag is tracked for it. */ +EngineHeurBuilder& EngineHeurBuilder::setSmCount(int32_t smCount) +{ + mEngineHeur.mSmCount = smCount; + return *this; +} +/* Requires a non-null op graph and a mode; results get one entry per engine of the graph. */ +EngineHeur EngineHeurBuilder::build() +{ + if(mEngineHeur.mOpGraph == nullptr || !mModeSet) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + EngineHeur engineHeur(mEngineHeur); + + /* TODO: find solutions + * For now: just copy + * solutions from OpGraph + */ + const auto& engines = engineHeur.mOpGraph->getEngines(); + std::for_each(engines.begin(), engines.end(), [&engineHeur](const Engine& engine) { + engineHeur.mResults.emplace_back(engine); + }); + + return engineHeur; +} +/* Accepts MODE, OPERATION_GRAPH and SM_COUNT_TARGET; anything else throws miopenStatusBadParm. */ +void BackendEngineHeurDescriptor::setAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t elementCount, + void* arrayOfElements) +{ + if(mFinalized) + { + MIOPEN_THROW(miopenStatusNotInitialized); + } + + switch(attributeName) + { + case MIOPEN_ATTR_ENGINEHEUR_MODE: + if(attributeType == MIOPEN_TYPE_HEUR_MODE && elementCount == 1) + { + mBuilder.setMode(*static_cast(arrayOfElements)); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; +/* The referenced descriptor must already be finalized; its handle is kept for getAttribute. */ + case MIOPEN_ATTR_ENGINEHEUR_OPERATION_GRAPH: + if(attributeType == MIOPEN_TYPE_BACKEND_DESCRIPTOR && elementCount == 1) + { +
miopenBackendDescriptor_t apiDescriptor = + deref(static_cast(arrayOfElements)); + BackendDescriptor& backendDescriptor = deref(apiDescriptor); + + if(!backendDescriptor.isFinalized()) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + BackendOperationGraphDescriptor& opGraphDescriptor = + dynamic_cast(backendDescriptor); + mBuilder.setOpGraph(opGraphDescriptor.getOperationGraph()); + mOpGraphDescriptor = apiDescriptor; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_ENGINEHEUR_SM_COUNT_TARGET: + if(attributeType == MIOPEN_TYPE_INT32 && elementCount == 1) + { + mBuilder.setSmCount(*static_cast(arrayOfElements)); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + default: MIOPEN_THROW(miopenStatusBadParm); + } +} +/* Builds the EngineHeur, then wraps every resulting EngineCfg in an owned descriptor in mResults. */ +void BackendEngineHeurDescriptor::finalize() +{ + if(mFinalized) + { + MIOPEN_THROW(miopenStatusNotInitialized); + } + + mEngineHeur = mBuilder.build(); + + auto& engineCfgs = mEngineHeur.getResults(); + mResults.reserve(engineCfgs.size()); + + std::for_each(engineCfgs.begin(), engineCfgs.end(), [this](EngineCfg& engineCfg) { + mResults.emplace_back(std::move(engineCfg), mOpGraphDescriptor); + }); +/* The entries above were passed through std::move, so the source list is emptied. */ + engineCfgs.clear(); + + mFinalized = true; +} +/* Reads attributes back after finalize(); throws miopenStatusNotInitialized before that. */ +void BackendEngineHeurDescriptor::getAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t* elementCount, + void* arrayOfElements) +{ + if(!mFinalized) + { + MIOPEN_THROW(miopenStatusNotInitialized); + } + + switch(attributeName) + { + case MIOPEN_ATTR_ENGINEHEUR_MODE: + if(attributeType == MIOPEN_TYPE_HEUR_MODE && requestedElementCount == 1) + { + *elementCount = 1; + *static_cast(arrayOfElements) = mEngineHeur.getMode(); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_ENGINEHEUR_OPERATION_GRAPH: + if(attributeType == MIOPEN_TYPE_BACKEND_DESCRIPTOR && requestedElementCount == 1) + { + *elementCount = 1; +
*static_cast(arrayOfElements) = mOpGraphDescriptor; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; +/* *elementCount reports all results; at most requestedElementCount pointers are written. */ + case MIOPEN_ATTR_ENGINEHEUR_RESULTS: + if(attributeType == MIOPEN_TYPE_BACKEND_DESCRIPTOR && requestedElementCount >= 0) + { + *elementCount = mResults.size(); + std::transform(mResults.begin(), + mResults.begin() + std::min(*elementCount, requestedElementCount), + static_cast(arrayOfElements), + [](auto& descriptor) { return &descriptor; }); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_ENGINEHEUR_SM_COUNT_TARGET: + if(attributeType == MIOPEN_TYPE_INT32 && requestedElementCount == 1) + { + *elementCount = 1; + *static_cast(arrayOfElements) = mEngineHeur.getSmCount(); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + default: MIOPEN_THROW(miopenStatusBadParm); + } +} +/* Every special member below points the base at this object's own engine descriptor member. */ +BackendEngineHeurDescriptor::OwnedEngineCfgDescriptor::OwnedEngineCfgDescriptor( + EngineCfg&& engineCfg, miopenBackendDescriptor_t opGraphDescriptor) + : BackendEngineCfgDescriptor(std::move(engineCfg), &mOwnedEngineDescriptorInstance), + mOwnedEngineDescriptorInstance(getEngineCfg().getEngine(), opGraphDescriptor) +{ +} +/* Copy: duplicates other's config and engine descriptor but keeps the pointer self-referential. */ +BackendEngineHeurDescriptor::OwnedEngineCfgDescriptor::OwnedEngineCfgDescriptor( + const OwnedEngineCfgDescriptor& other) + : BackendEngineCfgDescriptor(other.getEngineCfg(), &mOwnedEngineDescriptorInstance), + mOwnedEngineDescriptorInstance(other.mOwnedEngineDescriptorInstance) +{ +} +/* NOTE(review): if getEngineCfg() returns a const reference, std::move below copies - confirm. */ +BackendEngineHeurDescriptor::OwnedEngineCfgDescriptor::OwnedEngineCfgDescriptor( + OwnedEngineCfgDescriptor&& other) noexcept + : BackendEngineCfgDescriptor(std::move(other.getEngineCfg()), &mOwnedEngineDescriptorInstance), + mOwnedEngineDescriptorInstance(std::move(other.mOwnedEngineDescriptorInstance)) +{ +} +/* Copy assignment: mEngineDescriptor is reset after the base assignment has run. */ +BackendEngineHeurDescriptor::OwnedEngineCfgDescriptor& +BackendEngineHeurDescriptor::OwnedEngineCfgDescriptor::operator=( + const OwnedEngineCfgDescriptor& other) +{ + if(this != &other) + { +
BackendEngineCfgDescriptor::operator=(other); + mEngineDescriptor = &mOwnedEngineDescriptorInstance; + mOwnedEngineDescriptorInstance = other.mOwnedEngineDescriptorInstance; + } + return *this; +} +/* Move assignment: same re-pointing of mEngineDescriptor as in copy assignment. */ +BackendEngineHeurDescriptor::OwnedEngineCfgDescriptor& +BackendEngineHeurDescriptor::OwnedEngineCfgDescriptor::operator=( + OwnedEngineCfgDescriptor&& other) noexcept +{ + if(this != &other) + { + BackendEngineCfgDescriptor::operator=(std::move(other)); + mEngineDescriptor = &mOwnedEngineDescriptorInstance; + mOwnedEngineDescriptorInstance = std::move(other.mOwnedEngineDescriptorInstance); + } + return *this; +} + +} // namespace graphapi + +} // namespace miopen diff --git a/src/graphapi/execution_plan.cpp b/src/graphapi/execution_plan.cpp new file mode 100644 index 0000000000..9ccf457aec --- /dev/null +++ b/src/graphapi/execution_plan.cpp @@ -0,0 +1,273 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include + +namespace miopen { + +namespace graphapi { +/* Placeholder: currently always returns an empty string. */ +std::string ExecutionPlan::getJsonRepresentation() const +{ + // TODO: Implement ExecutionPlan::getJsonRepresentation + return {}; +} +/* Placeholder: the body is empty, so the variant pack is not used yet. */ +void ExecutionPlan::execute(const VariantPack& variantPack) {} +/* Builder setters are lvalue-qualified and return *this; the handle goes through checkPtr(). */ +ExecutionPlanBuilder& ExecutionPlanBuilder::setHandle(miopenHandle_t handle) & +{ + mExecutionPlan.mHandle = checkPtr(handle); + return *this; +} +/* Copies the engine config and records in mEngineCfgSet that it was provided. */ +ExecutionPlanBuilder& ExecutionPlanBuilder::setEngineCfg(const EngineCfg& engineCfg) & +{ + mExecutionPlan.mEngineCfg = engineCfg; + mEngineCfgSet = true; + return *this; +} +/* Move-in variant of the setter above. */ +ExecutionPlanBuilder& ExecutionPlanBuilder::setEngineCfg(EngineCfg&& engineCfg) & +{ + mExecutionPlan.mEngineCfg = std::move(engineCfg); + mEngineCfgSet = true; + return *this; +} +/* Intermediate ids are optional: build() does not check whether they were set. */ +ExecutionPlanBuilder& ExecutionPlanBuilder::setIntermediateIds(const std::vector& ids) & +{ + mExecutionPlan.mIntermediateIds = ids; + return *this; +} + +ExecutionPlanBuilder& ExecutionPlanBuilder::setIntermediateIds(std::vector&& ids) & +{ + mExecutionPlan.mIntermediateIds = std::move(ids); + return *this; +} +/* Placeholder: the JSON text is accepted and ignored. */ +ExecutionPlanBuilder& ExecutionPlanBuilder::setJsonRepresentation(const std::string_view& s) & +{ + // TODO: Implement ExecutionPlanBuilder::setJsonRepresentation + (void)s; + return *this; +} +/* Lvalue build(): copies the plan; needs a non-null handle and an engine config, else BadParm. */ +ExecutionPlan ExecutionPlanBuilder::build() & +{ + if(mExecutionPlan.mHandle != nullptr && mEngineCfgSet) + { + return mExecutionPlan; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } +} +/* Rvalue build(): same checks, but the plan is moved out of the builder. */ +ExecutionPlan ExecutionPlanBuilder::build() && +{ + if(mExecutionPlan.mHandle != nullptr && mEngineCfgSet) + { + return std::move(mExecutionPlan); + } +
else + { + MIOPEN_THROW(miopenStatusBadParm); + } +} +/* Sets one attribute; throws miopenStatusNotInitialized once the descriptor is finalized. */ +void BackendExecutionPlanDescriptor::setAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t elementCount, + void* arrayOfElements) +{ + if(mFinalized) + { + MIOPEN_THROW(miopenStatusNotInitialized); + } + + switch(attributeName) + { + case MIOPEN_ATTR_EXECUTION_PLAN_HANDLE: + if(attributeType == MIOPEN_TYPE_HANDLE && elementCount == 1) + { + mBuilder.setHandle(*static_cast(arrayOfElements)); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; +/* The referenced descriptor must already be finalized; its handle is kept for getAttribute. */ + case MIOPEN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG: + if(attributeType == MIOPEN_TYPE_BACKEND_DESCRIPTOR && elementCount == 1) + { + miopenBackendDescriptor_t apiDescriptor = + deref(static_cast(arrayOfElements)); + BackendDescriptor& backendDescriptor = deref(apiDescriptor); + + if(!backendDescriptor.isFinalized()) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + BackendEngineCfgDescriptor& engineCfgDescriptor = + dynamic_cast(backendDescriptor); + mBuilder.setEngineCfg(engineCfgDescriptor.getEngineCfg()); + mEngineCfgDescriptor = apiDescriptor; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; +/* A count of zero is accepted and produces an empty id list. */ + case MIOPEN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS: + if(attributeType == MIOPEN_TYPE_INT64 && elementCount >= 0) + { + mBuilder.setIntermediateIds({static_cast(arrayOfElements), + static_cast(arrayOfElements) + elementCount}); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; +/* Unlike the id list, the JSON text must contain at least one character. */ + case MIOPEN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION: + if(attributeType == MIOPEN_TYPE_CHAR && elementCount > 0) + { + std::string_view s(static_cast(arrayOfElements), elementCount); + mBuilder.setJsonRepresentation(s); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + default: MIOPEN_THROW(miopenStatusBadParm); + } +} +/* Builds the plan via the rvalue build() overload and marks the descriptor finalized. */ +void BackendExecutionPlanDescriptor::finalize() +{ + if(mFinalized) + { + MIOPEN_THROW(miopenStatusNotInitialized); + } + mExecutionPlan = 
std::move(mBuilder).build(); + mFinalized = true; +} +/* Reads attributes back after finalize(); array reads report the full size in *elementCount. */ +void BackendExecutionPlanDescriptor::getAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t* elementCount, + void* arrayOfElements) +{ + if(!mFinalized) + { + MIOPEN_THROW(miopenStatusNotInitialized); + } + + switch(attributeName) + { + case MIOPEN_ATTR_EXECUTION_PLAN_HANDLE: + if(attributeType == MIOPEN_TYPE_HANDLE && requestedElementCount == 1) + { + *elementCount = 1; + *static_cast(arrayOfElements) = mExecutionPlan.getHandle(); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG: + if(attributeType == MIOPEN_TYPE_BACKEND_DESCRIPTOR && requestedElementCount == 1) + { + *elementCount = 1; + *static_cast(arrayOfElements) = mEngineCfgDescriptor; + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS: + if(attributeType == MIOPEN_TYPE_INT64 && requestedElementCount >= 0) + { + const auto& vec = mExecutionPlan.getIntermediateIds(); + *elementCount = vec.size(); + std::copy_n(vec.begin(), + // WORKAROUND: building on Windows is failing due to conflicting definitions + // of std::min() between the MSVC standard library and HIP Clang wrappers. + requestedElementCount < *elementCount ? requestedElementCount + : *elementCount, + static_cast(arrayOfElements)); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + case MIOPEN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION: + if(attributeType == MIOPEN_TYPE_CHAR && requestedElementCount > 0) + { + std::string s = mExecutionPlan.getJsonRepresentation(); + *elementCount = s.size() + 1; + std::copy_n(s.c_str(), + // WORKAROUND: building on Windows is failing due to conflicting definitions + // of std::min() between the MSVC standard library and HIP Clang wrappers. + requestedElementCount < *elementCount ? 
requestedElementCount + : *elementCount, + static_cast(arrayOfElements)); + } + else + { + MIOPEN_THROW(miopenStatusBadParm); + } + break; + + default: MIOPEN_THROW(miopenStatusBadParm); + } +} + +void BackendExecutionPlanDescriptor::execute(miopenHandle_t handle, + miopenBackendDescriptor_t variantPack) +{ + // TODO: Implement BackendExecutionPlanDescriptor::execute +} + +} // namespace graphapi + +} // namespace miopen diff --git a/src/graphapi/graphapi.cpp b/src/graphapi/graphapi.cpp index 534f4d25ac..780bf2ac01 100644 --- a/src/graphapi/graphapi.cpp +++ b/src/graphapi/graphapi.cpp @@ -25,6 +25,9 @@ *******************************************************************************/ #include #include +#include +#include +#include #include #include #include @@ -56,6 +59,19 @@ miopenBackendCreateDescriptor(miopenBackendDescriptorType_t descriptorType, case MIOPEN_BACKEND_CONVOLUTION_DESCRIPTOR: outputDescriptor = new miopen::graphapi::BackendConvolutionDescriptor(); break; + case MIOPEN_BACKEND_ENGINE_DESCRIPTOR: + outputDescriptor = new miopen::graphapi::BackendEngineDescriptor(); break; + + case MIOPEN_BACKEND_ENGINECFG_DESCRIPTOR: + outputDescriptor = new miopen::graphapi::BackendEngineCfgDescriptor(); break; + + case MIOPEN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: + outputDescriptor = new miopen::graphapi::BackendExecutionPlanDescriptor(); break; + + case MIOPEN_BACKEND_MATMUL_DESCRIPTOR: + outputDescriptor = new miopen::graphapi::BackendMatmulDescriptor(); + break; + case MIOPEN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR: outputDescriptor = new miopen::graphapi::BackendOperationConvolutionForwardDescriptor(); break; @@ -65,6 +81,10 @@ miopenBackendCreateDescriptor(miopenBackendDescriptorType_t descriptorType, case MIOPEN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR: outputDescriptor = new miopen::graphapi::BackendOperationConvolutionBackwardDataDescriptor(); break; + case MIOPEN_BACKEND_OPERATION_MATMUL_DESCRIPTOR: + outputDescriptor = new 
miopen::graphapi::BackendOperationMatmulDescriptor(); + break; + case MIOPEN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR: outputDescriptor = new miopen::graphapi::BackendOperationPointwiseDescriptor(); break; @@ -92,14 +112,6 @@ miopenBackendCreateDescriptor(miopenBackendDescriptorType_t descriptorType, case MIOPEN_BACKEND_VARIANT_PACK_DESCRIPTOR: outputDescriptor = new miopen::graphapi::BackendVariantPackDescriptor(); break; - case MIOPEN_BACKEND_MATMUL_DESCRIPTOR: - outputDescriptor = new miopen::graphapi::BackendMatmulDescriptor(); - break; - - case MIOPEN_BACKEND_OPERATION_MATMUL_DESCRIPTOR: - outputDescriptor = new miopen::graphapi::BackendOperationMatmulDescriptor(); - break; - default: MIOPEN_THROW(miopenStatusUnsupportedOp); // clang-format on } @@ -215,6 +227,18 @@ extern "C" miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t desc case MIOPEN_BACKEND_CONVOLUTION_DESCRIPTOR: initializeBackendDescriptor(descriptor, sizeInBytes); break; + case MIOPEN_BACKEND_ENGINE_DESCRIPTOR: + initializeBackendDescriptor(descriptor, sizeInBytes); break; + + case MIOPEN_BACKEND_ENGINECFG_DESCRIPTOR: + initializeBackendDescriptor(descriptor, sizeInBytes); break; + + case MIOPEN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: + initializeBackendDescriptor(descriptor, sizeInBytes); break; + + case MIOPEN_BACKEND_MATMUL_DESCRIPTOR: + initializeBackendDescriptor(descriptor, sizeInBytes); break; + case MIOPEN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR: initializeBackendDescriptor(descriptor, sizeInBytes); break; @@ -251,9 +275,6 @@ extern "C" miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t desc case MIOPEN_BACKEND_TENSOR_DESCRIPTOR: initializeBackendDescriptor(descriptor, sizeInBytes); break; - case MIOPEN_BACKEND_MATMUL_DESCRIPTOR: - initializeBackendDescriptor(descriptor, sizeInBytes); break; - case MIOPEN_BACKEND_VARIANT_PACK_DESCRIPTOR: initializeBackendDescriptor(descriptor, sizeInBytes); break; diff --git a/src/graphapi/opgraph.cpp 
b/src/graphapi/opgraph.cpp index 92e04683ae..50aa7cf36e 100644 --- a/src/graphapi/opgraph.cpp +++ b/src/graphapi/opgraph.cpp @@ -422,7 +422,10 @@ void BackendOperationGraphDescriptor::getAttribute(miopenBackendAttributeName_t { *elementCount = mOps.size(); std::copy_n(mOps.cbegin(), - std::min(*elementCount, requestedElementCount), + // WORKAROUND: building on Windows is failing due to conflicting definitions + // of std::min() between the MSVC standard library and HIP Clang wrappers. + *elementCount < requestedElementCount ? *elementCount + : requestedElementCount, static_cast(arrayOfElements)); } else diff --git a/src/hip/handlehip.cpp b/src/hip/handlehip.cpp index d569ec8ae3..555528c4f8 100644 --- a/src/hip/handlehip.cpp +++ b/src/hip/handlehip.cpp @@ -476,13 +476,13 @@ const std::vector& Handle::GetKernelsImpl(const std::string& algorithm, return this->impl->cache.GetKernels(algorithm, network_config); } -KernelInvoke Handle::Run(Kernel k) const +KernelInvoke Handle::Run(Kernel k, bool coop_launch) const { this->impl->set_ctx(); - if(this->impl->enable_profiling || MIOPEN_GPU_SYNC) - return k.Invoke(this->GetStream(), this->impl->elapsed_time_handler()); - else - return k.Invoke(this->GetStream()); + auto callback = (this->impl->enable_profiling || MIOPEN_GPU_SYNC) + ? 
this->impl->elapsed_time_handler() + : nullptr; + return k.Invoke(this->GetStream(), callback, coop_launch); } Program Handle::LoadProgram(const std::string& program_name, @@ -675,6 +675,17 @@ std::size_t Handle::GetMaxMemoryAllocSize() return m_MaxMemoryAllocSizeCached; } +bool Handle::CooperativeLaunchSupported() const +{ + int result; + auto status = + hipDeviceGetAttribute(&result, hipDeviceAttributeCooperativeLaunch, this->impl->device); + if(status != hipSuccess) + MIOPEN_THROW_HIP_STATUS(status); +/* result is only read after a successful query; support means the attribute equals 1. */ + return result == 1; +} + std::string Handle::GetDeviceNameImpl() const { return this->impl->get_device_name(); } std::string Handle::GetDeviceName() const { return this->impl->target_properties.Name(); } diff --git a/src/hip/hip_build_utils.cpp b/src/hip/hip_build_utils.cpp index f11c4f5351..307b035c1a 100644 --- a/src/hip/hip_build_utils.cpp +++ b/src/hip/hip_build_utils.cpp @@ -183,7 +183,8 @@ static fs::path HipBuildImpl(boost::optional& tmp_dir, #endif tmp_dir->Execute(MIOPEN_HIP_COMPILER, args); if(!fs::exists(bin_file)) - MIOPEN_THROW("Failed cmd: '" MIOPEN_HIP_COMPILER "', args: '" + args + '\''); + MIOPEN_THROW("Failed cmd: '" + std::string(MIOPEN_HIP_COMPILER) + "', args: '" + args + + '\''); } #if defined(MIOPEN_OFFLOADBUNDLER_BIN) && !MIOPEN_BACKEND_HIP diff --git a/src/hipoc/hipoc_kernel.cpp b/src/hipoc/hipoc_kernel.cpp index 1b72ebfc7c..4d9631ef61 100644 --- a/src/hipoc/hipoc_kernel.cpp +++ b/src/hipoc/hipoc_kernel.cpp @@ -36,6 +36,8 @@ #include #include +#define WORKAROUND_SWDEV_448157 1 + MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_DEVICE_ARCH) namespace miopen { @@ -123,9 +125,83 @@ void HIPOCKernelInvoke::run(void* args, std::size_t size) const } } +void HIPOCKernelInvoke::run_cooperative(void** kern_args) const +{ + hipError_t status; +/* Launches via hipModuleLaunchCooperativeKernel; events time it only if a callback is set. */ + MIOPEN_LOG_I2("kernel_name = " + << GetName() << ", global_work_dim = " << DimToFormattedString(gdims.data(), 3) + << ", local_work_dim = " << DimToFormattedString(ldims.data(), 3)); +/* A non-empty MIOPEN_DEVICE_ARCH environment value makes this function throw before launching. */ + const auto& arch = 
miopen::GetStringEnv(ENV(MIOPEN_DEVICE_ARCH)); + if(!arch.empty()) + { + MIOPEN_THROW("MIOPEN_DEVICE_ARCH used, escaping launching kernel"); + } + + HipEventPtr start = nullptr; + HipEventPtr stop = nullptr; +/* Timing events are created only when a callback was supplied. */ + if(callback) + { + start = make_hip_event(); + stop = make_hip_event(); + } + +#if WORKAROUND_SWDEV_448157 + if(gdims[0] >= (1ULL << 32) || gdims[1] >= (1ULL << 32) || gdims[2] >= (1ULL << 32)) + MIOPEN_THROW("gridDim x blockDim >= 2^32"); +/* gdims (logged as global_work_dim) must divide evenly by ldims; the quotient is the grid size. */ + if(gdims[0] % ldims[0] != 0 || gdims[1] % ldims[1] != 0 || gdims[2] % ldims[2] != 0) + MIOPEN_THROW(miopenStatusInternalError); + + unsigned grid_dim_x = gdims[0] / ldims[0]; + unsigned grid_dim_y = gdims[1] / ldims[1]; + unsigned grid_dim_z = gdims[2] / ldims[2]; + + MIOPEN_HANDLE_LOCK + + if(callback) + { + status = hipEventRecord(start.get(), stream); + if(status != hipSuccess) + MIOPEN_THROW_HIP_STATUS(status, "hipEventRecord() failed"); + } + + status = hipModuleLaunchCooperativeKernel(fun, + grid_dim_x, + grid_dim_y, + grid_dim_z, + ldims[0], + ldims[1], + ldims[2], + 0, + stream, + kern_args); + if(status != hipSuccess) + MIOPEN_THROW_HIP_STATUS(status, "Failed to launch kernel"); + + if(callback) + { + status = hipEventRecord(stop.get(), stream); + if(status != hipSuccess) + MIOPEN_THROW_HIP_STATUS(status, "hipEventRecord() failed"); + } +#else +#error "Doesn't work without workaround" +#endif // WORKAROUND_SWDEV_448157 +/* NOTE(review): the hipEventSynchronize status is not checked, unlike the HIP calls above. */ + if(callback) + { + hipEventSynchronize(stop.get()); + callback(start.get(), stop.get()); + } +} + HIPOCKernelInvoke HIPOCKernel::Invoke(hipStream_t stream, - std::function callback) const + std::function callback, + bool coop_launch) const { - return HIPOCKernelInvoke{stream, fun, ldims, gdims, name, callback}; + return HIPOCKernelInvoke{stream, fun, ldims, gdims, name, callback, coop_launch}; } } // namespace miopen diff --git a/src/include/miopen/adam.hpp b/src/include/miopen/adam.hpp new file mode 100644 index 0000000000..1e6b8085f5 --- /dev/null +++ b/src/include/miopen/adam.hpp @@ -0,0 
+1,77 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_ADAM_HPP_ +#define MIOPEN_ADAM_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; +/* Adam entry point. Tensors come as descriptor/buffer pairs: *In are ConstData_t, *Out are Data_t. NOTE(review): gradScaleDescPtr and foundInfDescPtr are references despite the Ptr suffix. */ +miopenStatus_t Adam(Handle& handle, + const TensorDescriptor& paramInDesc, + ConstData_t paramIn, + const TensorDescriptor& paramOutDesc, + Data_t paramOut, + const TensorDescriptor& paramOutFloat16Desc, + Data_t paramOutFloat16, + const TensorDescriptor& gradInDesc, + ConstData_t gradIn, + const TensorDescriptor& expAvgInDesc, + ConstData_t expAvgIn, + const TensorDescriptor& expAvgOutDesc, + Data_t expAvgOut, + const TensorDescriptor& expAvgSqInDesc, + ConstData_t expAvgSqIn, + const TensorDescriptor& expAvgSqOutDesc, + Data_t expAvgSqOut, + const TensorDescriptor& maxExpAvgSqInDesc, + ConstData_t maxExpAvgSqIn, + const TensorDescriptor& maxExpAvgSqOutDesc, + Data_t maxExpAvgSqOut, + const TensorDescriptor& gradScaleDescPtr, + ConstData_t gradScale, + const TensorDescriptor& foundInfDescPtr, + ConstData_t foundInf, + const TensorDescriptor& stepInDesc, + ConstData_t stepIn, + const TensorDescriptor& stepOutDesc, + Data_t stepOut, + uint32_t step, + float lr, + float beta1, + float beta2, + float weight_decay, + float eps, + bool amsgrad, + bool maximize, + bool adamw, + bool is_amp); + +} // namespace miopen +#endif // MIOPEN_ADAM_HPP_ diff --git a/src/include/miopen/adam/invoke_params.hpp b/src/include/miopen/adam/invoke_params.hpp new file mode 100644 index 0000000000..ff94f37287 --- /dev/null +++ b/src/include/miopen/adam/invoke_params.hpp @@ -0,0 +1,72 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include +#include + +namespace miopen { +namespace adam { +/* Arguments for one Adam invocation; pointers default to nullptr, scalars to zero or false. */ +struct InvokeParams : public miopen::InvokeParams +{ + InvokeParams() = default; +/* Raw descriptor pointers; this struct declares no destructor that would release them. */ + const TensorDescriptor* paramDesc = nullptr; + const TensorDescriptor* gradDesc = nullptr; +/* Buffers: *In members are ConstData_t, *Out members are mutable Data_t. */ + ConstData_t paramIn = nullptr; + Data_t paramOut = nullptr; + Data_t paramOutFloat16 = nullptr; + ConstData_t gradIn = nullptr; + ConstData_t expAvgIn = nullptr; + Data_t expAvgOut = nullptr; + ConstData_t expAvgSqIn = nullptr; + Data_t expAvgSqOut = nullptr; + ConstData_t maxExpAvgSqIn = nullptr; + Data_t maxExpAvgSqOut = nullptr; + ConstData_t gradScale = nullptr; + ConstData_t foundInf = nullptr; + ConstData_t stepIn = nullptr; + Data_t stepOut = nullptr; +/* Scalar hyper-parameters and mode flags. */ + uint32_t step = 0; + float lr = 0.0; + float beta1 = 0.0; + float beta2 = 0.0; + float weight_decay = 0.0; + float eps = 0.0; + bool amsgrad = false; + bool maximize = false; + bool adamw = false; +/* No workspace is reported: size 0 and a null pointer. */ + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +} // namespace adam +} // namespace miopen diff --git a/src/include/miopen/adam/problem_description.hpp b/src/include/miopen/adam/problem_description.hpp new file mode 100644 index 0000000000..d9cc94465d --- /dev/null +++ b/src/include/miopen/adam/problem_description.hpp @@ -0,0 +1,183 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include +#include + +#include + +namespace miopen { + +struct NetworkConfig; + +namespace adam { +/* Adam problem description; the constructor throws miopenStatusBadParm on inconsistent input. */ +struct ProblemDescription : ProblemDescriptionBase +{ + ProblemDescription(const TensorDescriptor& paramInDesc_, + const TensorDescriptor& paramOutDesc_, + const TensorDescriptor& paramOutFloat16Desc_, + const TensorDescriptor& gradInDesc_, + const TensorDescriptor& expAvgInDesc_, + const TensorDescriptor& expAvgOutDesc_, + const TensorDescriptor& expAvgSqInDesc_, + const TensorDescriptor& expAvgSqOutDesc_, + const TensorDescriptor& maxExpAvgSqInDesc_, + const TensorDescriptor& maxExpAvgSqOutDesc_, + const TensorDescriptor& gradScaleDesc_, + const TensorDescriptor& foundInfDesc_, + const TensorDescriptor& stepInDesc_, + const TensorDescriptor& stepOutDesc_, + uint32_t step_, + double lr_, + double beta1_, + double beta2_, + double weight_decay_, + double eps_, + bool amsgrad_, + bool maximize_, + bool adamw_, + bool is_amp_) + : paramInDesc(paramInDesc_), + paramOutDesc(paramOutDesc_), + gradInDesc(gradInDesc_), + expAvgInDesc(expAvgInDesc_), + expAvgOutDesc(expAvgOutDesc_), + expAvgSqInDesc(expAvgSqInDesc_), + expAvgSqOutDesc(expAvgSqOutDesc_), + paramOutFloat16Desc(paramOutFloat16Desc_), + maxExpAvgSqInDesc(maxExpAvgSqInDesc_), + maxExpAvgSqOutDesc(maxExpAvgSqOutDesc_), + gradScaleDesc(gradScaleDesc_), + foundInfDesc(foundInfDesc_), + stepInDesc(stepInDesc_), + stepOutDesc(stepOutDesc_), + step(step_), + lr(lr_), + beta1(beta1_), + beta2(beta2_), + weight_decay(weight_decay_), + eps(eps_), + amsgrad(amsgrad_), + maximize(maximize_), + adamw(adamw_), + is_amp(is_amp_) + { + if(amsgrad && + (maxExpAvgSqInDesc.GetLengths().empty() || maxExpAvgSqOutDesc.GetLengths().empty())) + { + MIOPEN_THROW(miopenStatusBadParm, + "Adam: In the amsgrad, the max_exp_avg_sq tensor is required."); + } +/* The param input's data type is the reference for the type checks that follow. */ + auto dtype = paramInDesc.GetType(); + + if((dtype == miopenBFloat16) || 
(gradInDesc.GetType() == miopenBFloat16)) + { + MIOPEN_THROW(miopenStatusBadParm, "Adam: bfloat16 type is not supported."); + } +/* State tensors must match dtype; grad is exempt when is_amp, max_exp_avg_sq when empty. */ + if((paramOutDesc.GetType() != dtype) || (!is_amp && gradInDesc.GetType() != dtype) || + (expAvgInDesc.GetType() != dtype) || (expAvgOutDesc.GetType() != dtype) || + (expAvgSqInDesc.GetType() != dtype) || (expAvgSqOutDesc.GetType() != dtype) || + (!maxExpAvgSqInDesc.GetLengths().empty() && maxExpAvgSqInDesc.GetType() != dtype) || + (!maxExpAvgSqOutDesc.GetLengths().empty() && maxExpAvgSqOutDesc.GetType() != dtype)) + { + MIOPEN_THROW(miopenStatusBadParm, "Adam: Tensor types do not match."); + } +/* In AMP mode a non-empty param_out_float16 tensor must be of type miopenHalf. */ + if(is_amp && !paramOutFloat16Desc.GetLengths().empty() && + (paramOutFloat16Desc.GetType() != miopenHalf)) + { + MIOPEN_THROW(miopenStatusBadParm, "Adam: Invalid type of param_out_float16."); + } +/* Only total element counts are compared below, not the individual dimension lengths. */ + auto numel = paramInDesc.GetElementSize(); + if((paramOutDesc.GetElementSize() != numel) || (gradInDesc.GetElementSize() != numel) || + (expAvgInDesc.GetElementSize() != numel) || (expAvgOutDesc.GetElementSize() != numel) || + (expAvgSqInDesc.GetElementSize() != numel) || + (expAvgSqOutDesc.GetElementSize() != numel) || + (is_amp && !paramOutFloat16Desc.GetLengths().empty() && + paramOutFloat16Desc.GetElementSize() != numel) || + (!maxExpAvgSqInDesc.GetLengths().empty() && + maxExpAvgSqInDesc.GetElementSize() != numel) || + (!maxExpAvgSqOutDesc.GetLengths().empty() && + maxExpAvgSqOutDesc.GetElementSize() != numel)) + { + MIOPEN_THROW(miopenStatusBadParm, "Adam: Tensor dimension lengths do not match."); + } + } +/* Accessors. IsAllPacked() looks only at the param, grad, exp_avg and exp_avg_sq inputs. */ + const TensorDescriptor& GetParamDesc() const { return paramInDesc; } + const TensorDescriptor& GetGradDesc() const { return gradInDesc; } + bool ExistStepTensor() const { return !stepInDesc.GetLengths().empty(); } + bool IsAmp() const { return is_amp; } + bool IsAdamW() const { return adamw; } + bool IsAllPacked() const + { + if(!(paramInDesc.IsPacked() && gradInDesc.IsPacked() && expAvgInDesc.IsPacked() && + 
expAvgSqInDesc.IsPacked())) + return false; + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor paramInDesc; + TensorDescriptor paramOutDesc; + TensorDescriptor gradInDesc; + TensorDescriptor expAvgInDesc; + TensorDescriptor expAvgOutDesc; + TensorDescriptor expAvgSqInDesc; + TensorDescriptor expAvgSqOutDesc; + TensorDescriptor paramOutFloat16Desc; + TensorDescriptor maxExpAvgSqInDesc; + TensorDescriptor maxExpAvgSqOutDesc; + TensorDescriptor gradScaleDesc; + TensorDescriptor foundInfDesc; + TensorDescriptor stepInDesc; + TensorDescriptor stepOutDesc; +/* Scalars are stored as double here. */ + uint32_t step = 0; + double lr = 0.0; + double beta1 = 0.0; + double beta2 = 0.0; + double weight_decay = 0.0; + double eps = 0.0; + bool amsgrad = false; + bool maximize = false; + bool adamw = false; + bool is_amp = false; + + NetworkConfig MakeForwardNetworkConfig() const; +}; + +} // namespace adam + +} // namespace miopen diff --git a/src/include/miopen/adam/solvers.hpp b/src/include/miopen/adam/solvers.hpp new file mode 100644 index 0000000000..ec426751c1 --- /dev/null +++ b/src/include/miopen/adam/solvers.hpp @@ -0,0 +1,63 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include + +#include + +namespace miopen { + +namespace solver { + +namespace adam { + +using AdamSolver = NonTunableSolverBase; + +struct Adam final : AdamSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adam::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::adam::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::adam::ProblemDescription& problem) const override + { + return 0; + } + bool MayNeedWorkspace() const override { return false; } +}; + +} // namespace adam + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/addlayernorm.hpp b/src/include/miopen/addlayernorm.hpp new file mode 100644 index 0000000000..ec7550ee06 --- /dev/null +++ b/src/include/miopen/addlayernorm.hpp @@ -0,0 +1,56 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_ADDLAYERNORM_HPP_ +#define MIOPEN_ADDLAYERNORM_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +miopenStatus_t AddLayerNormForward(Handle& handle, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& x2Desc, + ConstData_t x2, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& biasDesc, + ConstData_t bias, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& meanDesc, + Data_t mean, + const TensorDescriptor& rstdDesc, + Data_t rstd, + miopenNormMode_t mode, + float epsilon, + int32_t normalized_dim); + +} // namespace miopen +#endif // MIOPEN_ADDLAYERNORM_HPP_ diff --git a/src/include/miopen/conv/invokers/gcn_asm_wino.hpp b/src/include/miopen/conv/invokers/gcn_asm_wino.hpp new file mode 100644 index 0000000000..48ea4bb751 --- /dev/null +++ b/src/include/miopen/conv/invokers/gcn_asm_wino.hpp @@ -0,0 +1,42 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include + +namespace miopen { +namespace conv { + +enum class Direction; + +InvokerFactory MakeGcnAsmWinoV2InvokerFactory(const WinoShaderArgsV2& args, + Direction direction, + std::size_t sync_buffer_size); + +} // namespace conv +} // namespace miopen diff --git a/src/include/miopen/conv/kernel_interface/winograd_kernel_interface.hpp b/src/include/miopen/conv/kernel_interface/winograd_kernel_interface.hpp new file mode 100644 index 0000000000..411681d544 --- /dev/null +++ b/src/include/miopen/conv/kernel_interface/winograd_kernel_interface.hpp @@ -0,0 +1,144 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include +#include + +namespace miopen { +namespace conv { + +struct ProblemDescription; + +enum class WinoShaderFlagsV2 : uint64_t +{ + F_REVERSE_R = 1ULL << 0, + F_REVERSE_S = 1ULL << 1, + F_FLIP_K_C = 1ULL << 2, // Deprecated + F_DENORMS_RND_ENABLE = 1ULL << 3, + F_MALL_READ_CACHE_ENABLE = 1ULL << 4, + F_ACC_PRE_ACTIVATION_MODE = 1ULL << 5, + F_ADDR_INDIRECT = 1ULL << 6, + F_BIAS = 1ULL << 7, + F_LEAKY_RELU = 1ULL << 8, // Deprecated + F_NKCHR_STRIDES = 1ULL << 9, + F_GROUPED_CONVOLUTION = 1ULL << 10, + F_FORCE_FILTER_TRAVERSE_MODE = 1ULL << 11, + F_FILTER_TRAVERSE_DUAL = 1ULL << 12, + F_TENSOR_OFFSETS = 1ULL << 13, + F_USE_ACTIVATION_MODE = 1ULL << 14, + F_USE_EXTENDED_FLAGS_64 = 1ULL << 15, +}; + +inline WinoShaderFlagsV2 operator|(WinoShaderFlagsV2 lhs, WinoShaderFlagsV2 rhs) +{ + using T = std::underlying_type_t; + return static_cast(static_cast(lhs) | static_cast(rhs)); +} + +inline WinoShaderFlagsV2 operator|=(WinoShaderFlagsV2& lhs, WinoShaderFlagsV2 rhs) +{ + lhs = lhs | rhs; + return lhs; +} + +inline std::ostream& operator<<(std::ostream& s, WinoShaderFlagsV2 flags) +{ + using T = std::underlying_type_t; + s << "0x" << std::hex << static_cast(flags) << std::dec; + return s; +} + +enum class WinoShaderActivationModeV2_t : uint8_t +{ + IDENTITY = 0, // no activation, alpha and beta are ignored + LEAKY_RELU = 1, // ReLU, beta field is ignored + SIGMOID = 2, 
// sigmoid, alpha and beta fields are ignored + SCALED_TANH = 3, // parametric tanh function +}; + +inline std::ostream& operator<<(std::ostream& s, const WinoShaderActivationModeV2_t& mode) +{ + s << static_cast(mode); + return s; +} + +struct WinoShaderArgsV2 +{ + // Main convolution parameters + uint32_t N; // batch size + uint32_t C; // number of input channels in each filter group + uint32_t H; // input height + uint32_t W; // input width + uint32_t K; // number of output channels in each filter group + uint32_t R; // filter height + uint32_t S; // filter width + int32_t pad_h; // padding in h dimension + int32_t pad_w; // padding in w dimension + uint32_t out_h; // output height + uint32_t out_w; // output width + uint32_t G; // number of filter groups + + // Data layout related parameters + uint32_t d_N_stride; // stride in number of elements of the N dimension of the input data buffer + uint32_t d_C_stride; // stride in number of elements of the C dimension of the input data buffer + uint32_t d_H_stride; // stride in number of elements of the H dimension of the input data buffer + uint32_t d_G_stride; // stride in number of elements of the G dimension of the input data buffer + + uint32_t f_K_stride; // stride in number of elements of the K dimension of the filter buffer + uint32_t f_C_stride; // stride in number of elements of the C dimension of the filter buffer + uint32_t f_R_stride; // stride in number of elements of the R dimension of the filter buffer + uint32_t f_G_stride; // stride in number of elements of the G dimension of the filter buffer + + uint32_t o_N_stride; // stride in number of elements of the N dimension of the output buffer + uint32_t o_K_stride; // stride in number of elements of the K dimension of the output buffer + uint32_t o_H_stride; // stride in number of elements of the H dimension of the output buffer + uint32_t o_G_stride; // stride in number of elements of the G dimension of the output buffer + + // Fused activation 
parameters + float alpha; // activation parameter alpha + float beta; // activation parameter beta + WinoShaderActivationModeV2_t activation_mode; // activation mode + + // Other shader parameters + uint32_t n_groups; // number of shader groups + WinoShaderFlagsV2 flags64; // shader flags + uint8_t sync_limit; // maximum number of sync attempts + uint8_t sync_period; // synchronization period + + bool SetConvParams(const ProblemDescription& problem); + void SetStrides(const ProblemDescription& problem); + void SetActivParams(WinoShaderActivationModeV2_t mode, float alpha, float beta) noexcept; + void SetShaderParams(uint32_t n_groups, + WinoShaderFlagsV2 flags, + uint8_t sync_limit, + uint8_t sync_period) noexcept; +}; + +} // namespace conv +} // namespace miopen diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp index 8bba1ba3c5..00f0bccb21 100644 --- a/src/include/miopen/conv/problem_description.hpp +++ b/src/include/miopen/conv/problem_description.hpp @@ -227,12 +227,11 @@ struct ProblemDescription : ProblemDescriptionBase else return GetW5(GetSpatialDims(), weights.GetLengths()); } - // std::size_t GetWeightsStrideD() const { return GetD5(GetSpatialDims(), weights.GetStrides()); - // } - // std::size_t GetWeightsStrideH() const { return GetH5(GetSpatialDims(), weights.GetStrides()); - // } - // std::size_t GetWeightsStrideW() const { return GetW5(GetSpatialDims(), weights.GetStrides()); - // } + std::size_t GetWeightsStrideK() const { return GetN5(GetSpatialDims(), weights.GetStrides()); } + std::size_t GetWeightsStrideC() const { return GetC5(GetSpatialDims(), weights.GetStrides()); } + std::size_t GetWeightsStrideD() const { return GetD5(GetSpatialDims(), weights.GetStrides()); } + std::size_t GetWeightsStrideH() const { return GetH5(GetSpatialDims(), weights.GetStrides()); } + std::size_t GetWeightsStrideW() const { return GetW5(GetSpatialDims(), weights.GetStrides()); } std::string 
GetWeightsLayout() const { return weights_layout; } std::size_t GetWeightsElementSize() const { return GetTypeSize(GetWeightsDataType()); } std::size_t GetWeightsSize() const { return weights.GetNumBytes(); } diff --git a/src/include/miopen/graphapi/engine.hpp b/src/include/miopen/graphapi/engine.hpp index d077b86442..7821703a5a 100644 --- a/src/include/miopen/graphapi/engine.hpp +++ b/src/include/miopen/graphapi/engine.hpp @@ -37,6 +37,8 @@ class Engine { private: Solution mSolution; + int64_t mGlobalIndex = -1; + int32_t mSmCount = 0; friend class EngineBuilder; public: @@ -49,8 +51,58 @@ class Engine Engine(const Solution& solution) : mSolution(solution) {} Engine(Solution&& solution) : mSolution(std::move(solution)) {} - const Solution& getSolution() const { return mSolution; } - Solution& getSolution() { return mSolution; } + const Solution& getSolution() const noexcept { return mSolution; } + Solution& getSolution() noexcept { return mSolution; } + + int64_t getGlobalIndex() const noexcept { return mGlobalIndex; } + int32_t getSmCount() const noexcept { return mSmCount; } +}; + +class OpGraph; + +class EngineBuilder +{ +private: + const OpGraph* mOpGraph = nullptr; + int64_t mGlobalIndex = -1; + int32_t mSmCount = 0; + bool mGlobalIndexSet = false; + +public: + EngineBuilder& setOpGraph(const OpGraph* opGraph); + EngineBuilder& setGlobalIndex(int64_t globalIndex); + EngineBuilder& setSmCount(int32_t smCount); + Engine build(); +}; + +class BackendEngineDescriptor : public BackendDescriptor +{ +private: + EngineBuilder mBuilder; + Engine mEngine; + + miopenBackendDescriptor_t mOpGraphDescriptor = nullptr; + +public: + BackendEngineDescriptor() = default; + BackendEngineDescriptor(const Engine& engine, miopenBackendDescriptor_t opGraphDescriptor) + : mEngine(engine), mOpGraphDescriptor(opGraphDescriptor) + { + } + + void setAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t elementCount, + void* 
arrayOfElements) override; + void finalize() override; + void getAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t* elementCount, + void* arrayOfElements) override; + + const Engine& getEngine() const noexcept { return mEngine; } + Engine& getEngine() noexcept { return mEngine; } }; } // namespace graphapi diff --git a/src/include/miopen/graphapi/enginecfg.hpp b/src/include/miopen/graphapi/enginecfg.hpp new file mode 100644 index 0000000000..fa8b7359ca --- /dev/null +++ b/src/include/miopen/graphapi/enginecfg.hpp @@ -0,0 +1,131 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include + +namespace miopen { + +namespace graphapi { + +class EngineCfg +{ +private: + /* we don't use a pointer here to allow a user + * to have several configs for an Engine. Each + * config might modify its Engine in future so + * their instance of Engine shouldn't be shared + */ + Engine mEngine; + + friend class EngineCfgBuilder; + +public: + EngineCfg() = default; + EngineCfg(const EngineCfg&) = default; + EngineCfg(EngineCfg&&) = default; + EngineCfg& operator=(const EngineCfg&) = default; + EngineCfg& operator=(EngineCfg&&) = default; + + EngineCfg(const Engine& engine) : mEngine(engine) {} + EngineCfg(Engine&& engine) : mEngine(std::move(engine)) {} + + const Engine& getEngine() const noexcept { return mEngine; } + Engine& getEngine() noexcept { return mEngine; } +}; + +/* For now we don't support tuning and a builder is not needed, + * but in future it will be needed. + */ +class EngineCfgBuilder +{ +private: + EngineCfg mEngineCfg; + bool mEngineSet = false; + +public: + EngineCfgBuilder& setEngine(const Engine& engine) & + { + mEngineCfg.mEngine = engine; + mEngineSet = true; + return *this; + } + EngineCfgBuilder& setEngine(Engine&& engine) & + { + mEngineCfg.mEngine = std::move(engine); + mEngineSet = true; + return *this; + } + EngineCfgBuilder&& setEngine(const Engine& engine) && { return std::move(setEngine(engine)); } + EngineCfgBuilder&& setEngine(Engine&& engine) && + { + return std::move(setEngine(std::move(engine))); + } + EngineCfg build() &; + EngineCfg build() &&; +}; + +class BackendEngineCfgDescriptor : public BackendDescriptor +{ +protected: + EngineCfgBuilder mBuilder; + EngineCfg mEngineCfg; + + miopenBackendDescriptor_t mEngineDescriptor = nullptr; + + BackendEngineCfgDescriptor(const EngineCfg& engineCfg, + miopenBackendDescriptor_t engineDescriptor) + : mEngineCfg(engineCfg), mEngineDescriptor(engineDescriptor) + { + 
mFinalized = true; + } + BackendEngineCfgDescriptor(EngineCfg&& engineCfg, miopenBackendDescriptor_t engineDescriptor) + : mEngineCfg(std::move(engineCfg)), mEngineDescriptor(engineDescriptor) + { + mFinalized = true; + } + +public: + BackendEngineCfgDescriptor() = default; + void setAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t elementCount, + void* arrayOfElements) override; + void finalize() override; + void getAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t* elementCount, + void* arrayOfElements) override; + + const EngineCfg& getEngineCfg() const { return mEngineCfg; } + EngineCfg& getEngineCfg() { return mEngineCfg; } +}; + +} // namespace graphapi + +} // namespace miopen diff --git a/src/include/miopen/graphapi/engineheur.hpp b/src/include/miopen/graphapi/engineheur.hpp new file mode 100644 index 0000000000..437029a9e5 --- /dev/null +++ b/src/include/miopen/graphapi/engineheur.hpp @@ -0,0 +1,116 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace miopen { + +namespace graphapi { + +class EngineHeur +{ +private: + OpGraph* mOpGraph = nullptr; /* never indeterminate after default construction */ + std::vector mResults; + int32_t mSmCount = 0; + miopenBackendHeurMode_t mMode = miopenBackendHeurMode_t(0); + + friend class EngineHeurBuilder; + +public: + EngineHeur() noexcept = default; + EngineHeur(const EngineHeur&) = default; + EngineHeur(EngineHeur&&) noexcept = default; + EngineHeur& operator=(const EngineHeur&) = default; + EngineHeur& operator=(EngineHeur&&) noexcept = default; + + OpGraph* getOpgraph() const noexcept { return mOpGraph; } + miopenBackendHeurMode_t getMode() const noexcept { return mMode; } + const std::vector& getResults() const noexcept { return mResults; } + std::vector& getResults() noexcept { return mResults; } + int32_t getSmCount() const noexcept { return mSmCount; } +}; + +class EngineHeurBuilder +{ +private: + EngineHeur mEngineHeur; + bool mModeSet = false; + +public: + EngineHeurBuilder& setOpGraph(OpGraph* opGraph); + EngineHeurBuilder& setMode(miopenBackendHeurMode_t mode); + EngineHeurBuilder& setSmCount(int32_t smCount); + EngineHeur build(); +}; + +class BackendEngineHeurDescriptor : public BackendDescriptor +{ +private: + EngineHeurBuilder mBuilder; + EngineHeur mEngineHeur; + + miopenBackendDescriptor_t mOpGraphDescriptor = nullptr; + + class OwnedEngineCfgDescriptor : public 
BackendEngineCfgDescriptor + { + private: + BackendEngineDescriptor mOwnedEngineDescriptorInstance; + + public: + OwnedEngineCfgDescriptor(EngineCfg&& engineCfg, + miopenBackendDescriptor_t opGraphDescriptor); + OwnedEngineCfgDescriptor(const OwnedEngineCfgDescriptor& other); + OwnedEngineCfgDescriptor(OwnedEngineCfgDescriptor&& other) noexcept; + OwnedEngineCfgDescriptor& operator=(const OwnedEngineCfgDescriptor& other); + OwnedEngineCfgDescriptor& operator=(OwnedEngineCfgDescriptor&& other) noexcept; + }; + + std::vector mResults; + +public: + void setAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t elementCount, + void* arrayOfElements) override; + void finalize() override; + void getAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t* elementCount, + void* arrayOfElements) override; +}; + +} // namespace graphapi + +} // namespace miopen diff --git a/src/include/miopen/graphapi/execution_plan.hpp b/src/include/miopen/graphapi/execution_plan.hpp new file mode 100644 index 0000000000..c4661243af --- /dev/null +++ b/src/include/miopen/graphapi/execution_plan.hpp @@ -0,0 +1,138 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace miopen { + +namespace graphapi { + +class ExecutionPlan +{ +private: + /* we don't use a pointer for mEngineCfg + * because we need to support serialization + * and deserialization + */ + EngineCfg mEngineCfg; + miopenHandle_t mHandle = nullptr; + std::vector mIntermediateIds; + int64_t mWorkspaceSize = 0; + + friend class ExecutionPlanBuilder; + +public: + ExecutionPlan() = default; + ExecutionPlan(const ExecutionPlan&) = default; + ExecutionPlan(ExecutionPlan&&) = default; + ExecutionPlan& operator=(const ExecutionPlan&) = default; + ExecutionPlan& operator=(ExecutionPlan&&) = default; + + miopenHandle_t getHandle() const noexcept { return mHandle; } + const EngineCfg& getEngineCfg() const noexcept { return mEngineCfg; } + EngineCfg& getEngineCfg() noexcept { return mEngineCfg; } + const std::vector& getIntermediateIds() const noexcept { return mIntermediateIds; } + int64_t getWorkspaceSize() const { return mWorkspaceSize; } + std::string getJsonRepresentation() const; + + void execute(const VariantPack& variantPack); +}; + +class ExecutionPlanBuilder +{ +private: + ExecutionPlan mExecutionPlan; + bool mEngineCfgSet = false; + +public: + ExecutionPlanBuilder& setHandle(miopenHandle_t handle) &; + ExecutionPlanBuilder& setEngineCfg(const EngineCfg& engineCfg) &; + ExecutionPlanBuilder& setEngineCfg(EngineCfg&& engineCfg) &; + ExecutionPlanBuilder& setIntermediateIds(const std::vector& ids) &; + ExecutionPlanBuilder& setIntermediateIds(std::vector&& ids) &; + ExecutionPlanBuilder& setJsonRepresentation(const std::string_view& s) &; + + ExecutionPlanBuilder&& setHandle(miopenHandle_t handle) && + { + return std::move(setHandle(handle)); + } + ExecutionPlanBuilder&& setEngineCfg(const EngineCfg& engineCfg) && + { + return std::move(setEngineCfg(engineCfg)); + } + ExecutionPlanBuilder&& 
setEngineCfg(EngineCfg&& engineCfg) && + { + return std::move(setEngineCfg(std::move(engineCfg))); + } + ExecutionPlanBuilder&& setIntermediateIds(const std::vector& ids) && + { + return std::move(setIntermediateIds(ids)); + } + ExecutionPlanBuilder&& setIntermediateIds(std::vector&& ids) && + { + return std::move(setIntermediateIds(std::move(ids))); + } + ExecutionPlanBuilder&& setJsonRepresentation(const std::string_view& s) && + { + return std::move(setJsonRepresentation(s)); + } + + ExecutionPlan build() &; + ExecutionPlan build() &&; +}; + +class BackendExecutionPlanDescriptor : public BackendDescriptor +{ +private: + ExecutionPlanBuilder mBuilder; + ExecutionPlan mExecutionPlan; + + miopenBackendDescriptor_t mEngineCfgDescriptor = nullptr; + +public: + void setAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t elementCount, + void* arrayOfElements) override; + void finalize() override; + void getAttribute(miopenBackendAttributeName_t attributeName, + miopenBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t* elementCount, + void* arrayOfElements) override; + void execute(miopenHandle_t handle, miopenBackendDescriptor_t variantPack) override; +}; + +} // namespace graphapi + +} // namespace miopen diff --git a/src/include/miopen/handle.hpp b/src/include/miopen/handle.hpp index 5b82e88d3d..506f3d319b 100644 --- a/src/include/miopen/handle.hpp +++ b/src/include/miopen/handle.hpp @@ -125,7 +125,7 @@ struct MIOPEN_EXPORT Handle : miopenHandle return this->Run(ks.front()); } - KernelInvoke Run(Kernel k) const; + KernelInvoke Run(Kernel k, bool coop_launch = false) const; const std::vector& GetKernelsImpl(const std::string& algorithm, const std::string& network_config) const; @@ -154,6 +154,7 @@ struct MIOPEN_EXPORT Handle : miopenHandle std::size_t m_MaxMemoryAllocSizeCached = 0; std::size_t GetMaxMemoryAllocSize(); + bool CooperativeLaunchSupported() const; std::string 
GetDeviceName() const; const TargetProperties& GetTargetProperties() const; diff --git a/src/include/miopen/hipoc_kernel.hpp b/src/include/miopen/hipoc_kernel.hpp index cb42faf3a1..9ff4e4d22d 100644 --- a/src/include/miopen/hipoc_kernel.hpp +++ b/src/include/miopen/hipoc_kernel.hpp @@ -110,26 +110,29 @@ struct KernelArgs struct HIPOCKernelInvoke { - hipStream_t stream = nullptr; - hipFunction_t fun = nullptr; - std::array ldims = {}; - std::array gdims = {}; - std::string name; - std::function callback; - - // Workaround for aggregate types in c++11 HIPOCKernelInvoke() {} HIPOCKernelInvoke(hipStream_t pstream, hipFunction_t pfun, std::array pldims, std::array pgdims, std::string pname, - std::function pcallback) - : stream(pstream), fun(pfun), ldims(pldims), gdims(pgdims), name(pname), callback(pcallback) + std::function pcallback, + bool pcoop_launch) + : stream(pstream), + fun(pfun), + ldims(pldims), + gdims(pgdims), + name(pname), + callback(pcallback), + coop_launch(pcoop_launch) { } + void operator()(std::vector& any_args) const { + if(coop_launch) + MIOPEN_THROW(miopenStatusNotImplemented); + char hip_args[256] = {0}; auto sz_left = any_args[0].size(); @@ -152,13 +155,35 @@ struct HIPOCKernelInvoke template void operator()(Ts... 
xs) const { - KernelArgs args{xs...}; - run(&args, sizeof(args)); + if(coop_launch) + { + auto args = std::array{(&xs)...}; + run_cooperative(args.data()); + } + else + { + KernelArgs args{xs...}; + run(&args, sizeof(args)); + } } - void run(void* args, std::size_t size) const; + void SetLocalDims(size_t dim_x, size_t dim_y, size_t dim_z) { ldims = {dim_x, dim_y, dim_z}; } + + void SetGlobalDims(size_t dim_x, size_t dim_y, size_t dim_z) { gdims = {dim_x, dim_y, dim_z}; } const std::string& GetName() const { return name; } + +private: + void run(void* args, std::size_t size) const; + void run_cooperative(void** kern_args) const; + + hipStream_t stream = nullptr; + hipFunction_t fun = nullptr; + std::array ldims = {}; + std::array gdims = {}; + std::string name; + std::function callback; + bool coop_launch = false; /* defaulted: HIPOCKernelInvoke() {} sets no members and operator() reads this flag */ }; struct HIPOCKernel @@ -196,7 +221,8 @@ struct HIPOCKernel } HIPOCKernelInvoke Invoke(hipStream_t stream, - std::function callback = nullptr) const; + std::function callback = nullptr, + bool coop_launch = false) const; }; } // namespace miopen diff --git a/src/include/miopen/layernorm.hpp b/src/include/miopen/layernorm.hpp index 3a8bf54a90..0506886f28 100644 --- a/src/include/miopen/layernorm.hpp +++ b/src/include/miopen/layernorm.hpp @@ -51,4 +51,4 @@ miopenStatus_t LayerNormForward(Handle& handle, int32_t normalized_dim); } // namespace miopen -#endif // _MIOPEN_LAYERNORM_HPP_ +#endif // MIOPEN_LAYERNORM_HPP_ diff --git a/src/include/miopen/layernorm/invoke_params.hpp b/src/include/miopen/layernorm/invoke_params.hpp index b97bac7d08..5cdff22dcc 100644 --- a/src/include/miopen/layernorm/invoke_params.hpp +++ b/src/include/miopen/layernorm/invoke_params.hpp @@ -52,6 +52,64 @@ struct InvokeParams : public miopen::InvokeParams Data_t GetWorkspace() const { return nullptr; } }; +struct AddInvokeParams : public miopen::InvokeParams +{ + AddInvokeParams() = default; + + const TensorDescriptor* xDesc = nullptr; + + ConstData_t x = nullptr; + ConstData_t x2 = nullptr;
+ ConstData_t weight = nullptr; + ConstData_t bias = nullptr; + Data_t y = nullptr; + Data_t mean = nullptr; + Data_t rstd = nullptr; + float epsilon = 0; + int32_t normalized_dim = 0; + miopenNormMode_t mode = MIOPEN_ELEMENTWISE_AFFINE; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +struct T5InvokeParams : public miopen::InvokeParams +{ + T5InvokeParams() = default; + + const TensorDescriptor* xDesc = nullptr; + + ConstData_t x = nullptr; + ConstData_t weight = nullptr; + Data_t y = nullptr; + Data_t rstd = nullptr; + float epsilon = 0; + miopenNormMode_t mode = MIOPEN_ELEMENTWISE_AFFINE; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +struct T5BwdInvokeParams : public miopen::InvokeParams +{ + T5BwdInvokeParams() = default; + + const TensorDescriptor* dyDesc = nullptr; + + ConstData_t dy = nullptr; + ConstData_t x = nullptr; + ConstData_t weight = nullptr; + ConstData_t rstd = nullptr; + Data_t dx = nullptr; + Data_t dw = nullptr; + Data_t workspace = nullptr; + std::size_t workspace_size = 0; + miopenNormMode_t mode = MIOPEN_ELEMENTWISE_AFFINE; + + std::size_t GetWorkspaceSize() const { return workspace_size; } + Data_t GetWorkspace() const { return workspace; } +}; + } // namespace layernorm } // namespace miopen diff --git a/src/include/miopen/layernorm/problem_description.hpp b/src/include/miopen/layernorm/problem_description.hpp index 78a631b292..2c09f7cc40 100644 --- a/src/include/miopen/layernorm/problem_description.hpp +++ b/src/include/miopen/layernorm/problem_description.hpp @@ -37,6 +37,12 @@ struct NetworkConfig; namespace layernorm { +enum class Direction +{ + Forward, + Backward, +}; + struct ProblemDescription : ProblemDescriptionBase { ProblemDescription(miopenNormMode_t mode_, @@ -60,41 +66,133 @@ struct ProblemDescription : ProblemDescriptionBase { } + ProblemDescription(miopenNormMode_t mode_, + const 
TensorDescriptor& xDesc_, + const TensorDescriptor& x2Desc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& biasDesc_, + const TensorDescriptor& yDesc_, + const TensorDescriptor& meanDesc_, + const TensorDescriptor& rstdDesc_, + float epsilon_, + int32_t normalized_dim_) + : mode(mode_), + xDesc(xDesc_), + x2Desc(x2Desc_), + weightDesc(weightDesc_), + biasDesc(biasDesc_), + yDesc(yDesc_), + meanDesc(meanDesc_), + rstdDesc(rstdDesc_), + epsilon(epsilon_), + normalized_dim(normalized_dim_) + { + } + + ProblemDescription(miopenNormMode_t mode_, + const TensorDescriptor& xDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& yDesc_, + const TensorDescriptor& rstdDesc_, + float epsilon_) + : direction(Direction::Forward), + mode(mode_), + xDesc(xDesc_), + weightDesc(weightDesc_), + yDesc(yDesc_), + rstdDesc(rstdDesc_), + epsilon(epsilon_) + { + } + + ProblemDescription(miopenNormMode_t mode_, + const TensorDescriptor& dyDesc_, + const TensorDescriptor& xDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& rstdDesc_, + const TensorDescriptor& dxDesc_, + const TensorDescriptor& dwDesc_) + : direction(Direction::Backward), + mode(mode_), + xDesc(xDesc_), + weightDesc(weightDesc_), + rstdDesc(rstdDesc_), + dyDesc(dyDesc_), + dxDesc(dxDesc_), + dwDesc(dwDesc_) + { + } + + Direction GetDirection() const { return direction; } miopenNormMode_t GetMode() const { return mode; } const TensorDescriptor& GetXDesc() const { return xDesc; } + const TensorDescriptor& GetX2Desc() const { return x2Desc; } const TensorDescriptor& GetWeightDesc() const { return weightDesc; } const TensorDescriptor& GetBiasDesc() const { return biasDesc; } const TensorDescriptor& GetYDesc() const { return yDesc; } const TensorDescriptor& GetMeanDesc() const { return meanDesc; } const TensorDescriptor& GetRstdDesc() const { return rstdDesc; } + const TensorDescriptor& GetDYDesc() const { return dyDesc; } + const TensorDescriptor& GetDXDesc() const { 
return dxDesc; } + const TensorDescriptor& GetDWDesc() const { return dwDesc; } float GetEpsilon() const { return epsilon; } int32_t GetNormalizedDim() const { return normalized_dim; } bool IsSameType() const { - if(xDesc.GetType() != yDesc.GetType()) + if(direction == Direction::Forward) { + if(xDesc.GetType() != yDesc.GetType()) + { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, "LayerNormForward: Tensor types do not match."); + MIOPEN_THROW(miopenStatusBadParm, "LayerNormForward: Tensor types do not match."); #else - return false; + return false; +#endif + } + } + else + { + if(dyDesc.GetType() != dxDesc.GetType()) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, "LayerNormBackward: Tensor types do not match."); +#else + return false; #endif + } } return true; } bool IsSameLength() const { - if(xDesc.GetLengths() != yDesc.GetLengths()) + if(direction == Direction::Forward) { + if(xDesc.GetLengths() != yDesc.GetLengths()) + { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "LayerNormForward: Tensor dimension lengths do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "LayerNormForward: Tensor dimension lengths do not match."); #else - return false; + return false; #endif + } + return true; + } + else + { + if(dyDesc.GetLengths() != dxDesc.GetLengths()) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "LayerNormBackward: Tensor dimension lengths do not match."); +#else + return false; +#endif + } + return true; } - return true; } bool IsRightNormDim() const @@ -115,14 +213,31 @@ struct ProblemDescription : ProblemDescriptionBase bool IsAllPacked() const { - if(!(xDesc.IsPacked() && weightDesc.IsPacked() && biasDesc.IsPacked() && yDesc.IsPacked() && - meanDesc.IsPacked() && rstdDesc.IsPacked())) + if(direction == Direction::Forward) { + if(!(xDesc.IsPacked() && weightDesc.IsPacked() && biasDesc.IsPacked() && + yDesc.IsPacked() && 
meanDesc.IsPacked() && rstdDesc.IsPacked())) + { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, "LayerNormForward: Unpacked tensors not supported."); + MIOPEN_THROW(miopenStatusBadParm, + "LayerNormForward: Unpacked tensors not supported."); #else - return false; + return false; +#endif + } + } + else + { + if(!(dyDesc.IsPacked() && xDesc.IsPacked() && weightDesc.IsPacked() && + rstdDesc.IsPacked() && dxDesc.IsPacked() && dwDesc.IsPacked())) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "LayerNormBackward: Unpacked tensors not supported."); +#else + return false; #endif + } } return true; } @@ -143,13 +258,18 @@ struct ProblemDescription : ProblemDescriptionBase NetworkConfig MakeNetworkConfig() const override; private: + Direction direction = Direction::Forward; /* defaulted: the constructors taking meanDesc_ never set it, and IsSameType/IsSameLength/IsAllPacked branch on it; Forward keeps their pre-existing checks */ miopenNormMode_t mode; TensorDescriptor xDesc; + TensorDescriptor x2Desc; TensorDescriptor weightDesc; TensorDescriptor biasDesc; TensorDescriptor yDesc; TensorDescriptor meanDesc; TensorDescriptor rstdDesc; + TensorDescriptor dyDesc; + TensorDescriptor dxDesc; + TensorDescriptor dwDesc; float epsilon; int32_t normalized_dim; diff --git a/src/include/miopen/layernorm/solvers.hpp b/src/include/miopen/layernorm/solvers.hpp index 503bb87fb6..f386e456b2 100644 --- a/src/include/miopen/layernorm/solvers.hpp +++ b/src/include/miopen/layernorm/solvers.hpp @@ -68,6 +68,40 @@ struct Layernorm4DCKForward final : NormalizationSolver const miopen::layernorm::ProblemDescription& problem) const override; }; +struct AddLayernormForward final : NormalizationSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::layernorm::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::layernorm::ProblemDescription& problem) const override; +}; + +struct T5LayernormForward final : NormalizationSolver +{ + const std::string&
SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::layernorm::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::layernorm::ProblemDescription& problem) const override; +}; + +struct T5LayernormBackward final : NormalizationSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::layernorm::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::layernorm::ProblemDescription& problem) const override; + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::layernorm::ProblemDescription& problem) const override; + bool MayNeedWorkspace() const override { return true; } +}; + } // namespace layernorm } // namespace solver diff --git a/src/include/miopen/layernorm/utils.hpp b/src/include/miopen/layernorm/utils.hpp new file mode 100644 index 0000000000..1e40ea1e0b --- /dev/null +++ b/src/include/miopen/layernorm/utils.hpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_LAYERNORM_UTILS_HPP_ +#define MIOPEN_LAYERNORM_UTILS_HPP_ + +#include + +namespace miopen { +namespace solver { +namespace layernorm { + +#define LOCAL_SIZE 256 + +inline std::size_t sizeof_kernel_FLOAT(const miopen::layernorm::ProblemDescription& problem) +{ + const auto datatype = problem.GetXDesc().GetType(); + return get_data_size(datatype); +} + +inline std::size_t sizeof_local_memory(const miopen::layernorm::ProblemDescription& problem) +{ + std::size_t rv = 0; + rv += LOCAL_SIZE * sizeof_kernel_FLOAT(problem) * 2; + return rv; +} + +inline std::size_t sizeof_local_memory_t5(const miopen::layernorm::ProblemDescription& problem) +{ + std::size_t rv = 0; + rv += LOCAL_SIZE * sizeof_kernel_FLOAT(problem); + return rv; +} + +inline size_t get_reqd_work_item_cnt(const ExecutionContext& context) +{ + // At least 4 WGs per one CU + return static_cast(LOCAL_SIZE * context.GetStream().GetMaxComputeUnits() * 4); +} + +inline size_t get_reqd_work_item_cnt(const Handle& handle) +{ + // At least 4 WGs per one CU + return static_cast(LOCAL_SIZE * handle.GetMaxComputeUnits() * 4); +} + +inline size_t get_parallelism_size(size_t reqd_work_item_cnt, size_t inner_size, size_t outer_size) +{ + size_t parallelism_size = 1ULL; + while(parallelism_size * inner_size < reqd_work_item_cnt && + parallelism_size < std::sqrt(outer_size)) + { + parallelism_size *= 2ULL; + } + return parallelism_size; +} 
+ +inline bool is_parallelism(size_t reqd_work_item_cnt, size_t inner_size, size_t outer_size) +{ + return !(inner_size > reqd_work_item_cnt) && (inner_size * outer_size > reqd_work_item_cnt); +} + +} // namespace layernorm +} // namespace solver +} // namespace miopen + +#endif // MIOPEN_LAYERNORM_UTILS_HPP_ diff --git a/src/include/miopen/reduce/invoke_params.hpp b/src/include/miopen/reduce/invoke_params.hpp index 6ad0884dfd..11923fdd79 100644 --- a/src/include/miopen/reduce/invoke_params.hpp +++ b/src/include/miopen/reduce/invoke_params.hpp @@ -36,11 +36,13 @@ struct InvokeParams : public miopen::InvokeParams { InvokeParams() = default; - const TensorDescriptor* xDesc = nullptr; - const TensorDescriptor* yDesc = nullptr; + const TensorDescriptor* xDesc = nullptr; + const TensorDescriptor* yDesc = nullptr; + const TensorDescriptor* indiceDesc = nullptr; ConstData_t x = nullptr; Data_t y = nullptr; + Data_t indice = nullptr; Data_t workspace = nullptr; std::size_t workspace_size = 0; int32_t dim = 0; diff --git a/src/include/miopen/reduce/problem_description.hpp b/src/include/miopen/reduce/problem_description.hpp index 0f69b06bab..03001a155b 100644 --- a/src/include/miopen/reduce/problem_description.hpp +++ b/src/include/miopen/reduce/problem_description.hpp @@ -47,65 +47,102 @@ struct ProblemDescription : ProblemDescriptionBase { } - ProblemDescription(const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, int32_t dim_) - : xDesc(xDesc_), yDesc(yDesc_), dim(dim_) + ProblemDescription(const TensorDescriptor& xDesc_, + const TensorDescriptor& yDesc_, + const TensorDescriptor& indiceDesc_, + int32_t dim_, + miopenReduceExtremeOp_t reduceExtremeOp_) + : xDesc(xDesc_), + yDesc(yDesc_), + indiceDesc(indiceDesc_), + dim(dim_), + reduceExtremeOp(reduceExtremeOp_) + { + } + + ProblemDescription(const TensorDescriptor& xDesc_, + const TensorDescriptor& indiceDesc_, + int32_t dim_, + miopenReduceExtremeOp_t reduceExtremeOp_) + : xDesc(xDesc_),
indiceDesc(indiceDesc_), dim(dim_), reduceExtremeOp(reduceExtremeOp_) { } miopenSumNanPropagation_t GetNanPropagation_() const { return nanPropagation; } const TensorDescriptor& GetXDesc() const { return xDesc; } const TensorDescriptor& GetYDesc() const { return yDesc; } + const TensorDescriptor& GetIndiceDesc() const { return indiceDesc; } int32_t GetDim() const { return dim; } - bool IsSameType() const + bool IsValidLength() const { - if(xDesc.GetType() != yDesc.GetType()) + if(xDesc.GetLengths().size() == 1) + return true; + + int32_t posy = 0; + for(int32_t i = 0; i < xDesc.GetLengths().size(); ++i) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, "Reduce: Tensor types do not match."); -#else - return false; -#endif + if(i == dim) + continue; + + if(xDesc.GetLengths()[i] != yDesc.GetLengths()[posy]) + { + MIOPEN_THROW(miopenStatusBadParm, "Reduce: Tensor dimension lengths do not match."); + } + + ++posy; } return true; } - bool IsRightLength() const + bool IsValidLengthIndice() const { if(xDesc.GetLengths().size() == 1) return true; int32_t posy = 0; - for(int32_t i = 0; i < xDesc.GetLengths().size(); i++) + for(int32_t i = 0; i < xDesc.GetLengths().size(); ++i) { if(i == dim) continue; - if(xDesc.GetLengths()[i] != yDesc.GetLengths()[posy]) + if(xDesc.GetLengths()[i] != indiceDesc.GetLengths()[posy]) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Reduce: Tensor dimension lengths do not match."); -#else - return false; -#endif } - posy++; + ++posy; } return true; } - bool IsRightDim() const + bool IsValidDim() const { if((dim < 0) || (dim > xDesc.GetLengths().size())) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW( miopenStatusBadParm, "Reduce: is greater than 0 and less than or equal tensor dimension length."); -#else + } + return true; + } + + bool IsValidInputNumel() const + { + auto xdims = xDesc.GetLengths(); + auto input_numel = + std::accumulate(xdims.begin(), xdims.end(), 1ULL, 
std::multiplies()); + if(input_numel > INT32_MAX) + MIOPEN_THROW(miopenStatusBadParm, "Reduce: input numel is bigger than INT_MAX."); + + return true; + } + + bool IsSameType() const + { + if(xDesc.GetType() != yDesc.GetType()) + { return false; -#endif } return true; } @@ -114,12 +151,29 @@ struct ProblemDescription : ProblemDescriptionBase { if(!(xDesc.IsPacked() && yDesc.IsPacked())) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, "Reduce: Unpacked tensors not supported."); -#else return false; -#endif } + + return true; + } + + bool IsAllPackedWithIndice() const + { + if(!(xDesc.IsPacked() && yDesc.IsPacked() && indiceDesc.IsPacked())) + { + return false; + } + + return true; + } + + bool IsAllPackedIndice() const + { + if(!(xDesc.IsPacked() && indiceDesc.IsPacked())) + { + return false; + } + return true; } @@ -130,14 +184,23 @@ struct ProblemDescription : ProblemDescriptionBase return true; } + bool IsLargeReduceSize() const + { + if(xDesc.GetLengths()[dim] > 64) + return false; + return true; + } + NetworkConfig MakeNetworkConfig() const override; private: miopenSumNanPropagation_t nanPropagation; TensorDescriptor xDesc; TensorDescriptor yDesc; + TensorDescriptor indiceDesc; int32_t dim; + miopenReduceExtremeOp_t reduceExtremeOp; NetworkConfig MakeForwardNetworkConfig() const; }; diff --git a/src/include/miopen/reduce/solvers.hpp b/src/include/miopen/reduce/solvers.hpp index 9aa584233d..e17f93124d 100644 --- a/src/include/miopen/reduce/solvers.hpp +++ b/src/include/miopen/reduce/solvers.hpp @@ -53,6 +53,48 @@ struct SumForward final : ReduceSolver struct ArgmaxForward final : ReduceSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } + size_t XGridSize(std::vector indicedims) const; + bool OverMaxGridSize(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const; + + bool IsApplicable(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& 
problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const override; +}; + +struct ArgminForward final : ReduceSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + size_t XGridSize(std::vector indicedims) const; + bool OverMaxGridSize(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const; + + bool IsApplicable(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const override; +}; + +struct MaxForward final : ReduceSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + size_t XGridSize(std::vector ydims) const; + bool OverMaxGridSize(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const; + + bool IsApplicable(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const override; +}; + +struct MinForward final : ReduceSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + size_t XGridSize(std::vector ydims) const; + bool OverMaxGridSize(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const; bool IsApplicable(const ExecutionContext& context, const miopen::reduce::ProblemDescription& problem) const override; diff --git a/src/include/miopen/reduce/utils.hpp b/src/include/miopen/reduce/utils.hpp new file mode 100644 index 0000000000..a3280c062c --- /dev/null +++ b/src/include/miopen/reduce/utils.hpp @@ -0,0 +1,71 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced 
Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_REDUCE_UTILS_HPP_ +#define MIOPEN_REDUCE_UTILS_HPP_ + +#include + +namespace miopen { +namespace solver { +namespace reduce { + +#define LOCAL_SIZE 256 + +inline size_t get_reqd_work_item_cnt(const ExecutionContext& context) +{ + // At least 4 WGs per one CU + return static_cast(LOCAL_SIZE * context.GetStream().GetMaxComputeUnits() * 4); +} + +inline size_t get_reqd_work_item_cnt(const Handle& handle) +{ + // At least 4 WGs per one CU + return static_cast(LOCAL_SIZE * handle.GetMaxComputeUnits() * 4); +} + +inline size_t +get_parallelism_size(size_t reqd_work_item_cnt, size_t output_numel, size_t reduce_size) +{ + size_t parallelism_size = 1ULL; + while(parallelism_size * output_numel < reqd_work_item_cnt && + parallelism_size < std::sqrt(reduce_size)) + { + parallelism_size *= 2ULL; + } + return parallelism_size; +} + +inline bool is_parallelism(size_t reqd_work_item_cnt, size_t output_numel, size_t reduce_size) +{ + return !(output_numel > reqd_work_item_cnt) && + (output_numel * reduce_size > reqd_work_item_cnt); +} + +} // namespace reduce +} // namespace solver +} // namespace miopen + +#endif // MIOPEN_REDUCE_UTILS_HPP_ diff --git a/src/include/miopen/reduceextreme.hpp b/src/include/miopen/reduceextreme.hpp new file mode 100644 index 0000000000..b071a21286 --- /dev/null +++ b/src/include/miopen/reduceextreme.hpp @@ -0,0 +1,55 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_REDUCEEXTREME_HPP_ +#define MIOPEN_REDUCEEXTREME_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +miopenStatus_t ReduceExtremeForward(Handle& handle, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& indiceDesc, + Data_t indice, + int32_t dim, + miopenReduceExtremeOp_t reduceExtremeOp); + +miopenStatus_t ReduceExtremeForward(Handle& handle, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& indiceDesc, + Data_t indice, + int32_t dim, + miopenReduceExtremeOp_t reduceExtremeOp); + +} // namespace miopen +#endif // MIOPEN_REDUCEEXTREME_HPP_ diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index b626a5702d..e4ba4d4383 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -2352,12 +2352,12 @@ struct ConvWinoFuryRxS final : ConvSolver const miopen::conv::ProblemDescription&) const override; bool IsDynamic() const override { return true; } float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, + const miopen::conv::ProblemDescription&) const override; + bool MayNeedWorkspace() const override { return true; } ConvSolution GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - static constexpr bool is2x3() { return Winodata == 2 && Winofilter == 3; } - static constexpr bool is3x2() { return Winodata == 3 && Winofilter == 2; } }; // Suppress misleading clang warnings diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index c52dc020ac..2ad6e619d7 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -56,7 +56,8 @@ enum class Primitive Reduce, Cat, Mha, - Softmax + Softmax, + Adam }; struct 
MIOPEN_EXPORT Id diff --git a/src/include/miopen/t5layernorm.hpp b/src/include/miopen/t5layernorm.hpp new file mode 100644 index 0000000000..62424244f6 --- /dev/null +++ b/src/include/miopen/t5layernorm.hpp @@ -0,0 +1,75 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_T5LAYERNORM_HPP_ +#define MIOPEN_T5LAYERNORM_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +miopenStatus_t T5LayerNormForward(Handle& handle, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& rstdDesc, + Data_t rstd, + miopenNormMode_t mode, + float epsilon); + +std::size_t GetT5LayerNormBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& dyDesc, + const TensorDescriptor& xDesc, + const TensorDescriptor& weightDesc, + const TensorDescriptor& rstdDesc, + const TensorDescriptor& dxDesc, + const TensorDescriptor& dwDesc, + miopenNormMode_t mode); + +miopenStatus_t T5LayerNormBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& dyDesc, + ConstData_t dy, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& rstdDesc, + ConstData_t rstd, + const TensorDescriptor& dxDesc, + Data_t dx, + const TensorDescriptor& dwDesc, + Data_t dw, + miopenNormMode_t mode); + +} // namespace miopen +#endif // MIOPEN_T5LAYERNORM_HPP_ diff --git a/src/kernels/MIOpenAdam.cpp b/src/kernels/MIOpenAdam.cpp new file mode 100644 index 0000000000..71e5b8ef45 --- /dev/null +++ b/src/kernels/MIOpenAdam.cpp @@ -0,0 +1,378 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include // NOTE(review): this directive and the next one carry no header name in this copy - TODO confirm against the repository
+#include
+#endif
+
+#include "float_types.h"
+// AdamInternal: one Adam update for element `gid`. Reads the *_in arrays and writes the new parameter and moment estimates to the *_out arrays.
+template // NOTE(review): the template parameter list and the static_cast target types are missing in this copy; T1 looks like the storage type and T2 the arithmetic type - TODO confirm
+inline __device__ void AdamInternal(T1* param_in,
+                                    T1* param_out,
+                                    T1* exp_avg_in,
+                                    T1* exp_avg_out,
+                                    T1* exp_avg_sq_in,
+                                    T1* exp_avg_sq_out,
+                                    T1* max_exp_avg_sq_in, // only read when amsgrad is true
+                                    T1* max_exp_avg_sq_out, // only written when amsgrad is true
+                                    T2 grad,
+                                    T2 lr,
+                                    T2 beta1,
+                                    T2 beta2,
+                                    T2 weight_decay,
+                                    T2 eps,
+                                    uint32_t step,
+                                    bool amsgrad,
+                                    bool maximize,
+                                    size_t gid)
+{
+    T2 param      = static_cast(param_in[gid]);
+    T2 exp_avg    = static_cast(exp_avg_in[gid]);
+    T2 exp_avg_sq = static_cast(exp_avg_sq_in[gid]);
+
+    __builtin_assume(exp_avg_sq >= 0 && exp_avg_sq <= 1); // NOTE(review): nothing in this function enforces the upper bound (the update below adds grad * grad * (1 - beta2)); if a stored value can exceed 1 the assumption is false and behavior is undefined - TODO confirm the intended range
+    __builtin_assume(beta1 >= 0);
+    __builtin_assume(beta2 >= 0);
+
+    T2 bias_correction1 = 1 - pow(beta1, step); // 1 - beta1^step
+    T2 bias_correction2 = 1 - pow(beta2, step); // 1 - beta2^step
+
+    if(maximize) // maximizing: use the negated gradient
+        grad *= -1;
+    if(weight_decay != 0) // add weight_decay * param to the gradient
+        grad += param * weight_decay;
+
+    exp_avg    = exp_avg * beta1 + grad * (1 - beta1); // running average of the gradient
+    exp_avg_sq = exp_avg_sq * beta2 + grad * grad * (1 - beta2); // running average of the squared gradient
+
+    T2 denom;
+    if(amsgrad)
+    {
+        T2 max_exp_avg_sq = static_cast(max_exp_avg_sq_in[gid]);
+        __builtin_assume(max_exp_avg_sq >= 0 && max_exp_avg_sq <= 1); // NOTE(review): same unverified upper bound as for exp_avg_sq
+        max_exp_avg_sq = max(max_exp_avg_sq, exp_avg_sq); // keep the larger of the stored maximum and the new second moment
+        max_exp_avg_sq_out[gid] = static_cast(max_exp_avg_sq);
+        denom = sqrt(max_exp_avg_sq) / sqrt(bias_correction2) + eps;
+    }
+    else
+    {
+        denom = sqrt(exp_avg_sq) / sqrt(bias_correction2) + eps;
+    }
+
+    T2 step_size = lr / bias_correction1; // learning rate with the first-moment bias correction applied
+    param        = param - step_size * exp_avg / denom;
+
+    param_out[gid]      = static_cast(param);
+    exp_avg_out[gid]    = static_cast(exp_avg);
+    exp_avg_sq_out[gid] = static_cast(exp_avg_sq);
+}
+// AdamPacked: kernel entry. Each thread starts at its global index and advances by the total number of launched threads until input_size, calling AdamInternal per element.
+extern "C" __global__ void AdamPacked(PTYPE* param_in,
+                                      PTYPE* param_out,
+                                      PTYPE* grad_in,
+                                      PTYPE* exp_avg_in,
+                                      PTYPE* exp_avg_out,
+                                      PTYPE* exp_avg_sq_in,
+                                      PTYPE* exp_avg_sq_out,
+                                      PTYPE* max_exp_avg_sq_in,
+                                      PTYPE* max_exp_avg_sq_out,
+                                      float lr,
+                                      float beta1,
+                                      float beta2,
+                                      float weight_decay,
+                                      float eps,
+                                      uint32_t step,
+                                      bool amsgrad,
+                                      bool maximize,
+                                      size_t input_size)
+{
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t gsz = gridDim.x * blockDim.x; // total threads in the launch, used as the loop stride
+
+    for(; gid < input_size; gid += gsz)
+    {
+        CTYPE grad = static_cast(grad_in[gid]);
+
+        AdamInternal(param_in,
+                     param_out,
+                     exp_avg_in,
+                     exp_avg_out,
+                     exp_avg_sq_in,
+                     exp_avg_sq_out,
+                     max_exp_avg_sq_in,
+                     max_exp_avg_sq_out,
+                     grad,
+                     lr,
+                     beta1,
+                     beta2,
+                     weight_decay,
+                     eps,
+                     step,
+                     amsgrad,
+                     maximize,
+                     gid);
+    }
+}
+// AmpAdamInternal: same per-element loop as AdamPacked, but the gradient is read from a T3 array and divided by *grad_scale when grad_scale is non-null; the updated parameter is also stored to param_out_fp16 when that pointer is non-null.
+template
+inline __device__ void AmpAdamInternal(T1* param_in,
+                                       T1* param_out,
+                                       half* param_out_fp16,
+                                       T3* grad_in,
+                                       T1* exp_avg_in,
+                                       T1* exp_avg_out,
+                                       T1* exp_avg_sq_in,
+                                       T1* exp_avg_sq_out,
+                                       T1* max_exp_avg_sq_in,
+                                       T1* max_exp_avg_sq_out,
+                                       int32_t* grad_scale,
+                                       T2 lr,
+                                       T2 beta1,
+                                       T2 beta2,
+                                       T2 weight_decay,
+                                       T2 eps,
+                                       uint32_t step,
+                                       bool amsgrad,
+                                       bool maximize,
+                                       size_t input_size)
+{
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t gsz = gridDim.x * blockDim.x;
+
+    CTYPE scale_factor = (grad_scale) ? static_cast(*grad_scale) : 1.0f; // read once, before the loop
+
+    for(; gid < input_size; gid += gsz)
+    {
+        CTYPE grad = static_cast(grad_in[gid]);
+        if(grad_scale) // undo the gradient scaling only when a scale was supplied
+            grad /= scale_factor;
+
+        AdamInternal(param_in,
+                     param_out,
+                     exp_avg_in,
+                     exp_avg_out,
+                     exp_avg_sq_in,
+                     exp_avg_sq_out,
+                     max_exp_avg_sq_in,
+                     max_exp_avg_sq_out,
+                     grad,
+                     lr,
+                     beta1,
+                     beta2,
+                     weight_decay,
+                     eps,
+                     step,
+                     amsgrad,
+                     maximize,
+                     gid);
+
+        if(param_out_fp16) // optional second copy of the parameter just written to param_out
+            param_out_fp16[gid] = static_cast(param_out[gid]);
+    }
+}
+// AmpAdamSetOutputFromInput: pass-through used instead of the update; copies each input array to its output when the two pointers differ and refreshes param_out_fp16 from param_in when it is non-null.
+template
+inline __device__ void AmpAdamSetOutputFromInput(T1* param_in,
+                                                 T1* param_out,
+                                                 half* param_out_fp16,
+                                                 T1* exp_avg_in,
+                                                 T1* exp_avg_out,
+                                                 T1* exp_avg_sq_in,
+                                                 T1* exp_avg_sq_out,
+                                                 T1* max_exp_avg_sq_in,
+                                                 T1* max_exp_avg_sq_out,
+                                                 bool amsgrad,
+                                                 size_t input_size)
+{
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t gsz = gridDim.x * blockDim.x;
+
+    for(; gid < input_size; gid += gsz)
+    {
+        if(param_in != param_out)
+            param_out[gid] = param_in[gid];
+        if(param_out_fp16)
+            param_out_fp16[gid] = static_cast(param_in[gid]);
+        if(exp_avg_in != exp_avg_out)
+            exp_avg_out[gid] = exp_avg_in[gid];
+        if(exp_avg_sq_in != exp_avg_sq_out)
+            exp_avg_sq_out[gid] = exp_avg_sq_in[gid];
+        if(amsgrad && max_exp_avg_sq_in != max_exp_avg_sq_out) // the max buffers are only touched in amsgrad mode
+            max_exp_avg_sq_out[gid] = max_exp_avg_sq_in[gid];
+    }
+}
+// AmpAdamPackedWithStep: kernel entry taking the step counter by pointer. When found_inf is null or false the update runs with step = *step + 1 (*step itself is not written here); otherwise the state is passed through unchanged.
+extern "C" __global__ void AmpAdamPackedWithStep(PTYPE* param_in,
+                                                 PTYPE* param_out,
+                                                 half* param_out_fp16,
+                                                 GTYPE* grad_in,
+                                                 PTYPE* exp_avg_in,
+                                                 PTYPE* exp_avg_out,
+                                                 PTYPE* exp_avg_sq_in,
+                                                 PTYPE* exp_avg_sq_out,
+                                                 PTYPE* max_exp_avg_sq_in,
+                                                 PTYPE* max_exp_avg_sq_out,
+                                                 int32_t* grad_scale,
+                                                 bool* found_inf,
+                                                 int* step,
+                                                 float lr,
+                                                 float beta1,
+                                                 float beta2,
+                                                 float weight_decay,
+                                                 float eps,
+                                                 bool amsgrad,
+                                                 bool maximize,
+                                                 size_t input_size)
+{
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if(gid >= input_size) // threads past the end exit before touching any buffer
+        return;
+
+    if(found_inf == nullptr || *found_inf == false)
+    {
+        uint32_t step_val = static_cast(*step) + 1;
+
+        AmpAdamInternal(param_in,
+                        param_out,
+                        param_out_fp16,
+                        grad_in,
+                        exp_avg_in,
+                        exp_avg_out,
+                        exp_avg_sq_in,
+                        exp_avg_sq_out,
+                        max_exp_avg_sq_in,
+                        max_exp_avg_sq_out,
+                        grad_scale,
+                        lr,
+                        beta1,
+                        beta2,
+                        weight_decay,
+                        eps,
+                        step_val,
+                        amsgrad,
+                        maximize,
+                        input_size);
+    }
+    else
+    {
+        AmpAdamSetOutputFromInput(param_in,
+                                  param_out,
+                                  param_out_fp16,
+                                  exp_avg_in,
+                                  exp_avg_out,
+                                  exp_avg_sq_in,
+                                  exp_avg_sq_out,
+                                  max_exp_avg_sq_in,
+                                  max_exp_avg_sq_out,
+                                  amsgrad,
+                                  input_size);
+    }
+}
+// AmpAdamPacked: same dispatch as AmpAdamPackedWithStep, except that step is passed by value and forwarded as given (no + 1).
+extern "C" __global__ void AmpAdamPacked(PTYPE* param_in,
+                                         PTYPE* param_out,
+                                         half* param_out_fp16,
+                                         GTYPE* grad_in,
+                                         PTYPE* exp_avg_in,
+                                         PTYPE* exp_avg_out,
+                                         PTYPE* exp_avg_sq_in,
+                                         PTYPE* exp_avg_sq_out,
+                                         PTYPE* max_exp_avg_sq_in,
+                                         PTYPE* max_exp_avg_sq_out,
+                                         int32_t* grad_scale,
+                                         bool* found_inf,
+                                         int step,
+                                         float lr,
+                                         float beta1,
+                                         float beta2,
+                                         float weight_decay,
+                                         float eps,
+                                         bool amsgrad,
+                                         bool maximize,
+                                         size_t input_size)
+{
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if(gid >= input_size)
+        return;
+
+    if(found_inf == nullptr || *found_inf == false)
+    {
+        AmpAdamInternal(param_in,
+                        param_out,
+                        param_out_fp16,
+                        grad_in,
+                        exp_avg_in,
+                        exp_avg_out,
+                        exp_avg_sq_in,
+                        exp_avg_sq_out,
+                        max_exp_avg_sq_in,
+                        max_exp_avg_sq_out,
+                        grad_scale,
+                        lr,
+                        beta1,
+                        beta2,
+                        weight_decay,
+                        eps,
+                        step,
+                        amsgrad,
+                        maximize,
+                        input_size);
+    }
+    else
+    {
+        AmpAdamSetOutputFromInput(param_in,
+                                  param_out,
+                                  param_out_fp16,
+                                  exp_avg_in,
+                                  exp_avg_out,
+                                  exp_avg_sq_in,
+                                  exp_avg_sq_out,
+                                  max_exp_avg_sq_in,
+                                  max_exp_avg_sq_out,
+                                  amsgrad,
+                                  input_size);
+    }
+}
+// AdamUpdateStep: only the thread with global index 0 acts. It stores *step_in + 1 to *step_out, or, when *found_inf is set, copies *step_in unchanged (skipped if both pointers are equal).
+extern "C" __global__ void AdamUpdateStep(bool* found_inf, int* step_in, int* step_out)
+{
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if(gid != 0)
+        return;
+
+    if(found_inf && *found_inf)
+    {
+        if(step_in != step_out)
+            *step_out = *step_in;
+        return;
+    }
+
+    *step_out = *step_in + 1;
+}
diff --git a/src/kernels/MIOpenLayerNorm.cpp
b/src/kernels/MIOpenLayerNorm.cpp index b73632b9c5..2c32ad70a6 100644 --- a/src/kernels/MIOpenLayerNorm.cpp +++ b/src/kernels/MIOpenLayerNorm.cpp @@ -31,23 +31,16 @@ #include "miopen_cstdint.hpp" #include "float_types.h" -#if MIOPEN_USE_BFP16 == 1 -#define CVT_FLOAT2ACCUM(x) (bfloat16_to_float(x)) -#define CVT_ACCUM2FLOAT(x) (float_to_bfloat16(x)) -#define CVT_INTEGRAL2ACCUM(x) ((_FLOAT_ACCUM)(x)) -#define CVT_FP32_2FLOAT(x) (CVT_ACCUM2FLOAT(x)) -#define CVT_FP32_2ACCUM(x) (x) -#endif - -extern "C" __global__ void LayernormFwdContiguous(const FLOAT* __restrict__ x, - FLOAT* __restrict__ y, - const FLOAT* __restrict__ weight, - const FLOAT* __restrict__ bias, - FLOAT_ACCUM* __restrict__ mean, - FLOAT_ACCUM* __restrict__ rstd, - float eps, - uint64_t inner_size, - bool mode) +template +__device__ void layernormfwdcontiguous(const TI* __restrict__ x, + const TI* __restrict__ weight, + const TI* __restrict__ bias, + TO* __restrict__ y, + TO* __restrict__ mean, + TO* __restrict__ rstd, + float eps, + uint64_t inner_size, + bool mode) { /* * Each group works on a single channel. @@ -116,10 +109,379 @@ extern "C" __global__ void LayernormFwdContiguous(const FLOAT* __restrict__ x, FLOAT_ACCUM pweight; FLOAT_ACCUM pbias; - pweight = mode ? CVT_FLOAT2ACCUM(weight[i]) : CVT_FP32_2ACCUM(1.0f); - pbias = mode ? CVT_FLOAT2ACCUM(bias[i]) : static_cast(0); + pweight = (mode == MIOPEN_ELEMENTWISE_AFFINE) ? CVT_FP32_2ACCUM(1.0f) + : CVT_FLOAT2ACCUM(weight[i]); + pbias = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 
static_cast(0) : CVT_FLOAT2ACCUM(bias[i]); FLOAT_ACCUM val = (CVT_FLOAT2ACCUM(x[idx]) - pmean) * prstd * pweight + pbias; y[idx] = CVT_ACCUM2FLOAT(val); } } + +template +__device__ void addlayernormfwdcontiguous(const TI* __restrict__ x, + const TI* __restrict__ x2, + const TI* __restrict__ weight, + const TI* __restrict__ bias, + TO* __restrict__ y, + TO* __restrict__ mean, + TO* __restrict__ rstd, + float eps, + uint64_t inner_size, + bool mode) +{ + const uint64_t gid = blockIdx.x; + const uint64_t lid = threadIdx.x; + + FLOAT_ACCUM pmean = static_cast(0); + FLOAT_ACCUM pvar = static_cast(0); + __shared__ FLOAT_ACCUM ltmp1[LOCAL_SIZE]; + __shared__ FLOAT_ACCUM ltmp2[LOCAL_SIZE]; + + // reduce sum for mean and var + for(uint64_t i = lid; i < inner_size; i += LOCAL_SIZE) + { + size_t x_idx = gid * inner_size + i; + + FLOAT_ACCUM tmp = CVT_FLOAT2ACCUM(x[x_idx]) + CVT_FLOAT2ACCUM(x2[x_idx]); + pmean += tmp; + pvar += tmp * tmp; + } + + ltmp1[lid] = pmean; + ltmp2[lid] = pvar; + __syncthreads(); + for(uint32_t i = LOCAL_SIZE >> 1; i > 0; i >>= 1) + { + if(lid < i) + { + ltmp1[lid] += ltmp1[lid + i]; + ltmp2[lid] += ltmp2[lid + i]; + } + __syncthreads(); + } + pmean = ltmp1[0] / inner_size; + pvar = ltmp2[0] / inner_size - pmean * pmean; + FLOAT_ACCUM prstd = rsqrt(pvar + FLOAT_ACCUM(eps)); + + if(lid == 0) + { + if(mean) + mean[gid] = pmean; + if(rstd) + rstd[gid] = prstd; + } + + // forward calculation + for(uint64_t i = lid; i < inner_size; i += LOCAL_SIZE) + { + size_t idx = gid * inner_size + i; + + FLOAT_ACCUM pweight; + FLOAT_ACCUM pbias; + + pweight = (mode == MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD) ? CVT_FP32_2ACCUM(1.0f) + : CVT_FLOAT2ACCUM(weight[i]); + pbias = (mode == MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD) ? 
static_cast(0) + : CVT_FLOAT2ACCUM(bias[i]); + + FLOAT_ACCUM val = + (CVT_FLOAT2ACCUM(x[idx]) + CVT_FLOAT2ACCUM(x2[idx]) - pmean) * prstd * pweight + pbias; + y[idx] = CVT_ACCUM2FLOAT(val); + } +} + +template +__device__ void t5layernormfwdcontiguous(const TI* __restrict__ x, + const TI* __restrict__ weight, + TO* __restrict__ y, + TO* __restrict__ rstd, + float eps, + uint64_t inner_size, + bool mode) +{ + const uint64_t gid = blockIdx.x; + const uint64_t lid = threadIdx.x; + + FLOAT_ACCUM pvar = static_cast(0); + __shared__ FLOAT_ACCUM ltmp[LOCAL_SIZE]; + + // reduce sum + for(uint64_t i = lid; i < inner_size; i += LOCAL_SIZE) + { + size_t x_idx = gid * inner_size + i; + + FLOAT_ACCUM tmp = CVT_FLOAT2ACCUM(x[x_idx]); + pvar += tmp * tmp; + } + + ltmp[lid] = pvar; + __syncthreads(); + for(uint32_t i = LOCAL_SIZE >> 1; i > 0; i >>= 1) + { + if(lid < i) + { + ltmp[lid] += ltmp[lid + i]; + } + __syncthreads(); + } + pvar = ltmp[0] / inner_size; + FLOAT_ACCUM prstd = rsqrt(pvar + FLOAT_ACCUM(eps)); + + if(lid == 0) + { + if(rstd) + rstd[gid] = prstd; + } + + // forward calculation + for(uint64_t i = lid; i < inner_size; i += LOCAL_SIZE) + { + size_t idx = gid * inner_size + i; + + FLOAT_ACCUM pweight; + + pweight = (mode == MIOPEN_ELEMENTWISE_AFFINE_T5) ? CVT_FP32_2ACCUM(1.0f) + : CVT_FLOAT2ACCUM(weight[i]); + + FLOAT_ACCUM val = (CVT_FLOAT2ACCUM(x[idx])) * prstd * pweight; + y[idx] = CVT_ACCUM2FLOAT(val); + } +} + +template +__device__ void t5layernormbwdcontiguous(const TI* __restrict__ dy, + const TI* __restrict__ x, + const TI* __restrict__ weight, + const TI* __restrict__ rstd, + TO* __restrict__ dx, + uint64_t inner_size, + bool mode) +{ + const uint64_t gid = blockIdx.x; + const uint64_t lid = threadIdx.x; + + __shared__ FLOAT_ACCUM ltmp[LOCAL_SIZE]; + + // reduce sum + FLOAT_ACCUM sum = 0; + + for(uint64_t i = lid; i < inner_size; i += LOCAL_SIZE) + { + size_t x_idx = gid * inner_size + i; + + FLOAT_ACCUM pweight = (mode == MIOPEN_ELEMENTWISE_AFFINE_T5) ? 
CVT_FP32_2ACCUM(1.0f) + : CVT_FLOAT2ACCUM(weight[i]); + + FLOAT_ACCUM pdy = dy ? CVT_FLOAT2ACCUM(dy[x_idx]) : 0; + sum += pdy * CVT_FLOAT2ACCUM(x[x_idx]) * pweight; + } + + ltmp[lid] = sum; + __syncthreads(); + for(uint32_t i = LOCAL_SIZE >> 1; i > 0; i >>= 1) + { + if(lid < i) + { + ltmp[lid] += ltmp[lid + i]; + } + __syncthreads(); + } + + FLOAT_ACCUM ds = ltmp[0]; + FLOAT_ACCUM s = 1.0f / inner_size; + FLOAT_ACCUM prstd = CVT_FLOAT2ACCUM(rstd[gid]); + FLOAT_ACCUM a = ds * prstd * prstd * prstd * s; + + for(uint64_t i = lid; i < inner_size; i += LOCAL_SIZE) + { + size_t idx = gid * inner_size + i; + + FLOAT_ACCUM pweight = (mode == MIOPEN_ELEMENTWISE_AFFINE_T5) ? CVT_FP32_2ACCUM(1.0f) + : CVT_FLOAT2ACCUM(weight[i]); + FLOAT_ACCUM pdy = dy ? CVT_FLOAT2ACCUM(dy[idx]) : 0; + + FLOAT_ACCUM val = prstd * pdy * pweight - a * CVT_FLOAT2ACCUM(x[idx]); + dx[idx] = CVT_ACCUM2FLOAT(val); + } +} + +template +__device__ void t5layernormbwdweightcontiguous(const TI* __restrict__ dy, + const TI* __restrict__ x, + const TI* __restrict__ rstd, + TO* __restrict__ dw, + uint64_t outer_size, + uint64_t inner_size) +{ + const uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + + FLOAT_ACCUM sum = static_cast(0); + for(uint64_t i = 0; i < outer_size; ++i) + { + uint64_t input_idx = i * inner_size + gid; + + FLOAT_ACCUM prstd = CVT_FLOAT2ACCUM(rstd[i]); + FLOAT_ACCUM pdy = dy ? 
CVT_FLOAT2ACCUM(dy[input_idx]) : 0; + + sum += pdy * CVT_FLOAT2ACCUM(x[input_idx]) * prstd; + } + + if(dw) + { + dw[gid] = CVT_ACCUM2FLOAT(sum); + } +} + +template +__device__ void t5layernormbwdweightcontiguousparallel(const TI* __restrict__ dy, + const TI* __restrict__ x, + const TI* __restrict__ rstd, + TO* __restrict__ workspace, + uint64_t outer_size, + uint64_t inner_size, + uint64_t parallel_size) +{ + const uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + + if(gid >= inner_size * parallel_size) + return; + + uint64_t nid = gid % inner_size; + uint64_t pid = gid / inner_size; + + uint64_t input_idx = gid; + + FLOAT_ACCUM sum = static_cast(0); + + if(dy) + { + for(uint64_t i = pid; i < outer_size; i += parallel_size) + { + FLOAT_ACCUM prstd = CVT_FLOAT2ACCUM(rstd[i]); + FLOAT_ACCUM pdy = CVT_FLOAT2ACCUM(dy[input_idx]); + + sum += pdy * CVT_FLOAT2ACCUM(x[input_idx]) * prstd; + input_idx += inner_size * parallel_size; + } + } + + workspace[gid] = CVT_ACCUM2FLOAT(sum); +} + +template +__device__ void t5layernormbwdcontiguousreduceSum(const TI* __restrict__ workspace, + TO* __restrict__ dw, + uint64_t inner_size, + uint64_t parallel_size) +{ + const uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + + if(gid >= inner_size) + return; + + FLOAT_ACCUM sum = static_cast(0); + for(uint64_t i = 0; i < parallel_size; ++i) + { + uint64_t input_idx = i * inner_size + gid; + sum += CVT_FLOAT2ACCUM(workspace[input_idx]); + } + + if(dw) + { + dw[gid] = CVT_ACCUM2FLOAT(sum); + } +} + +extern "C" __global__ void LayernormFwdContiguous(const INPUT_TYPE* __restrict__ x, + const INPUT_TYPE* __restrict__ weight, + const INPUT_TYPE* __restrict__ bias, + OUTPUT_TYPE* __restrict__ y, + OUTPUT_TYPE* __restrict__ mean, + OUTPUT_TYPE* __restrict__ rstd, + float eps, + uint64_t inner_size, + bool mode) +{ + // instantiate the kernel + layernormfwdcontiguous( + x, weight, bias, y, mean, rstd, eps, inner_size, mode); +} + +extern "C" __global__ void 
AddLayernormFwdContiguous(const INPUT_TYPE* __restrict__ x, + const INPUT_TYPE* __restrict__ x2, + const INPUT_TYPE* __restrict__ weight, + const INPUT_TYPE* __restrict__ bias, + OUTPUT_TYPE* __restrict__ y, + OUTPUT_TYPE* __restrict__ mean, + OUTPUT_TYPE* __restrict__ rstd, + float eps, + uint64_t inner_size, + bool mode) +{ + // instantiate the kernel + addlayernormfwdcontiguous( + x, x2, weight, bias, y, mean, rstd, eps, inner_size, mode); +} + +extern "C" __global__ void T5LayernormFwdContiguous(const INPUT_TYPE* __restrict__ x, + const INPUT_TYPE* __restrict__ weight, + OUTPUT_TYPE* __restrict__ y, + OUTPUT_TYPE* __restrict__ rstd, + float eps, + uint64_t inner_size, + bool mode) +{ + // instantiate the kernel + t5layernormfwdcontiguous(x, weight, y, rstd, eps, inner_size, mode); +} + +extern "C" __global__ void T5LayernormBwdContiguous(const INPUT_TYPE* __restrict__ dy, + const INPUT_TYPE* __restrict__ x, + const INPUT_TYPE* __restrict__ weight, + const INPUT_TYPE* __restrict__ rstd, + OUTPUT_TYPE* __restrict__ dx, + uint64_t inner_size, + bool mode) +{ + // instantiate the kernel + t5layernormbwdcontiguous(dy, x, weight, rstd, dx, inner_size, mode); +} + +extern "C" __global__ void T5LayernormBwdWeightContiguous(const INPUT_TYPE* __restrict__ dy, + const INPUT_TYPE* __restrict__ x, + const INPUT_TYPE* __restrict__ rstd, + OUTPUT_TYPE* __restrict__ dw, + uint64_t outer_size, + uint64_t inner_size) +{ + // instantiate the kernel + t5layernormbwdweightcontiguous( + dy, x, rstd, dw, outer_size, inner_size); +} + +extern "C" __global__ void +T5LayernormBwdWeightContiguousParallel(const INPUT_TYPE* __restrict__ dy, + const INPUT_TYPE* __restrict__ x, + const INPUT_TYPE* __restrict__ rstd, + OUTPUT_TYPE* __restrict__ workspace, + uint64_t outer_size, + uint64_t inner_size, + uint64_t parallel_size) +{ + // instantiate the kernel + t5layernormbwdweightcontiguousparallel( + dy, x, rstd, workspace, outer_size, inner_size, parallel_size); +} + +extern "C" __global__ 
void +T5LayernormBwdContiguousReduceSum(const INPUT_TYPE* __restrict__ workspace, + OUTPUT_TYPE* __restrict__ dw, + uint64_t inner_size, + uint64_t parallel_size) +{ + // instantiate the kernel + t5layernormbwdcontiguousreduceSum( + workspace, dw, inner_size, parallel_size); +} diff --git a/src/kernels/MIOpenReduceExtreme.cpp b/src/kernels/MIOpenReduceExtreme.cpp new file mode 100644 index 0000000000..a75e15c2ac --- /dev/null +++ b/src/kernels/MIOpenReduceExtreme.cpp @@ -0,0 +1,72 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include
+#include
+#endif
+
+#include "float_types.h"
+#include "MIOpenReduceExtreme.hpp"
+// extremefwdcontiguous: one thread per output element. Walks reduce_size inputs spaced inner_size apart, folding each into (extreme, extreme_idx) through reduce_func; y is written only when non-null, indice is always written. NOTE(review): template arguments are missing in this copy.
+template
+__device__ void extremefwdcontiguous(const TI* __restrict__ x,
+                                     TO* __restrict__ y,
+                                     int32_t* __restrict__ indice,
+                                     uint64_t output_numel,
+                                     int32_t reduce_size,
+                                     uint64_t inner_size)
+{
+    const uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x;
+    if(gid >= output_numel)
+        return;
+
+    uint64_t input_idx = (gid / inner_size) * inner_size * reduce_size + gid % inner_size; /* first element of this output's reduction run */
+
+    int32_t extreme_idx = 0;
+    FLOAT_ACCUM extreme = CVT_FLOAT2ACCUM(x[input_idx]); /* element 0 is read unconditionally, even if reduce_size < 1 */
+
+    for(int32_t k = 1; k < reduce_size; ++k)
+    {
+        input_idx += inner_size;
+        FLOAT_ACCUM val = CVT_FLOAT2ACCUM(x[input_idx]);
+        reduce_func{}.calculate(extreme, val, extreme_idx, k);
+    }
+    if(y)
+        y[gid] = CVT_ACCUM2FLOAT(extreme);
+    indice[gid] = extreme_idx;
+}
+// ExtremeFwdContiguous: extern "C" kernel entry that forwards its arguments to extremefwdcontiguous.
+extern "C" __global__ void ExtremeFwdContiguous(const INPUT_TYPE* __restrict__ x,
+                                                OUTPUT_TYPE* __restrict__ y,
+                                                int32_t* __restrict__ indice,
+                                                uint64_t output_numel,
+                                                int32_t reduce_size,
+                                                uint64_t inner_size)
+{
+    // instantiate the kernel + extremefwdcontiguous( + x, y, indice, output_numel, reduce_size, inner_size); +} diff --git a/src/kernels/MIOpenReduceExtreme.hpp b/src/kernels/MIOpenReduceExtreme.hpp new file mode 100644 index 0000000000..b53e820475 --- /dev/null +++ b/src/kernels/MIOpenReduceExtreme.hpp @@ -0,0 +1,103 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#ifndef GUARD_KERNELS_MIOPENREDUCEEXTREME_HPP
+#define GUARD_KERNELS_MIOPENREDUCEEXTREME_HPP
+// ReduceExtremeOp_t: selects the reduction; numbering starts at 1, and First_/Last_ alias the first and last enumerators.
+enum class ReduceExtremeOp_t
+{
+    Argmin = 1,
+    Argmax,
+    Min,
+    Max,
+    First_ = Argmin,
+    Last_  = Max,
+};
+// Compiled only when __HIP_DEVICE_COMPILE__ is not defined: the enumerators must equal the MIOPEN_REDUCE_EXTREME_* constants, which are not defined in this header.
+#ifndef __HIP_DEVICE_COMPILE__
+static_assert(MIOPEN_REDUCE_EXTREME_ARGMIN == static_cast(ReduceExtremeOp_t::Argmin));
+static_assert(MIOPEN_REDUCE_EXTREME_ARGMAX == static_cast(ReduceExtremeOp_t::Argmax));
+static_assert(MIOPEN_REDUCE_EXTREME_MIN == static_cast(ReduceExtremeOp_t::Min));
+static_assert(MIOPEN_REDUCE_EXTREME_MAX == static_cast(ReduceExtremeOp_t::Max));
+#endif
+// reduce_func primary template: only declares calculate(a, b, c, d), where a/c are the running value/index and b/d the candidate value/index. NOTE(review): template parameter and specialization argument lists are missing in this copy, so which operation each specialization below belongs to is inferred from its comparison - TODO confirm.
+template
+struct reduce_func
+{
+    inline constexpr void calculate(T1& a, T1 b, T2& c, T2 d) const;
+};
+// Keeps the larger value: the candidate replaces the running pair only when strictly greater, so the first of equal values wins (presumably the Argmax specialization).
+template
+struct reduce_func
+{
+    inline constexpr void calculate(T1& a, T1 b, T2& c, T2 d) const
+    {
+        if(a < b)
+        {
+            a = b;
+            c = d;
+        }
+    }
+};
+// Keeps the smaller value: the candidate replaces the running pair only when strictly smaller (presumably the Argmin specialization).
+template
+struct reduce_func
+{
+    inline constexpr void calculate(T1& a, T1 b, T2& c, T2 d) const
+    {
+        if(a > b)
+        {
+            a = b;
+            c = d;
+        }
+    }
+};
+// Same body as the first specialization: keeps the larger value and its index (presumably the Max specialization).
+template
+struct reduce_func
+{
+    inline constexpr void calculate(T1& a, T1 b, T2& c, T2 d) const
+    {
+        if(a < b)
+        {
+            a = b;
+            c = d;
+        }
+    }
+};
+// Same body as the second specialization: keeps the smaller value and its index (presumably the Min specialization).
+template
+struct reduce_func
+{
+    inline constexpr void calculate(T1& a, T1 b, T2& c, T2 d) const
+    {
+        if(a > b)
+        {
+            a = b;
+            c = d;
+        }
+    }
+};
+#endif // GUARD_KERNELS_MIOPENREDUCEEXTREME_HPP diff --git a/src/kernels/MIOpenSum.cpp b/src/kernels/MIOpenSum.cpp index 6e005d60dc..28a0326873 100644 --- a/src/kernels/MIOpenSum.cpp +++ b/src/kernels/MIOpenSum.cpp @@ -30,14 +30,6 @@ #include "float_types.h" -#if MIOPEN_USE_BFP16 == 1 -#define CVT_FLOAT2ACCUM(x) (bfloat16_to_float(x)) -#define CVT_ACCUM2FLOAT(x) (float_to_bfloat16(x)) -#define CVT_INTEGRAL2ACCUM(x) ((_FLOAT_ACCUM)(x)) -#define CVT_FP32_2FLOAT(x) (CVT_ACCUM2FLOAT(x)) -#define CVT_FP32_2ACCUM(x) (x) -#endif - extern "C" __global__ void SumParallelFwdContiguous(const FLOAT* __restrict__
x, FLOAT* __restrict__ y, uint64_t output_numel, diff --git a/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_fp16_fp16acc_f2x3_c16_stride1.s b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_fp16_fp16acc_f2x3_c16_stride1.s new file mode 100644 index 0000000000..239931f6d5 --- /dev/null +++ b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_fp16_fp16acc_f2x3_c16_stride1.s @@ -0,0 +1,42 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+.include "Conv_Winograd_Fury_v2_4_1_metadata.inc"
+// Assemble-time dispatch for the c16 kernel: on gfx generation 11, minor 0 with stepping 0 or 1, or minor 5 with stepping 1, pulls in the 1536-VGPR body; every other gfx11 target gets the 1024-VGPR body; any other generation stops with .error.
+.if (.amdgcn.gfx_generation_number == 11)
+    .if ((.amdgcn.gfx_generation_minor == 0 && (.amdgcn.gfx_generation_stepping == 0 || .amdgcn.gfx_generation_stepping == 1)) || (.amdgcn.gfx_generation_minor == 5 && .amdgcn.gfx_generation_stepping == 1))
+        // gfx1100, gfx1101, gfx1151 + KERNEL_PROLOG _1536vgprs_fp16_fp16acc_f2x3_c16_stride1 + .include "Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc" + KERNEL_EPILOG _1536vgprs_fp16_fp16acc_f2x3_c16_stride1 + .else + // gfx1102, gfx1103, gfx1150 + KERNEL_PROLOG _1024vgprs_fp16_fp16acc_f2x3_c16_stride1 + .include "Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc" + KERNEL_EPILOG _1024vgprs_fp16_fp16acc_f2x3_c16_stride1 + .endif +.else + .error "Unsupported gfx generation" +.endif diff --git a/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_fp16_fp16acc_f2x3_c32_stride1.s b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_fp16_fp16acc_f2x3_c32_stride1.s new file mode 100644 index 0000000000..0995c95da7 --- /dev/null +++ b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_fp16_fp16acc_f2x3_c32_stride1.s @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+.include "Conv_Winograd_Fury_v2_4_1_metadata.inc"
+// Assemble-time dispatch for the c32 kernel: on gfx generation 11, minor 0 with stepping 0 or 1, or minor 5 with stepping 1, pulls in the 1536-VGPR body; every other gfx11 target and every other generation stops with .error (there is no 1024-VGPR variant here).
+.if (.amdgcn.gfx_generation_number == 11)
+    .if ((.amdgcn.gfx_generation_minor == 0 && (.amdgcn.gfx_generation_stepping == 0 || .amdgcn.gfx_generation_stepping == 1)) || (.amdgcn.gfx_generation_minor == 5 && .amdgcn.gfx_generation_stepping == 1))
+        // gfx1100, gfx1101, gfx1151 + KERNEL_PROLOG _1536vgprs_fp16_fp16acc_f2x3_c32_stride1 + .include "Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc" + KERNEL_EPILOG _1536vgprs_fp16_fp16acc_f2x3_c32_stride1 + .else + // gfx1102, gfx1103, gfx1150 + .error "Unsupported gpu" + .endif +.else + .error "Unsupported gfx generation" +.endif diff --git a/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc new file mode 100644 index 0000000000..724d25137d --- /dev/null +++ b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc @@ -0,0 +1,4346 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +.macro _sop1_lit op:req, sdst:req, lit:req + .long (0b101111101 << 23) | (\sdst << 16) | (\op << 8) | 255 + .long \lit +.endm + +.macro _s_mov_b32__sop1_lit sdst:req, lit:req + _sop1_lit 0, \sdst, \lit +.endm + +.macro _vop1 op:req, vdst:req, src:req + .long (0b0111111 << 25) | (\vdst << 17) | (\op << 9) | \src +.endm + +.macro _v_cvt_f16_i16__vop1 vdst:req, vsrc:req + _vop1 81, \vdst, (\vsrc + /*VGPR*/ 256) +.endm + +.macro _v_rcp_f16__vop1 vdst:req, vsrc:req + _vop1 84, \vdst, (\vsrc + /*VGPR*/ 256) +.endm + +.macro _v_exp_f16__vop1 vdst:req, vsrc:req + _vop1 88, \vdst, (\vsrc + /*VGPR*/ 256) +.endm + +.macro _vop3 op:req, vdst:req, src0:req, src1:req, src2:req, opsel:req, abs:req, neg:req + .long (0b110101 << 26) | (\op << 16) | (\opsel << 11) | (\abs << 8) | \vdst + .long (\neg << 29) | (\src2 << 18) | (\src1 << 9) | \src0 +.endm + +.macro _vop3_lit op:req, vdst:req, src0:req, src1:req, src2:req, lit:req, opsel:req, abs:req, neg:req + .long (0b110101 << 26) | (\op << 16) | (\opsel << 11) | (\abs << 8) | \vdst + .long (\neg << 29) | (\src2 << 18) | (\src1 << 9) | \src0 + .long \lit +.endm + +.macro _v_cvt_f16_i16__vop3 vdst:req, vsrc:req, opsel:req + _vop3 465, \vdst, (\vsrc + /*VGPR*/ 256), 0, 0, \opsel, 0, 0 +.endm + +.macro _v_rcp_f16__vop3 vdst:req, vsrc:req, opsel:req + _vop3 468, \vdst, (\vsrc + /*VGPR*/ 256), 0, 0, \opsel, 0, 0 +.endm + +.macro _v_exp_f16__vop3 vdst:req, vsrc:req, 
opsel:req + _vop3 472, \vdst, (\vsrc + /*VGPR*/ 256), 0, 0, \opsel, 0, 0 +.endm + +.macro _v_cndmask_b16__vop3 vdst:req, vsrc0:req, vsrc1:req, src2:req, opsel:req + _vop3 605, \vdst, (\vsrc0 + /*VGPR*/ 256), (\vsrc1 + /*VGPR*/ 256), \src2, \opsel, 0, 0 +.endm + +.macro _v_cmp_gt_f16__vop3_s_lit sdst:req, ssrc0:req, lit:req, opsel:req, abs:req + _vop3_lit 4, \sdst, \ssrc0, 255, 0, \lit, \opsel, \abs, 0 +.endm + +.macro _v_cmp_gt_f16__vop3_v_lit sdst:req, vsrc0:req, lit:req, opsel:req, abs:req + _vop3_lit 4, \sdst, (\vsrc0 + /*VGPR*/ 256), 255, 0, \lit, \opsel, \abs, 0 +.endm + +.macro _v_cmp_lt_u16__vop3 sdst:req, vsrc0:req, ssrc1:req, opsel:req + _vop3 57, \sdst, (\vsrc0 + /*VGPR*/ 256), \ssrc1, 0, \opsel, 0, 0 +.endm + +.macro _v_cmpx_lt_u32__vop3 sdst:req, vsrc0:req, ssrc1:req + _vop3 201, \sdst, (\vsrc0 + /*VGPR*/ 256), \ssrc1, 0, 0, 0, 0 +.endm + +.macro _vop3p op:req, vdst:req, src0:req, src1:req, src2:req, opsel:req, opsel_hi:req, opsel_hi2:req, neg:req, neg_hi:req + .long (0b11001100 << 24) | (\op << 16) | (\opsel_hi2 << 14) | (\opsel << 11) | (\neg_hi << 8) | \vdst + .long (\neg << 29) | (\opsel_hi << 27) | (\src2 << 18) | (\src1 << 9) | \src0 +.endm + +.macro _vop3p_lit op:req, vdst:req, src0:req, src1:req, src2:req, lit:req, opsel:req, opsel_hi:req, opsel_hi2:req, neg:req, neg_hi:req + .long (0b11001100 << 24) | (\op << 16) | (\opsel_hi2 << 14) | (\opsel << 11) | (\neg_hi << 8) | \vdst + .long (\neg << 29) | (\opsel_hi << 27) | (\src2 << 18) | (\src1 << 9) | \src0 + .long \lit +.endm + +.macro _v_pk_ashrrev_i16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 6, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_add_u16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 10, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_sub_u16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req 
+ _vop3p 11, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_min_u16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 13, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_add_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 15, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm +/* The _lit variants pass 255 as src0 and append the lit argument as a third word through _vop3p_lit; opsel_hi2, neg and neg_hi are fixed at 0. */ +.macro _v_pk_add_f16__vop3p_lit vdst:req, lit:req, src1:req, opsel:req, opsel_hi:req + _vop3p_lit 15, \vdst, 255, \src1, 0, \lit, \opsel, \opsel_hi, 0, 0, 0 +.endm + +.macro _v_pk_mul_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 16, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_mul_f16__vop3p_lit vdst:req, lit:req, src1:req, opsel:req, opsel_hi:req + _vop3p_lit 16, \vdst, 255, \src1, 0, \lit, \opsel, \opsel_hi, 0, 0, 0 +.endm + +.macro _v_pk_min_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 17, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_max_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 18, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm +/* Start of the instruction stream. NOTE(review): branch instructions in this stream take numeric operands and several addresses are formed as s_getpc_b64 plus a constant, so the code looks position-dependent; confirm before inserting or removing any instruction. */ +s_version 0x2006 +s_set_inst_prefetch_distance 0x3 +s_mov_b32 s0, 0 +v_lshlrev_b32 v1, 7, v0 +s_getpc_b64 s[8:9] +s_mov_b32 s10, 0x5ccc +s_mov_b32 s11, 0x31014000 +buffer_load_b32 v2, v1, s[8:11], 0 offen +s_waitcnt vmcnt(0) +s_getpc_b64 s[6:7] +s_load_b512 s[8:23], s[2:3], null +s_load_b512 s[24:39], s[2:3], 0x40 +s_load_b512 s[40:55], s[2:3], 0x80 +s_load_b256 s[56:63], s[2:3], 0xc0 +s_load_b64 s[64:65], s[2:3], 0xe0 /* these five scalar loads fill s[8:65] from consecutive offsets off the address in s[2:3]; presumably the kernel argument block, TODO confirm */ +v_and_b32 v8, 0xff, v0 +v_lshrrev_b32 v9, 1, v8 +v_and_b32 v10, 1, v0 +v_add_nc_u32 v5, v9, 32 +v_bfi_b32 v6, 31, v8, v9 +v_bfe_u32 v4, v8, 5, 1 +v_bfi_b32 v6, 0xbf, v6, v5 +v_and_b32 v2, 31, v8 +v_lshrrev_b32 v6, 5, v6 
+v_lshrrev_b32 v7, 6, v8 +v_lshlrev_b32 v2, 4, v2 +v_and_b32 v3, 31, v9 +v_mad_u32_u24 v2, v4, 0x900, v2 +v_lshlrev_b32 v3, 4, v3 +v_xor_b32 v5, 3, v6 +v_mad_u32_u16 v3, 0x480, v7, v3 op_sel:[0,0,0,0] +v_mad_u32_u24 v1, v5, 0x240, v2 +v_mad_u32_u16 v3, 0x1240, v10, v3 op_sel:[0,0,0,0] +v_mad_u32_u24 v2, v6, 0x240, v2 +s_waitcnt expcnt(0) lgkmcnt(0) vmcnt(0) +s_bitcmp1_b32 s14, 6 +s_cbranch_scc0 14 /* taken when bit 6 of s14 is clear; presumably skips the loads below, which replace s[16:17], s[20:21], s[18:19] and, when non-zero, s[60:61] and s[30:31] with the 64-bit values stored at those addresses */ +s_load_b64 s[16:17], s[16:17], null +s_load_b64 s[20:21], s[20:21], null +s_load_b64 s[18:19], s[18:19], null +s_cmp_eq_u64 0, s[60:61] +s_cbranch_scc1 2 +s_load_b64 s[60:61], s[60:61], null +s_cmp_eq_u64 0, s[30:31] +s_cbranch_scc1 2 +s_load_b64 s[30:31], s[30:31], null +s_bitcmp1_b32 s14, 3 +s_cbranch_scc0 2 +s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xf0 +s_cmp_eq_u32 s13, 0x60 +s_cbranch_scc0 16 +s_mul_i32 s1, s4, 0xab +s_lshr_b32 s1, s1, 10 +s_mul_i32 s23, s1, 6 +s_sub_u32 s23, s4, s23 /* s1 = (s4 * 0xab) >> 10 and s23 = s4 - 6 * s1; looks like a multiply-shift estimate of s4 / 6 with its remainder, TODO confirm the valid range of s4 */ +s_bfe_u32 s15, s1, 0x20000 +s_bfe_u32 s22, s1, 0x10002 +s_bfe_u32 s5, s1, 0x10003 +s_mov_b32 s45, s23 +s_lshl1_add_u32 s45, s45, s22 +s_lshl2_add_u32 s45, s45, s15 +s_lshl1_add_u32 s45, s45, s5 +s_mov_b32 s4, s45 +s_waitcnt expcnt(0) lgkmcnt(0) vmcnt(0) +s_bitcmp1_b32 s14, 13 +s_cbranch_scc0 10 +s_add_u32 s16, s16, s34 +s_addc_u32 s17, s17, s35 +s_add_u32 s20, s20, s38 +s_addc_u32 s21, s21, s39 +s_add_u32 s18, s18, s36 +s_addc_u32 s19, s19, s37 +s_cmp_eq_u64 0, s[30:31] +s_cselect_b64 s[40:41], 0, s[40:41] +s_add_u32 s30, s30, s40 +s_addc_u32 s31, s31, s41 +s_add_u32 s15, s12, 15 +s_lshr_b32 s15, s15, 4 /* s15 = (s12 + 15) >> 4 */ +v_cvt_f32_u32 v4, s15 +v_rcp_f32 v4, v4 +v_mul_f32 v4, 0x47800000, v4 +v_cvt_floor_i32_f32 v4, v4 +v_mad_u32_u24 v5, v4, s13, s13 +v_lshrrev_b32 v5, 16, v5 /* v5 = (floor(65536.0 * rcp(float(s15))) * s13 + s13) >> 16; 0x47800000 is 65536.0 as an IEEE-754 single */ +v_cvt_f32_u32 v4, v5 +v_rcp_f32 v4, v4 +v_mul_f32 v4, 0x47800000, v4 +v_cvt_floor_i32_f32 v4, v4 +v_mad_u32_u24 v6, v4, s4, s4 +v_lshrrev_b32 v6, 16, v6 +v_readfirstlane_b32 s1, v5 +v_readfirstlane_b32 s22, v6 +s_mul_i32 s5, s22, s1 +s_sub_u32 s5, s4, s5 +s_cmp_ge_u32 s22, s15 +s_cbranch_scc1 5809 /* taken when s22 >= s15 */ +s_mul_i32 s13, s1, s15 
+s_mul_i32 s23, s22, 16 +s_sub_u32 s12, s12, s23 +s_min_u32 s12, s12, 16 /* s23 = 16 * s22; s12 = min(s12 - s23, 16) */ +s_mul_i32 s34, s23, s46 +s_mul_hi_u32 s35, s23, s46 +s_lshl_b64 s[34:35], s[34:35], 1 +s_add_u32 s18, s34, s18 +s_addc_u32 s19, s35, s19 /* s[18:19] += (s23 * s46) << 1, formed from the full 64-bit product */ +s_lshr_b32 s35, s23, 0 +s_mul_i32 s34, s35, s51 +s_mul_hi_u32 s35, s35, s51 +s_lshl_b64 s[34:35], s[34:35], 1 +s_add_u32 s20, s34, s20 +s_addc_u32 s21, s35, s21 +s_lshl_b32 s34, s23, 1 +s_cmp_eq_u64 s[30:31], 0 +s_cselect_b32 s34, 0, s34 +s_add_u32 s30, s30, s34 +s_addc_u32 s31, s31, 0 /* s[30:31] += 2 * s23 unless s[30:31] is zero */ +v_cmp_lt_u32 vcc, v0, 0x100 +s_cbranch_vccz 2749 /* taken when no lane satisfies v0 < 0x100 */ +v_and_b32 v20, 0xff, v0 +v_lshrrev_b32 v21, 1, v20 +v_bfe_u32 v17, v20, 3, 1 +v_bfe_u32 v16, v20, 2, 1 +v_mad_u32_u16 v17, v17, 16, 0 op_sel:[0,0,0,0] +v_mad_u32_u16 v14, v16, 0x1240, v17 op_sel:[0,0,0,0] +v_bfe_u32 v16, v20, 0, 2 +v_mad_u32_u16 v14, v16, 0x90, v14 op_sel:[0,0,0,0] +v_bfe_u32 v17, v20, 4, 2 +v_mad_u32_u16 v14, v17, 32, v14 op_sel:[0,0,0,0] +v_bfe_u32 v16, v20, 6, 1 +v_mad_u32_u16 v14, v16, 0x480, v14 op_sel:[0,0,0,0] +v_bfe_u32 v16, v20, 7, 1 +v_mad_u32_u16 v14, v16, 0x900, v14 op_sel:[0,0,0,0] +v_bfe_u32 v18, v20, 1, 2 +v_mad_u32_u16 v13, v18, 32, 0 op_sel:[0,0,0,0] +v_bfe_u32 v19, v20, 3, 1 +v_mad_u32_u16 v13, v19, 0x480, v13 op_sel:[0,0,0,0] +v_add_nc_u32 v18, v21, 32 +v_bfi_b32 v18, 0xbf, v20, v18 +v_bfe_u32 v18, v18, 6, 2 +v_mad_u32_u16 v13, v18, 0x90, v13 op_sel:[0,0,0,0] +v_xor_b32 v16, v0, v0 quad_perm:[2,3,2,1] +v_xor_b32 v17, v0, v0 quad_perm:[0,0,3,3] +v_sub_nc_u16 v16, v16, v17 op_sel:[0,0,0] +v_cvt_f16_i16 v15, v16 +_v_cvt_f16_i16__vop1 (15 | /*op_sel*/ 0x80), 17 +_v_pk_mul_f16__vop3p 15, 271, 240, 0x0, 0x1, 0x0, 0x0 /* emitted as raw words by the _v_pk_mul_f16__vop3p macro: vdst 15, src0 271, src1 240; 271 is presumably 256 + 15 under the VGPR offset convention, TODO confirm what operand code 240 denotes */ +v_bfe_u32 v16, v0, 6, 1 +v_and_b32 v5, 63, v0 +v_cmp_eq_u32 vcc, v16, 1 +v_cndmask_b32 v16, 0, 0x400, vcc +v_cndmask_b32 v17, 0, 0x100, vcc +v_lshl_add_u32 v6, v5, 2, 0 +v_lshl_add_u32 v5, v5, 4, v16 +s_mov_b32 s23, 4 +s_mov_b32 s34, 0 +s_mov_b32 s40, 0xbc00c000 +v_readfirstlane_b32 s74, v0 +s_and_b32 null, 64, s74 +s_cmov_b32 s40, 0x3c00c000 +s_lshl_b32 s49, s43, 1 +s_lshl_b32 s53, 
s47, 1 +s_lshl_b32 s75, s49, 3 +s_lshl_b32 s76, s53, 3 +s_and_b32 null, 0x80, s74 +s_cselect_b32 s75, s75, 0 +s_cselect_b32 s76, s76, 0 +s_cselect_b32 s22, 8, 0 +s_sub_u32 s22, s9, s22 +s_cmov_b32 s22, 0 +s_mov_b32 s35, 0x11014000 +s_bitcmp1_b32 s14, 4 +s_cselect_b32 s77, 0, 0x8000000 +s_and_b32 s35, 0xf7ffffff, s35 +s_or_b32 s35, s35, s77 +s_and_b32 s17, s17, 0xffff +s_add_u32 s17, s17, 0x20000 +s_and_b32 s19, s19, 0xffff +s_add_u32 s19, s19, 0x20000 +s_add_u32 s16, s16, s75 +s_addc_u32 s17, s17, 0 +s_add_u32 s18, s18, s76 +s_addc_u32 s19, s19, 0 +s_mov_b64 s[36:37], s[16:17] +s_mov_b32 s38, 0x80000000 +s_mov_b32 s39, 0 /* s[36:39] = { s16, s17, 0x80000000, 0 }; s[36:39] is the resource operand of the buffer_load_d16 instructions that follow */ +s_getpc_b64 s[64:65] +v_cmp_lt_u32 vcc, v0, 0x80 +s_cmp_gt_u32 vcc_lo, 0 +s_mov_b32 s74, 0x23d8 +s_mov_b32 s76, 0x1a58 +s_cmov_b32 s74, 0x1e98 +s_cmov_b32 s76, 0x1618 +s_mov_b32 s75, 0x2654 +s_mov_b32 s77, 0x1c54 +s_cmov_b32 s75, 0x2114 +s_cmov_b32 s77, 0x1814 +s_add_u32 s66, s64, s74 +s_addc_u32 s67, s65, 0 +s_add_u32 s70, s64, s76 +s_addc_u32 s71, s65, 0 +s_add_u32 s68, s64, s75 +s_addc_u32 s69, s65, 0 +s_add_u32 s72, s64, s77 +s_addc_u32 s73, s65, 0 /* s[66:67], s[70:71], s[68:69] and s[72:73] = PC from s_getpc_b64 plus a hard-coded constant; each s_cmov substitutes the second constant when vcc_lo is non-zero. NOTE(review): these constants presumably depend on the exact code layout, confirm before editing */ +s_mov_b32 s45, 0 +v_mov_b32 v4, 0 +s_mov_b32 s56, 0x190 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 /* s[64:65] = s[66:67] if bit 3 of s45 is set, else s[70:71]; s56 is kept if bit 2 of s45 is set, else becomes 0x2b8 */ +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_add_f16__vop3p 160, 272, 273, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 161, 308, 341, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 162, 360, 377, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 163, 396, 397, 0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v164, v16, s40, v34 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v165, v52, s40, v86 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v166, v104, s40, v122 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v167, v140, s40, v142 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v121, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v104, v10, s[36:39], 0 idxen +s_sub_u32 
s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v121, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v104, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v141, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v140, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v141, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v140, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] /* transfers to the address selected into s[64:65]; s_swappc_b64 also saves the return PC in s[64:65] */ +s_branch 5561 /* NOTE(review): numeric branch operand, presumably a relative offset that depends on the exact instruction count; confirm before inserting or removing code */ +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x12c /* s[64:65] = s[66:67] if bit 3 of s45 is set, else s[70:71]; s56 is kept if bit 2 of s45 is set, else becomes 0x12c */ +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_mul_f16__vop3p 160, 273, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 341, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 377, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 397, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v17, v160 quad_perm:[1,0,3,2] +v_mov_b32 v85, v161 quad_perm:[1,0,3,2] +v_mov_b32 v121, v162 quad_perm:[1,0,3,2] +v_mov_b32 v141, v163 quad_perm:[1,0,3,2] /* quad_perm:[1,0,3,2]: each lane reads the value of its pair partner within the quad (0<->1, 2<->3) */ +v_pk_fma_f16 v160, v17, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v85, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v121, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v141, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v17, v160 quad_perm:[2,3,0,1] +v_mov_b32 v85, v161 quad_perm:[2,3,0,1] +v_mov_b32 v121, v162 
quad_perm:[2,3,0,1] +v_mov_b32 v141, v163 quad_perm:[2,3,0,1] /* quad_perm:[2,3,0,1]: each lane reads from the lane two positions away within the quad (0<->2, 1<->3) */ +v_pk_fma_f16 v160, v17, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v85, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v121, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v141, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v121, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v104, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v121, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v104, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v141, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v140, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v141, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v140, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5462 /* NOTE(review): numeric branch operand, presumably a relative offset that depends on the exact instruction count; confirm before inserting or removing code */ +s_mov_b32 s56, 0x18c +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 /* s[64:65] = s[68:69] if bit 4 of s45 is set, else s[72:73]; s56 is kept if bit 2 of s45 is set, else becomes 0x2b8 */ +s_setprio 2 +v_pk_fma_f16 v160, v34, s40, v164 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v161, v86, s40, v165 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v162, v122, s40, v166 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v163, v142, s40, v167 op_sel:[0,0,0] 
op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 164, 290, 291, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 165, 342, 343, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 166, 378, 379, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 167, 398, 399, 0x0, 0x3, 0x1, 0x1 +buffer_load_d16_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_hi_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v86, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v87, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v86, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v87, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5388 /* NOTE(review): numeric branch operand, presumably a relative offset that depends on the exact instruction count; confirm before inserting or removing code */ +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x130 /* s[64:65] = s[68:69] if bit 4 of s45 is set, else s[72:73]; s56 is kept if bit 2 of s45 is set, else becomes 0x130 */ +s_setprio 2 +_v_pk_mul_f16__vop3p 160, 290, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 342, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 378, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 398, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v34, v160 quad_perm:[1,0,3,2] +v_mov_b32 v86, v161 quad_perm:[1,0,3,2] +v_mov_b32 v122, v162 quad_perm:[1,0,3,2] 
+v_mov_b32 v142, v163 quad_perm:[1,0,3,2] /* quad_perm:[1,0,3,2]: each lane reads the value of its pair partner within the quad (0<->1, 2<->3) */ +v_pk_fma_f16 v160, v34, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v86, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v122, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v142, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v34, v160 quad_perm:[2,3,0,1] +v_mov_b32 v86, v161 quad_perm:[2,3,0,1] +v_mov_b32 v122, v162 quad_perm:[2,3,0,1] +v_mov_b32 v142, v163 quad_perm:[2,3,0,1] /* quad_perm:[2,3,0,1]: each lane reads from the lane two positions away within the quad (0<->2, 1<->3) */ +v_pk_fma_f16 v160, v34, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v86, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v122, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v142, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_hi_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v86, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v87, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v86, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v87, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] /* transfers to the address selected into s[64:65]; s_swappc_b64 also saves the return PC in s[64:65] */ +s_branch 
5290 +s_mov_b32 s56, 0x190 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 /* s[64:65] = s[66:67] if bit 3 of s45 is set, else s[70:71]; s56 is kept if bit 2 of s45 is set, else becomes 0x2b8 */ +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_add_f16__vop3p 160, 403, 402, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 161, 407, 406, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 162, 411, 410, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 163, 415, 414, 0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v164, v147, s40, v144 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v165, v151, s40, v148 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v166, v155, s40, v152 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v167, v159, s40, v156 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v154, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v155, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_hi_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v154, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v155, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v158, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v159, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v158, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v159, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5215 /* NOTE(review): numeric branch operand, presumably a relative offset that depends on the exact instruction count; confirm before inserting or removing code */ +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 3 
+s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x12c /* s56 is kept if bit 2 of s45 is set, else becomes 0x12c */ +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_mul_f16__vop3p 160, 402, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 406, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 410, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 414, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v146, v160 quad_perm:[1,0,3,2] +v_mov_b32 v150, v161 quad_perm:[1,0,3,2] +v_mov_b32 v154, v162 quad_perm:[1,0,3,2] +v_mov_b32 v158, v163 quad_perm:[1,0,3,2] /* quad_perm:[1,0,3,2]: each lane reads the value of its pair partner within the quad (0<->1, 2<->3) */ +v_pk_fma_f16 v160, v146, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v150, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v154, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v158, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v146, v160 quad_perm:[2,3,0,1] +v_mov_b32 v150, v161 quad_perm:[2,3,0,1] +v_mov_b32 v154, v162 quad_perm:[2,3,0,1] +v_mov_b32 v158, v163 quad_perm:[2,3,0,1] /* quad_perm:[2,3,0,1]: each lane reads from the lane two positions away within the quad (0<->2, 1<->3) */ +v_pk_fma_f16 v160, v146, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v150, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v154, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v158, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v154, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v155, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_hi_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v154, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v155, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v158, v8, s[36:39], 0 idxen 
+buffer_load_d16_b16 v159, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_hi_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v158, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v159, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5116 /* NOTE(review): numeric branch operand, presumably a relative offset that depends on the exact instruction count; confirm before inserting or removing code */ +s_mov_b32 s56, 0x18c +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 /* s[64:65] = s[68:69] if bit 4 of s45 is set, else s[72:73]; s56 is kept if bit 2 of s45 is set, else becomes 0x2b8 */ +s_setprio 2 +v_pk_fma_f16 v160, v144, s40, v164 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v161, v148, s40, v165 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v162, v152, s40, v166 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v163, v156, s40, v167 op_sel:[0,0,0] op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 164, 400, 401, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 165, 404, 405, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 166, 408, 409, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 167, 412, 413, 0x0, 0x3, 0x1, 0x1 +buffer_load_d16_b16 v144, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v145, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v144, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v145, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v148, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v149, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 
s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v148, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v149, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step after a load group: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +s_swappc_b64 s[64:65], s[64:65] /* transfers to the address selected into s[64:65]; s_swappc_b64 also saves the return PC in s[64:65] */ +s_branch 5042 /* NOTE(review): numeric branch operand, presumably a relative offset that depends on the exact instruction count; confirm before inserting or removing code */ +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x130 /* s[64:65] = s[68:69] if bit 4 of s45 is set, else s[72:73]; s56 is kept if bit 2 of s45 is set, else becomes 0x130 */ +s_setprio 2 +_v_pk_mul_f16__vop3p 160, 400, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 404, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 408, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 412, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v144, v160 quad_perm:[1,0,3,2] +v_mov_b32 v148, v161 quad_perm:[1,0,3,2] +v_mov_b32 v152, v162 quad_perm:[1,0,3,2] +v_mov_b32 v156, v163 quad_perm:[1,0,3,2] /* quad_perm:[1,0,3,2]: each lane reads the value of its pair partner within the quad (0<->1, 2<->3) */ +v_pk_fma_f16 v160, v144, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v148, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v152, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v156, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v144, v160 quad_perm:[2,3,0,1] +v_mov_b32 v148, v161 quad_perm:[2,3,0,1] +v_mov_b32 v152, v162 quad_perm:[2,3,0,1] +v_mov_b32 v156, v163 quad_perm:[2,3,0,1] /* quad_perm:[2,3,0,1]: each lane reads from the lane two positions away within the quad (0<->2, 1<->3) */ +v_pk_fma_f16 v160, v144, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v148, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v152, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v156, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v144, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v145, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 
+buffer_load_d16_hi_b16 v144, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v145, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_b16 v148, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v149, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v148, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v149, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4944 /* NOTE(review): numeric branch operand, presumably a relative offset that depends on the exact instruction count; confirm before inserting or removing code */ +s_mov_b32 s56, 0x190 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 /* s[64:65] = s[66:67] if bit 3 of s45 is set, else s[70:71]; s56 is kept if bit 2 of s45 is set, else becomes 0x2b8 */ +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_add_f16__vop3p 160, 272, 273, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 161, 308, 341, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 162, 291, 290, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 163, 343, 342, 0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v164, v16, s40, v121 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v165, v52, s40, v141 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v166, v35, s40, v122 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v167, v87, s40, v142 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 
v16, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v86, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v87, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v86, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v87, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4869 /* NOTE(review): numeric branch operand, presumably a relative offset that depends on the exact instruction count; confirm before inserting or removing code */ +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x12c /* s[64:65] = s[66:67] if bit 3 of s45 is set, else s[70:71]; s56 is kept if bit 2 of s45 is set, else becomes 0x12c */ +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_mul_f16__vop3p 160, 273, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 341, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 290, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 342, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v17, v160 quad_perm:[1,0,3,2] +v_mov_b32 v85, v161 quad_perm:[1,0,3,2] +v_mov_b32 v34, v162 quad_perm:[1,0,3,2] +v_mov_b32 v86, v163 quad_perm:[1,0,3,2] /* quad_perm:[1,0,3,2]: each lane reads the value of its pair partner within the quad (0<->1, 2<->3) */ +v_pk_fma_f16 v160, v17, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v85, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v34, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v86, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v17, v160 quad_perm:[2,3,0,1] +v_mov_b32 v85, v161 quad_perm:[2,3,0,1] +v_mov_b32 v34, v162 quad_perm:[2,3,0,1] +v_mov_b32 v86, v163 quad_perm:[2,3,0,1] /* quad_perm:[2,3,0,1]: each lane reads from the lane two positions away within the quad (0<->2, 1<->3) */ +v_pk_fma_f16 v160, v17, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v85, v15, v161 op_sel:[0,1,0] 
op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v34, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v86, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v86, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v87, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v86, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v87, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4770 /* NOTE(review): numeric branch operand, presumably a relative offset that depends on the exact instruction count; confirm before inserting or removing code */ +s_mov_b32 s56, 0x18c +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 /* s[64:65] = s[68:69] if bit 4 of s45 is set, else s[72:73]; s56 is kept if bit 2 of s45 is set, else becomes 0x2b8 */ +s_setprio 2 +v_pk_fma_f16 v160, v121, s40, v164 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v161, v141, s40, v165 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v162, v122, s40, v166 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v163, v142, s40, v167 op_sel:[0,0,0] op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 164, 377, 360, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 165, 397, 396, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 166, 378, 379, 0x0, 0x3, 0x1, 0x1 
+_v_pk_add_f16__vop3p 167, 398, 399, 0x0, 0x3, 0x1, 0x1 +buffer_load_d16_b16 v121, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v104, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_hi_b16 v121, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v104, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v141, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v140, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v141, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v140, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4696 /* NOTE(review): numeric branch operand, presumably a relative offset that depends on the exact instruction count; confirm before inserting or removing code */ +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x130 /* s[64:65] = s[68:69] if bit 4 of s45 is set, else s[72:73]; s56 is kept if bit 2 of s45 is set, else becomes 0x130 */ +s_setprio 2 +_v_pk_mul_f16__vop3p 160, 377, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 397, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 378, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 398, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v121, v160 quad_perm:[1,0,3,2] +v_mov_b32 v141, v161 quad_perm:[1,0,3,2] +v_mov_b32 v122, v162 quad_perm:[1,0,3,2] +v_mov_b32 v142, v163 quad_perm:[1,0,3,2] /* quad_perm:[1,0,3,2]: each lane reads the value of its pair partner within the quad (0<->1, 2<->3) */ +v_pk_fma_f16 v160, v121, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v141, v15, v161 op_sel:[0,1,0] 
op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v122, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v142, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v121, v160 quad_perm:[2,3,0,1] +v_mov_b32 v141, v161 quad_perm:[2,3,0,1] +v_mov_b32 v122, v162 quad_perm:[2,3,0,1] +v_mov_b32 v142, v163 quad_perm:[2,3,0,1] /* quad_perm:[2,3,0,1]: each lane reads from the lane two positions away within the quad (0<->2, 1<->3) */ +v_pk_fma_f16 v160, v121, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v141, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v122, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v142, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v121, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v104, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 /* step between load groups: s15 -= 1, s_cselect on the SCC from that subtract either zeroes s39 or keeps it, then s[36:37] += s59 with carry */ +buffer_load_d16_hi_b16 v121, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v104, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v141, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v140, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v141, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v140, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4598 /* NOTE(review): numeric branch operand, presumably a relative offset that depends on the exact instruction count; confirm before inserting or removing code */ +s_mov_b32 s56, 0x190 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 /* s[64:65] = s[66:67] if bit 3 of s45 is set, else s[70:71]; s56 is kept if bit 2 of s45 is set, else becomes 0x2b8 */ 
+s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_add_f16__vop3p 160, 403, 402, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 161, 407, 406, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 162, 401, 400, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 163, 405, 404, 0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v164, v147, s40, v154 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v165, v151, s40, v158 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v166, v145, s40, v152 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v167, v149, s40, v156 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v144, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v145, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v144, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v145, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v148, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v149, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v148, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v149, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4523 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x12c +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_mul_f16__vop3p 160, 
402, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 406, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 400, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 404, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v146, v160 quad_perm:[1,0,3,2] +v_mov_b32 v150, v161 quad_perm:[1,0,3,2] +v_mov_b32 v144, v162 quad_perm:[1,0,3,2] +v_mov_b32 v148, v163 quad_perm:[1,0,3,2] +v_pk_fma_f16 v160, v146, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v150, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v144, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v148, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v146, v160 quad_perm:[2,3,0,1] +v_mov_b32 v150, v161 quad_perm:[2,3,0,1] +v_mov_b32 v144, v162 quad_perm:[2,3,0,1] +v_mov_b32 v148, v163 quad_perm:[2,3,0,1] +v_pk_fma_f16 v160, v146, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v150, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v144, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v148, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v144, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v145, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v144, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v145, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v148, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v149, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 
+buffer_load_d16_hi_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v148, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v149, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4424 +s_mov_b32 s56, 0xffffebec +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0xffffed18 +s_setprio 2 +v_pk_fma_f16 v160, v154, s40, v164 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v161, v158, s40, v165 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v162, v152, s40, v166 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v163, v156, s40, v167 op_sel:[0,0,0] op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 164, 410, 411, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 165, 414, 415, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 166, 408, 409, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 167, 412, 413, 0x0, 0x3, 0x1, 0x1 +buffer_load_d16_b16 v154, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v155, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v154, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v155, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v158, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v159, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v158, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 
v159, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4350 +s_mov_b32 s56, 0xffffea64 +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0xffffeb90 +s_setprio 2 +_v_pk_mul_f16__vop3p 160, 410, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 414, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 408, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 412, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v154, v160 quad_perm:[1,0,3,2] +v_mov_b32 v158, v161 quad_perm:[1,0,3,2] +v_mov_b32 v152, v162 quad_perm:[1,0,3,2] +v_mov_b32 v156, v163 quad_perm:[1,0,3,2] +v_pk_fma_f16 v160, v154, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v158, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v152, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v156, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v154, v160 quad_perm:[2,3,0,1] +v_mov_b32 v158, v161 quad_perm:[2,3,0,1] +v_mov_b32 v152, v162 quad_perm:[2,3,0,1] +v_mov_b32 v156, v163 quad_perm:[2,3,0,1] +v_pk_fma_f16 v160, v154, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v158, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v152, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v156, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v154, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v155, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v154, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v155, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v152, v8, s[36:39], 
0 idxen +buffer_load_d16_hi_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v158, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v159, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v158, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v159, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4252 +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +v_readfirstlane_b32 s41, v4 +v_mov_b32 v69, v36 +v_mov_b32 v70, v37 +v_mov_b32 v71, v38 +v_mov_b32 v72, v39 +v_mov_b32 v73, v40 +v_mov_b32 v74, v41 +v_mov_b32 v75, v42 +v_mov_b32 v76, v43 +_v_pk_add_f16__vop3p 88, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 89, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 90, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 91, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 92, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 93, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 94, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 95, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 88, 344, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 89, 345, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 90, 346, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 91, 347, 240, 0x0, 0x1, 0x0, 0x0 
+_v_pk_mul_f16__vop3p 92, 348, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 93, 349, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 94, 350, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 95, 351, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v88, v44, 0.5, v88 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v89, v45, 0.5, v89 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v90, v46, 0.5, v90 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v91, v47, 0.5, v91 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v92, v48, 0.5, v92 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v93, v49, 0.5, v93 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v94, v50, 0.5, v94 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v95, v51, 0.5, v95 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v105, v44, -1.0, v88 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v106, v45, -1.0, v89 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v107, v46, -1.0, v90 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v108, v47, -1.0, v91 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v109, v48, -1.0, v92 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v110, v49, -1.0, v93 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v111, v50, -1.0, v94 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v112, v51, -1.0, v95 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v124, v61 +v_mov_b32 v125, v62 +v_mov_b32 v126, v63 +v_mov_b32 v127, v64 +v_mov_b32 v128, v65 +v_mov_b32 v129, v66 +v_mov_b32 v130, v67 +v_mov_b32 v131, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v1, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 
+ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v77, v36 +v_mov_b32 v78, v37 +v_mov_b32 v79, v38 +v_mov_b32 v80, v39 +v_mov_b32 v81, v40 +v_mov_b32 v82, v41 +v_mov_b32 v83, v42 +v_mov_b32 v84, v43 +_v_pk_add_f16__vop3p 96, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 97, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 98, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 99, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 100, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 101, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 102, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 103, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 96, 352, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 97, 353, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 98, 354, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 99, 355, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 100, 356, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 101, 357, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 102, 358, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 103, 359, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v96, v44, 0.5, v96 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v97, v45, 0.5, v97 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v98, v46, 0.5, v98 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v99, v47, 0.5, v99 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v100, v48, 0.5, v100 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v101, v49, 0.5, v101 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v102, 
v50, 0.5, v102 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v103, v51, 0.5, v103 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v113, v44, -1.0, v96 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v114, v45, -1.0, v97 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v115, v46, -1.0, v98 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v116, v47, -1.0, v99 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v117, v48, -1.0, v100 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v118, v49, -1.0, v101 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v119, v50, -1.0, v102 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v120, v51, -1.0, v103 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v132, v61 +v_mov_b32 v133, v62 +v_mov_b32 v134, v63 +v_mov_b32 v135, v64 +v_mov_b32 v136, v65 +v_mov_b32 v137, v66 +v_mov_b32 v138, v67 +v_mov_b32 v139, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v2, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 +s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, 
v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s41, v4 +v_mov_b32 v69, v36 +v_mov_b32 v70, v37 +v_mov_b32 v71, v38 +v_mov_b32 v72, v39 +v_mov_b32 v73, v40 +v_mov_b32 v74, v41 +v_mov_b32 v75, v42 +v_mov_b32 v76, v43 +_v_pk_add_f16__vop3p 88, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 89, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 90, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 91, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 92, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 93, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 94, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 95, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 88, 344, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 89, 345, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 90, 346, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 91, 347, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 92, 348, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 93, 349, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 94, 350, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 95, 351, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v88, v44, 0.5, v88 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v89, v45, 0.5, v89 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v90, v46, 0.5, v90 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v91, v47, 0.5, v91 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v92, v48, 0.5, v92 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v93, v49, 0.5, v93 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v94, v50, 0.5, v94 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v95, v51, 0.5, v95 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v105, v44, -1.0, v88 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v106, v45, -1.0, v89 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 
v107, v46, -1.0, v90 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v108, v47, -1.0, v91 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v109, v48, -1.0, v92 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v110, v49, -1.0, v93 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v111, v50, -1.0, v94 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v112, v51, -1.0, v95 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v124, v61 +v_mov_b32 v125, v62 +v_mov_b32 v126, v63 +v_mov_b32 v127, v64 +v_mov_b32 v128, v65 +v_mov_b32 v129, v66 +v_mov_b32 v130, v67 +v_mov_b32 v131, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v1, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +v_mov_b32 v77, v36 +v_mov_b32 v78, v37 +v_mov_b32 v79, v38 +v_mov_b32 v80, v39 +v_mov_b32 v81, v40 +v_mov_b32 v82, v41 +v_mov_b32 v83, v42 +v_mov_b32 v84, v43 +_v_pk_add_f16__vop3p 96, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 97, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 98, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 99, 295, 320, 0x0, 
0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 100, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 101, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 102, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 103, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 96, 352, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 97, 353, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 98, 354, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 99, 355, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 100, 356, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 101, 357, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 102, 358, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 103, 359, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v96, v44, 0.5, v96 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v97, v45, 0.5, v97 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v98, v46, 0.5, v98 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v99, v47, 0.5, v99 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v100, v48, 0.5, v100 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v101, v49, 0.5, v101 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v102, v50, 0.5, v102 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v103, v51, 0.5, v103 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v113, v44, -1.0, v96 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v114, v45, -1.0, v97 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v115, v46, -1.0, v98 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v116, v47, -1.0, v99 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v117, v48, -1.0, v100 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v118, v49, -1.0, v101 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v119, v50, -1.0, v102 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v120, v51, -1.0, v103 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v132, v61 +v_mov_b32 v133, v62 +v_mov_b32 v134, v63 +v_mov_b32 v135, v64 +v_mov_b32 v136, v65 +v_mov_b32 v137, v66 +v_mov_b32 v138, v67 +v_mov_b32 v139, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v2, vcc 
+v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 +s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +v_readfirstlane_b32 s41, v4 +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 
0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[124:131], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[132:139], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[88:95], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[96:103], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[105:112], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[113:120], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 278, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v1, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 
+_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[124:131], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[132:139], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 
0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[88:95], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[96:103], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[105:112], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[113:120], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v2, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 +s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 
exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s41, v4 +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[124:131], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[132:139], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 
0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[88:95], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[96:103], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[105:112], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[113:120], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 
+_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v1, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 
+_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[124:131], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[132:139], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[88:95], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[96:103], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[105:112], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[113:120], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 
0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v2, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 +s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], s[64:65] +v_bfe_u32 v21, v0, 6, 1 +v_and_b32 v16, 63, v0 +v_cmp_eq_u32 vcc, v21, 1 +v_cndmask_b32 v23, 0, 0x800, vcc +v_cndmask_b32 v21, 0, 0x400, vcc +v_cndmask_b32 v22, 0, 0x100, vcc +v_lshl_add_u32 v14, v16, 3, v23 +v_lshl_add_u32 v17, v16, 2, v22 +v_lshl_add_u32 v18, v16, 2, 0 +v_lshl_add_u32 v16, v16, 4, v21 +s_cmp_eq_u64 s[30:31], 0 
+s_cselect_b32 s91, 0, 0x11014000 +s_and_b32 s31, s31, 0xffff +s_add_u32 s31, s31, 0x20000 +s_mov_b64 s[88:89], s[30:31] +s_mov_b32 s90, 0x80000000 +v_and_b32 v21, v0, 63 +v_lshlrev_b32 v21, 1, v21 +v_cmp_lt_u32 vcc, v21, s12 +v_add_nc_u32 v22, v21, 1 +v_cndmask_b32 v21, 0x80000000, v21, vcc +v_cmp_lt_u32 vcc, v22, s12 +v_cndmask_b32 v22, 0x80000000, v22, vcc +buffer_load_d16_b16 v23, v21, s[88:91], 0 idxen +buffer_load_d16_hi_b16 v23, v22, s[88:91], 0 idxen +s_waitcnt vmcnt(0) +v_readlane_b32 s56, v23, 0 +v_readlane_b32 s57, v23, 1 +v_readlane_b32 s59, v23, 2 +v_readlane_b32 s64, v23, 3 +v_readlane_b32 s65, v23, 4 +v_readlane_b32 s66, v23, 5 +v_readlane_b32 s67, v23, 6 +v_readlane_b32 s68, v23, 7 +s_bfe_u32 s88, s58, 0x80000 +s_cmp_eq_u32 s88, 2 +s_cbranch_scc1 20 +s_cmp_eq_u32 s88, 0 +s_cselect_b32 s32, 1.0, s32 +v_cvt_f16_f32 v21, s32 +v_readfirstlane_b32 s32, v21 +v_cvt_f16_f32 v21, s33 +v_readfirstlane_b32 s33, v21 +_v_cmp_gt_f16__vop3_s_lit 106, 32, 0x3c00, 0x0, 0x0 +s_pack_ll_b32_b16 s32, s32, s32 +s_pack_ll_b32_b16 s33, s33, s33 +s_cmp_eq_u32 s88, 3 +s_cbranch_scc1 10 +s_cbranch_vccnz 3 +s_mov_b32 s84, 0x424c +s_branch 8 +s_mov_b32 s84, 0x45e4 +s_branch 5 +s_mov_b32 s84, 0x497c +s_branch 2 +s_mov_b32 s84, 0x4f94 +s_add_u32 s86, s6, 0x3c90 +s_addc_u32 s87, s7, 0 +s_mov_b32 s82, 0xbc00c000 +s_mov_b32 s40, 0x10000 +s_mov_b32 s41, 0x30002 +s_mov_b32 s45, 0x10000 +v_readfirstlane_b32 s88, v0 +s_and_b32 null, 64, s88 +s_cmov_b32 s82, 0x3c00c000 +s_cmov_b32 s40, 0x20003 +s_cmov_b32 s41, 1 +s_cmov_b32 s45, 1 +s_and_b32 s21, s21, 0xffff +s_add_u32 s21, s21, 0x20000 +s_lshl_b32 s80, s51, 1 +s_lshl_b32 s81, s52, 1 +s_mov_b64 s[72:73], s[20:21] +s_mov_b32 s74, 0x80000000 +s_mov_b32 s75, 0 +s_sub_u32 s89, s25, 1 +s_bitcmp1_b32 s14, 1 +s_cselect_b32 s89, s89, 0 +s_cselect_b32 s88, -1, 1 +s_sub_u32 s91, s24, 1 +s_bitcmp1_b32 s14, 0 +s_cselect_b32 s91, s91, 0 +s_cselect_b32 s90, -1, 1 +v_bfe_u32 v24, v0, 6, 1 +v_bfe_u32 v25, v0, 4, 1 +v_bfe_u32 v21, v0, 5, 1 
+v_lshl_add_u32 v24, v24, 2, 0 +v_lshl_add_u32 v25, v25, 3, v24 +v_bfe_u32 v23, v0, 2, 2 +v_bfe_u32 v24, v0, 3, 1 +v_xor_b32 v22, v0, v0 quad_perm:[0,0,3,1] +v_lshl_add_u32 v21, v21, 1, v25 +v_xor_b32 v23, v23, v24 +v_add_nc_u32 v24, v21, 1 +v_mad_i32_i16 v19, v23, s88, s89 op_sel:[0,0,0,0] +v_mad_i32_i16 v25, v22, s90, s91 op_sel:[0,0,0,0] +v_mad_u32_u16 v19, v25, s48, v19 op_sel:[0,0,0,0] +v_cmp_lt_u32 vcc, v23, s25 +v_cndmask_b32 v19, 0x80000000, v19, vcc +v_cmp_lt_u32 vcc, v22, s24 +v_cndmask_b32 v19, 0x80000000, v19, vcc +v_mad_u32_u24 v20, v24, s46, v19 +v_mad_u32_u24 v19, v21, s46, v19 +v_cmp_lt_u32 vcc, v24, s12 +v_cndmask_b32 v20, 0x80000000, v20, vcc +v_cmp_lt_u32 vcc, v21, s12 +v_cndmask_b32 v19, 0x80000000, v19, vcc +s_add_u32 s89, s28, 1 +s_lshr_b32 s89, s89, 1 +s_lshl_b32 s90, s89, 1 +s_add_u32 s91, s29, 1 +s_lshr_b32 s91, s91, 1 +s_lshl1_add_u32 s91, s91, 2 +s_pack_ll_b32_b16 s22, s91, s89 +s_pack_ll_b32_b16 s34, s11, s10 +s_sub_u32 s35, s90, s26 +s_sub_u32 s88, s91, s27 +s_pack_ll_b32_b16 s35, s88, s35 +s_pack_ll_b32_b16 s37, s29, s28 +s_sub_u32 s88, s91, 1 +s_pack_ll_b32_b16 s38, s88, s90 +v_lshrrev_b32 v24, 16, s22 +v_bfi_b32 v25, 0xffff, s22, 0 +v_and_b32 v27, 1, v0 +v_bfe_u32 v33, v0, 6, 1 +v_and_b32 v22, 63, v0 +v_mad_u32_u16 v28, 0x7c, s1, 0 op_sel:[0,0,0,0] +v_mad_u32_u16 v33, 2, s5, v33 op_sel:[0,0,0,0] +v_mad_u32_u16 v26, v24, v25, 0 op_sel:[0,0,0,0] +v_cmp_eq_u32 vcc, 0, v27 +v_cndmask_b32 v34, v26, v25, vcc +v_mad_u32_u16 v23, 62, v33, v22 op_sel:[0,0,0,0] +v_cndmask_b32 v23, v28, v23, vcc +v_clz_i32_u32 v40, v34 +v_lshlrev_b32 v41, v40, v34 +v_and_b32 v39, 0xffffff00, v41 +v_cmp_eq_u32 vcc, 0x80000000, v41 +v_cvt_f32_u32 v39, v39 +v_rcp_f32 v35, v39 +v_sub_co_ci_u32 v36, vcc, 32, v40, vcc +v_cvt_f32_ubyte0 v40, v41 +v_fma_f32 v39, v39, v35, -1.0 +v_fma_f32 v39, v40, v35, v39 +v_fmaak_f32 v39, v39, v35, 0x9f000000 +v_mul_f32 v39, 0x5f800000, v39 +v_mov_b32 v40, 0 +v_cvt_floor_i32_f32 v39, -v39 +v_lshl_add_u32 v35, v35, 9, v39 
+v_mad_u64_u32 v[40:41], vcc, v41, v35, v[40:41] +v_sub_co_ci_u32 v35, vcc, v35, -1, vcc +v_mov_b32 v38, v36 quad_perm:[1,1,1,1] +v_mov_b32 v36, v36 quad_perm:[0,0,0,0] +v_mov_b32 v37, v35 quad_perm:[1,1,1,1] +v_mov_b32 v35, v35 quad_perm:[0,0,0,0] +v_mul_hi_u32 v39, v23, v35 +v_add_co_u32 v21, vcc, v39, v23 +v_add_co_ci_u32 v39, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v36 +v_cndmask_b32 v21, v21, v39, vcc +v_alignbit_b32 v21, v39, v21, v36 +v_mul_hi_u32 v39, v23, v37 +v_add_co_u32 v4, vcc, v39, v23 +v_add_co_ci_u32 v39, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v38 +v_cndmask_b32 v4, v4, v39, vcc +v_alignbit_b32 v4, v39, v4, v38 +v_mad_u32_u16 v32, v21, v25, 0 op_sel:[0,0,0,0] +v_mad_u32_u16 v31, v4, v24, 0 op_sel:[0,0,0,0] +v_sub_nc_u32 v32, v23, v32 +v_sub_nc_u32 v31, v21, v31 +v_readlane_b32 s92, v32, 1 +v_sub_nc_u32 v32, v32, v25 +v_readlane_b32 s23, v31, 1 +v_sub_nc_u32 v31, v31, v24 +v_readlane_b32 s15, v4, 1 +v_sub_nc_u32 v4, v4, s8 +s_lshl_b32 s23, s23, 16 +s_and_b32 s92, s92, 0xffff +s_add_u32 s23, s23, s92 +v_mov_b32 v32, v32 quad_perm:[0,0,2,2] +v_mov_b32 v31, v31 quad_perm:[0,0,2,2] +v_mov_b32 v4, v4 quad_perm:[0,0,2,2] +v_add_co_u32 v32, vcc, v32, v27 +v_cndmask_b32 v30, 0, v25, vcc +v_add_co_ci_u32 v31, vcc, v31, 0, vcc +v_cndmask_b32 v29, 0, v24, vcc +v_add_co_ci_u32 v4, vcc, v4, 0, vcc +v_min_u32 v27, v22, 63 +v_sub_nc_u32 v32, v32, v30 +v_sub_nc_u32 v31, v31, v29 +v_cmp_eq_u32 vcc, v22, v27 +v_lshlrev_b32 v5, 16, v31 +v_bfi_b32 v5, 0xffff, v32, v5 +v_add_nc_u32 v42, v4, s8 +v_med3_u32 v27, v22, 1, 62 +v_mul_lo_u32 v6, v42, s42 +v_mul_lo_u32 v11, v42, s50 +s_mul_i32 s36, s15, s42 +s_mul_i32 s39, s15, s50 +v_cndmask_b32 v6, 0x80000000, v6, vcc +v_cmp_eq_u32 vcc, v22, v27 +v_cndmask_b32 v11, 0x80000000, v11, vcc +v_cmp_ge_u32 s[54:55], v42, s8 +v_cndmask_b32 v6, v6, 0x80000000, s[54:55] +v_cndmask_b32 v11, v11, 0x80000000, s[54:55] +s_mov_b32 s49, 3 +s_lshl_b32 s53, s49, 9 +v_add_nc_u32 v15, s53, v14 +s_bfe_u32 s10, s58, 0x80008 +s_bfe_u32 s11, s58, 
0x80010 +s_cmp_eq_u32 s11, 0 +s_cmov_b32 s26, 0 +s_cbranch_scc1 108 +s_add_u32 s11, s11, 0xffffff00 +s_add_u32 s60, s60, 0 +s_addc_u32 s61, s61, 0 +s_lshr_b32 s91, s13, 2 +s_or_b32 s91, s91, 0x21010000 +v_cmp_eq_u32 vcc, v0, 0x100 +s_cmp_eq_u64 vcc, 0 +s_cselect_b32 s91, 0, s91 +s_cselect_b32 s90, 0, 0x1010101 +s_sub_u32 s10, 0, s10 +s_mov_b64 s[88:89], s[60:61] +s_and_b32 s89, s89, 0xffff +s_or_b32 s89, s89, 0x40000 +s_and_b32 s29, s22, 0xffff +s_lshr_b32 s28, s22, 16 +s_lshr_b32 s29, s29, 1 +s_mul_i32 s27, s29, s28 +s_mul_i32 s27, s27, s8 +s_add_u32 s27, s27, 61 +v_writelane_b32 v22, 62, 0 +v_writelane_b32 v22, s1, 1 +v_writelane_b32 v22, 5, 2 +v_clz_i32_u32 v26, v22 +v_lshlrev_b32 v27, v26, v22 +v_and_b32 v28, 0xffffff00, v27 +v_cmp_eq_u32 vcc, 0x80000000, v27 +v_cvt_f32_u32 v28, v28 +v_rcp_f32 v24, v28 +v_sub_co_ci_u32 v25, vcc, 32, v26, vcc +v_cvt_f32_ubyte0 v26, v27 +v_fma_f32 v28, v28, v24, -1.0 +v_fma_f32 v28, v26, v24, v28 +v_fmaak_f32 v28, v28, v24, 0x9f000000 +v_mul_f32 v28, 0x5f800000, v28 +v_mov_b32 v26, 0 +v_cvt_floor_i32_f32 v28, -v28 +v_lshl_add_u32 v24, v24, 9, v28 +v_mad_u64_u32 v[26:27], vcc, v27, v24, v[26:27] +v_sub_co_ci_u32 v24, vcc, v24, -1, vcc +v_mul_hi_u32 v26, s27, v24 +v_add_co_u32 v23, vcc, v26, s27 +v_add_co_ci_u32 v26, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v25 +v_cndmask_b32 v23, v23, v26, vcc +v_alignbit_b32 v23, v26, v23, v25 +v_mov_b32 v23, v23 quad_perm:[0,0,0,0] +v_mul_hi_u32 v26, v23, v24 +v_add_co_u32 v22, vcc, v26, v23 +v_add_co_ci_u32 v26, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v25 +v_cndmask_b32 v22, v22, v26, vcc +v_alignbit_b32 v22, v26, v22, v25 +v_mov_b32 v22, v22 quad_perm:[1,1,1,1] +v_add_nc_u32 v23, v22, 4 +v_mul_hi_u32 v26, v23, v24 +v_add_co_u32 v23, vcc, v26, v23 +v_add_co_ci_u32 v26, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v25 +v_cndmask_b32 v23, v23, v26, vcc +v_alignbit_b32 v23, v26, v23, v25 +v_readlane_b32 s28, v22, 1 +v_readlane_b32 s29, v23, 2 +s_add_u32 s27, s9, 15 +s_lshr_b32 s27, s27, 4 +s_cmp_eq_u32 s27, 1 
+s_cmov_b32 s29, 1 +s_add_u32 s26, s28, s29 +s_mul_i32 s26, s27, s26 +s_add_u32 s26, 6, s26 +s_sub_u32 s26, s26, 1 +s_mov_b32 s92, 0 +s_mov_b32 s93, 0 +s_mov_b32 s94, 0 +s_mov_b32 s95, 0 +s_mov_b32 s96, 0 +s_mov_b32 s97, 0 +s_mov_b32 s28, 0 +s_mov_b32 s27, 8 +s_cmp_gt_u32 s28, 0 +s_cbranch_scc1 4 +v_mov_b32 v58, v4 +v_mov_b32 v63, v5 +v_mov_b32 v145, v6 +v_mov_b32 v146, v11 +v_mov_b32 v4, v58 +v_mov_b32 v5, v63 +v_mov_b32 v6, v145 +v_mov_b32 v11, v146 +s_add_u32 s28, s28, 16 +s_cmp_ge_u32 s28, s9 +s_cmov_b32 s28, 0 +s_cselect_b32 s29, 6, 2 +s_cselect_b32 s98, 9, 0 +s_pack_lh_b32_b16 s29, s29, s27 +s_pack_ll_b32_b16 s98, s98, s28 +v_mov_b32 v144, s29 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v44, v49, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v45, v50, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v46, v51, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v47, v52, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v7, v19 +v_mov_b32 v8, v20 +v_mov_b32 v9, 0x80000000 +v_mov_b32 v10, 0x80000000 +v_mov_b32 v12, 0x80000000 +v_mov_b32 v13, 0x80000000 +s_setprio 0 +ds_load_b128 v[34:37], v3 +ds_store_b128 v16, v[7:10] offset:37120 +ds_load_b128 v[39:42], v3 offset:576 +ds_store_b32 v17, v144 offset:39168 +s_setprio 2 +s_sub_u32 s26, s26, 1 +s_cselect_b32 s91, 0x21010000, s91 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x3c90 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_add_nc_u32 v15, s53, v14 +v_mov_b32 v165, v163 +v_mov_b32 v166, v164 +v_pk_fma_f16 v147, v34, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v148, v35, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v149, v36, s82, v26 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v150, v37, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 34, 285, 290, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 35, 286, 291, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 36, 287, 
292, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 288, 293, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 151, 290, 295, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 152, 291, 296, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 153, 292, 297, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 154, 293, 298, 0x0, 0x3, 0x0, 0x0 +s_setprio 0 +ds_load_b64 v[163:164], v15 offset:39680 +ds_load_b128 v[54:57], v3 offset:2304 +ds_load_b128 v[59:62], v3 offset:2880 +s_setprio 2 +s_mov_b32 s92, s93 +s_mov_b32 s93, s94 +s_mov_b32 s94, s95 +s_mov_b32 s95, s96 +s_mov_b32 s96, s97 +s_mov_b32 s97, s27 +s_bitcmp1_b32 s92, 0 +s_cbranch_scc1 2473 +s_sub_u32 s49, s49, 1 +s_cselect_b32 s49, 3, s49 +s_lshl_b32 s53, s49, 9 +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x3c94 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s75, 0x11014000, 0 +s_sub_u32 s69, s12, 1 +s_cselect_b32 s75, 0, s75 +s_mov_b64 s[72:73], s[20:21] +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v155, v54, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v156, v55, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v157, v56, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v158, v57, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 54, 305, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 55, 306, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 56, 307, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 57, 308, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 159, 310, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 160, 311, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 161, 312, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 162, 313, 318, 0x0, 0x3, 0x0, 0x0 +s_add_u32 s11, s11, 0x100 +s_cbranch_scc0 7 +s_bitset0_b32 s91, 23 +s_lshl_b64 exec, 1, s90 +buffer_store_b8 v0, off, s[88:91], s4 +s_mov_b64 exec, -1 +s_mul_i32 s11, s11, 0xffffff01 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 3 +s_bitset1_b32 s91, 23 +buffer_load_b32 v21, off, 
s[88:91], null glc +s_setprio 0 +s_nop 1 +ds_load_b128 v[24:27], v3 offset:9280 +ds_store_b64 v15, v[12:13] offset:39680 +ds_load_b128 v[29:32], v3 offset:9856 +ds_load_b32 v144, v18 offset:39168 +s_setprio 2 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x3c90 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s27, v144 +v_pk_fma_f16 v24, v29, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v25, v30, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v26, v31, s82, v26 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v27, v32, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +s_setprio 0 +ds_load_b128 v[44:47], v3 offset:11584 +ds_load_b128 v[49:52], v3 offset:12160 +s_setprio 2 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 25 +s_pack_ll_b32_b16 s10, s10, s10 +s_bfm_b64 exec, s91, 0 +v_cmp_ne_u32 vcc, v21, s90 +s_cbranch_vccz 12 +buffer_load_b32 v21, off, s[88:91], null glc +s_cmp_eq_u32 s10, 0 +s_cselect_b32 vcc_lo, 0, 0x10000 +s_add_u32 s10, s10, vcc_lo +s_cbranch_scc1 2 +s_waitcnt vmcnt(0) +s_branch 65524 +s_and_b32 s91, 0xffff0000, s91 +s_mov_b32 s10, 0 +s_mov_b64 exec, -1 +s_mul_i32 s90, s90, 3 +s_and_b32 s90, s90, 0x3f3f3f3f +s_add_u32 s88, s88, 0x100 +s_and_b32 s88, s88, 0xfffff7ff +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x3d54 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_cmp_le_u32 s9, 16 +s_cselect_b32 s99, -1, 4 +s_sub_u32 s99, s99, 1 +s_cselect_b32 s29, s98, s29 +s_bitset0_b32 s29, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v44, v49, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v45, v50, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v46, v51, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v47, v52, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v144, s29 +v_add_co_u32 v33, vcc, v5, s23 +v_pk_mad_u16 v23, v5, 0x20001, s35 +v_pk_mad_u16 v28, v5, 0x20001, s38 +_v_pk_min_u16__vop3p 22, 
289, 261, 0x0, 0x3, 0x0, 0x0 +v_cndmask_b32 v43, 0, s42, vcc +v_cndmask_b32 v167, 0, s50, vcc +v_mad_u32_u16 v7, v23, 1, v6 op_sel:[0,0,0,0] +v_mad_u32_u16 v12, v28, 1, v11 op_sel:[0,0,0,0] +v_add3_u32 v6, v6, s36, v43 +v_add3_u32 v11, v11, s39, v167 +_v_pk_sub_u16__vop3p 22, 261, 278, 0x0, 0x3, 0x0, 0x0 +v_add_co_ci_u32 v4, s[54:55], v4, s15, vcc +v_cndmask_b32 v6, v6, 0x80000000, s[54:55] +v_cndmask_b32 v11, v11, 0x80000000, s[54:55] +v_cmp_lt_u16 vcc, v23, s34 +v_cndmask_b32 v7, 0x80000000, v7, vcc +v_cmp_lt_u16 vcc, v28, s37 +v_cndmask_b32 v12, 0x80000000, v12, vcc +_v_pk_ashrrev_i16__vop3p 22, 143, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_u16__vop3p 53, 279, 41, 0x1, 0x3, 0x0, 0x0 +_v_pk_add_u16__vop3p 48, 279, 40, 0x1, 0x3, 0x0, 0x0 +v_mad_u32_u16 v10, v53, s44, v7 op_sel:[1,0,0,0] +v_mad_u32_u16 v8, v48, s44, v7 op_sel:[1,0,0,0] +_v_pk_add_u16__vop3p 38, 284, 45, 0x1, 0x3, 0x0, 0x0 +_v_cmp_lt_u16__vop3 106, 53, 34, 0x3 +v_cndmask_b32 v10, 0x80000000, v10, vcc +_v_cmp_lt_u16__vop3 106, 48, 34, 0x3 +v_cndmask_b32 v8, 0x80000000, v8, vcc +v_mad_u32_u16 v13, v38, s52, v12 op_sel:[1,0,0,0] +v_mad_u32_u16 v9, v53, s44, v7 op_sel:[0,0,0,0] +v_mad_u32_u16 v7, v48, s44, v7 op_sel:[0,0,0,0] +_v_cmp_lt_u16__vop3 106, 38, 37, 0x3 +v_cndmask_b32 v13, 0x80000000, v13, vcc +_v_cmp_lt_u16__vop3 106, 53, 34, 0x2 +v_cndmask_b32 v9, 0x80000000, v9, vcc +_v_cmp_lt_u16__vop3 106, 48, 34, 0x2 +v_cndmask_b32 v7, 0x80000000, v7, vcc +v_mad_u32_u16 v12, v38, s52, v12 op_sel:[0,0,0,0] +v_pk_mad_u16 v5, v22, s22, v33 +_v_cmp_lt_u16__vop3 106, 38, 37, 0x2 +v_cndmask_b32 v12, 0x80000000, v12, vcc +v_add_co_u32 v22, vcc, v4, s8 +v_cndmask_b32 v144, s98, v144, vcc +s_setprio 0 +ds_load_b128 v[34:37], v3 +ds_store_b128 v16, v[7:10] offset:37120 +ds_load_b128 v[39:42], v3 offset:576 +ds_store_b32 v17, v144 offset:39168 +s_setprio 2 +s_sub_u32 s26, s26, 1 +s_cselect_b32 s91, 0x21010000, s91 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x3c90 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 
+s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_add_nc_u32 v15, s53, v14 +v_mov_b32 v165, v163 +v_mov_b32 v166, v164 +v_pk_fma_f16 v147, v34, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v148, v35, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v149, v36, s82, v26 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v150, v37, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 34, 285, 290, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 35, 286, 291, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 36, 287, 292, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 288, 293, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 151, 290, 295, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 152, 291, 296, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 153, 292, 297, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 154, 293, 298, 0x0, 0x3, 0x0, 0x0 +s_setprio 0 +ds_load_b64 v[163:164], v15 offset:39680 +ds_load_b128 v[54:57], v3 offset:2304 +ds_load_b128 v[59:62], v3 offset:2880 +s_setprio 2 +s_mov_b32 s92, s93 +s_mov_b32 s93, s94 +s_mov_b32 s94, s95 +s_mov_b32 s95, s96 +s_mov_b32 s96, s97 +s_mov_b32 s97, s27 +s_bitcmp1_b32 s92, 0 +s_cbranch_scc1 2181 +s_sub_u32 s49, s49, 1 +s_cselect_b32 s49, 3, s49 +s_lshl_b32 s53, s49, 9 +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x3c94 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s75, 0x11014000, 0 +s_sub_u32 s69, s12, 1 +s_cselect_b32 s75, 0, s75 +s_mov_b64 s[72:73], s[20:21] +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v155, v54, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v156, v55, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v157, v56, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v158, v57, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 54, 305, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 55, 306, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 56, 307, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 57, 308, 313, 0x0, 
0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 159, 310, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 160, 311, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 161, 312, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 162, 313, 318, 0x0, 0x3, 0x0, 0x0 +s_add_u32 s11, s11, 0x100 +s_cbranch_scc0 7 +s_bitset0_b32 s91, 23 +s_lshl_b64 exec, 1, s90 +buffer_store_b8 v0, off, s[88:91], s4 +s_mov_b64 exec, -1 +s_mul_i32 s11, s11, 0xffffff01 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 3 +s_bitset1_b32 s91, 23 +buffer_load_b32 v21, off, s[88:91], null glc +s_setprio 0 +s_nop 1 +ds_load_b128 v[24:27], v3 offset:9280 +ds_store_b64 v15, v[12:13] offset:39680 +ds_load_b128 v[29:32], v3 offset:9856 +ds_load_b32 v144, v18 offset:39168 +s_setprio 2 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x3c90 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s27, v144 +v_pk_fma_f16 v24, v29, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v25, v30, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v26, v31, s82, v26 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v27, v32, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +s_setprio 0 +ds_load_b128 v[44:47], v3 offset:11584 +ds_load_b128 v[49:52], v3 offset:12160 +s_setprio 2 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 25 +s_pack_ll_b32_b16 s10, s10, s10 +s_bfm_b64 exec, s91, 0 +v_cmp_ne_u32 vcc, v21, s90 +s_cbranch_vccz 12 +buffer_load_b32 v21, off, s[88:91], null glc +s_cmp_eq_u32 s10, 0 +s_cselect_b32 vcc_lo, 0, 0x10000 +s_add_u32 s10, s10, vcc_lo +s_cbranch_scc1 2 +s_waitcnt vmcnt(0) +s_branch 65524 +s_and_b32 s91, 0xffff0000, s91 +s_mov_b32 s10, 0 +s_mov_b64 exec, -1 +s_mul_i32 s90, s90, 3 +s_and_b32 s90, s90, 0x3f3f3f3f +s_add_u32 s88, s88, 0x100 +s_and_b32 s88, s88, 0xfffff7ff +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x3d54 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_bitcmp1_b32 s27, 1 +s_cbranch_scc1 65244 +s_branch 65016 +s_setpc_b64 
s[86:87] +s_bitcmp1_b32 s92, 3 +s_cbranch_scc0 40 +v_mov_b32 v64, 0 +v_mov_b32 v68, 0 +v_mov_b32 v65, 0 +v_mov_b32 v69, 0 +v_mov_b32 v66, 0 +v_mov_b32 v70, 0 +v_mov_b32 v67, 0 +v_mov_b32 v71, 0 +v_mov_b32 v80, 0 +v_mov_b32 v84, 0 +v_mov_b32 v81, 0 +v_mov_b32 v85, 0 +v_mov_b32 v82, 0 +v_mov_b32 v86, 0 +v_mov_b32 v83, 0 +v_mov_b32 v87, 0 +v_mov_b32 v96, 0 +v_mov_b32 v100, 0 +v_mov_b32 v97, 0 +v_mov_b32 v101, 0 +v_mov_b32 v98, 0 +v_mov_b32 v102, 0 +v_mov_b32 v99, 0 +v_mov_b32 v103, 0 +v_mov_b32 v112, 0 +v_mov_b32 v116, 0 +v_mov_b32 v113, 0 +v_mov_b32 v117, 0 +v_mov_b32 v114, 0 +v_mov_b32 v118, 0 +v_mov_b32 v115, 0 +v_mov_b32 v119, 0 +v_mov_b32 v128, 0 +v_mov_b32 v132, 0 +v_mov_b32 v129, 0 +v_mov_b32 v133, 0 +v_mov_b32 v130, 0 +v_mov_b32 v134, 0 +v_mov_b32 v131, 0 +v_mov_b32 v135, 0 +s_mov_b32 s85, 0x3e14 +s_cmp_le_u32 s9, 16 +s_cmov_b32 s85, 0x3c90 +s_setpc_b64 s[86:87] +s_bitcmp1_b32 s92, 3 +s_cbranch_scc0 40 +v_mov_b32 v72, 0 +v_mov_b32 v76, 0 +v_mov_b32 v73, 0 +v_mov_b32 v77, 0 +v_mov_b32 v74, 0 +v_mov_b32 v78, 0 +v_mov_b32 v75, 0 +v_mov_b32 v79, 0 +v_mov_b32 v88, 0 +v_mov_b32 v92, 0 +v_mov_b32 v89, 0 +v_mov_b32 v93, 0 +v_mov_b32 v90, 0 +v_mov_b32 v94, 0 +v_mov_b32 v91, 0 +v_mov_b32 v95, 0 +v_mov_b32 v104, 0 +v_mov_b32 v108, 0 +v_mov_b32 v105, 0 +v_mov_b32 v109, 0 +v_mov_b32 v106, 0 +v_mov_b32 v110, 0 +v_mov_b32 v107, 0 +v_mov_b32 v111, 0 +v_mov_b32 v120, 0 +v_mov_b32 v124, 0 +v_mov_b32 v121, 0 +v_mov_b32 v125, 0 +v_mov_b32 v122, 0 +v_mov_b32 v126, 0 +v_mov_b32 v123, 0 +v_mov_b32 v127, 0 +v_mov_b32 v136, 0 +v_mov_b32 v140, 0 +v_mov_b32 v137, 0 +v_mov_b32 v141, 0 +v_mov_b32 v138, 0 +v_mov_b32 v142, 0 +v_mov_b32 v139, 0 +v_mov_b32 v143, 0 +s_mov_b32 s85, 0x3e14 +s_cmp_le_u32 s9, 16 +s_cmov_b32 s85, 0x3c90 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 147, 403, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 148, 404, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 149, 405, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 150, 406, 323, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 151, 407, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 152, 408, 325, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 153, 409, 326, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 154, 410, 327, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v64, v147 +v_mov_b32 v65, v148 +v_mov_b32 v66, v149 +v_mov_b32 v67, v150 +v_mov_b32 v68, v151 +v_mov_b32 v69, v152 +v_mov_b32 v70, v153 +v_mov_b32 v71, v154 +s_mov_b32 s85, 0x3e80 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 155, 411, 328, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 156, 412, 329, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 157, 413, 330, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 158, 414, 331, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 159, 415, 332, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 160, 416, 333, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 161, 417, 334, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 162, 418, 335, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v72, v155 +v_mov_b32 v73, v156 +v_mov_b32 v74, v157 +v_mov_b32 v75, v158 +v_mov_b32 v76, v159 +v_mov_b32 v77, v160 +v_mov_b32 v78, v161 +v_mov_b32 v79, v162 +s_mov_b32 s85, 0x3eec +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 147, 403, 336, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 148, 404, 337, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 149, 405, 338, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 150, 406, 339, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 151, 407, 340, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 152, 408, 341, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 153, 409, 342, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 154, 410, 343, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v80, v147 +v_mov_b32 v81, v148 +v_mov_b32 v82, v149 +v_mov_b32 v83, v150 +v_mov_b32 v84, v151 +v_mov_b32 v85, v152 +v_mov_b32 v86, v153 +v_mov_b32 v87, v154 +s_mov_b32 s85, 0x3f58 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 155, 411, 344, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 156, 412, 345, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 157, 413, 346, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 158, 414, 347, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 159, 415, 348, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 160, 416, 349, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 161, 417, 350, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 162, 418, 351, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v88, v155 +v_mov_b32 v89, v156 +v_mov_b32 v90, v157 +v_mov_b32 v91, v158 +v_mov_b32 v92, v159 +v_mov_b32 v93, v160 +v_mov_b32 v94, v161 +v_mov_b32 v95, v162 +s_mov_b32 s85, 0x3fc4 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 147, 403, 352, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 148, 404, 353, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 149, 405, 354, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 150, 406, 355, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 151, 407, 356, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 152, 408, 357, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 153, 409, 358, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 154, 410, 359, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v96, v147 +v_mov_b32 v97, v148 +v_mov_b32 v98, v149 +v_mov_b32 v99, v150 +v_mov_b32 v100, v151 +v_mov_b32 v101, v152 +v_mov_b32 v102, v153 +v_mov_b32 v103, v154 +s_mov_b32 s85, 0x4030 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 155, 411, 360, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 156, 412, 361, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 157, 413, 362, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 158, 414, 363, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 159, 415, 364, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 160, 416, 365, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 161, 417, 366, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 162, 418, 367, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v104, v155 +v_mov_b32 v105, v156 +v_mov_b32 v106, v157 +v_mov_b32 v107, v158 +v_mov_b32 v108, v159 +v_mov_b32 v109, v160 +v_mov_b32 v110, v161 +v_mov_b32 v111, v162 +s_mov_b32 s85, 0x409c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 147, 403, 368, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 148, 404, 369, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 149, 405, 370, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 150, 406, 371, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 151, 407, 372, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 152, 408, 373, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 153, 409, 374, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 154, 410, 375, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v112, v147 +v_mov_b32 v113, v148 +v_mov_b32 v114, v149 +v_mov_b32 v115, v150 +v_mov_b32 v116, v151 +v_mov_b32 v117, v152 +v_mov_b32 v118, v153 +v_mov_b32 v119, v154 +s_mov_b32 s85, 0x4108 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 155, 411, 376, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 156, 412, 377, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 157, 413, 378, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 158, 414, 379, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 159, 415, 380, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 160, 416, 381, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 161, 417, 382, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 162, 418, 383, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v120, v155 +v_mov_b32 v121, v156 +v_mov_b32 v122, v157 +v_mov_b32 v123, v158 +v_mov_b32 v124, v159 +v_mov_b32 v125, v160 +v_mov_b32 v126, v161 +v_mov_b32 v127, v162 +s_mov_b32 s85, 0x4174 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 147, 403, 384, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 148, 404, 385, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 149, 405, 386, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 150, 406, 387, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 151, 407, 388, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 152, 408, 389, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 153, 409, 390, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 154, 410, 391, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v128, v147 +v_mov_b32 v129, v148 +v_mov_b32 v130, v149 +v_mov_b32 v131, v150 +v_mov_b32 v132, v151 +v_mov_b32 v133, v152 +v_mov_b32 v134, v153 +v_mov_b32 v135, v154 +s_mov_b32 s85, 0x41e0 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 155, 411, 392, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 156, 412, 393, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 157, 413, 394, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 158, 414, 395, 0x0, 0x3, 
0x0, 0x0 +_v_pk_add_f16__vop3p 159, 415, 396, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 160, 416, 397, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 161, 417, 398, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 162, 418, 399, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v136, v155 +v_mov_b32 v137, v156 +v_mov_b32 v138, v157 +v_mov_b32 v139, v158 +v_mov_b32 v140, v159 +v_mov_b32 v141, v160 +v_mov_b32 v142, v161 +v_mov_b32 v143, v162 +s_mov_b32 s85, 0x3e14 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 147, 403, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 148, 404, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 149, 405, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 150, 406, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 151, 407, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 152, 408, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 153, 409, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 154, 410, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 403, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 404, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 405, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 406, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 147, 403, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 148, 404, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 149, 405, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 150, 406, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 407, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 408, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 409, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 410, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 151, 407, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 152, 408, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 153, 409, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 154, 410, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v147, v165, s[72:75], 0 idxen +buffer_store_b16 v151, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v147, 
v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v151, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v148, v165, s[72:75], 0 idxen +buffer_store_b16 v152, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v148, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v152, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v149, v165, s[72:75], 0 idxen +buffer_store_b16 v153, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v149, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v153, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v150, v165, s[72:75], 0 idxen +buffer_store_b16 v154, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v150, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v154, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x4418 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 155, 411, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 156, 412, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 157, 413, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 158, 414, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 159, 415, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 160, 416, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 161, 417, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 162, 418, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 411, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 412, 0x0, 0x3, 0x0, 0x0 
+_v_pk_mul_f16__vop3p 38, 32, 413, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 414, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 155, 411, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 156, 412, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 157, 413, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 158, 414, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 415, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 416, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 417, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 418, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 159, 415, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 160, 416, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 161, 417, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 162, 418, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v155, v165, s[72:75], 0 idxen +buffer_store_b16 v159, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v155, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v159, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v156, v165, s[72:75], 0 idxen +buffer_store_b16 v160, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v156, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v160, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v157, v165, s[72:75], 0 idxen +buffer_store_b16 v161, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v157, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v161, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 
+buffer_store_b16 v158, v165, s[72:75], 0 idxen +buffer_store_b16 v162, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v158, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v162, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x424c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 147, 403, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 148, 404, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 149, 405, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 150, 406, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 151, 407, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 152, 408, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 153, 409, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 154, 410, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 403, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 404, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 405, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 406, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 147, 403, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 148, 404, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 149, 405, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 150, 406, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 407, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 408, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 409, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 410, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 151, 407, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 152, 408, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 153, 409, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 154, 410, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v147, v165, s[72:75], 0 idxen +buffer_store_b16 v151, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v147, 
v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v151, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v148, v165, s[72:75], 0 idxen +buffer_store_b16 v152, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v148, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v152, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v149, v165, s[72:75], 0 idxen +buffer_store_b16 v153, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v149, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v153, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v150, v165, s[72:75], 0 idxen +buffer_store_b16 v154, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v150, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v154, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x47b0 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 155, 411, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 156, 412, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 157, 413, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 158, 414, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 159, 415, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 160, 416, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 161, 417, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 162, 418, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 411, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 412, 0x0, 0x3, 0x0, 0x0 
+_v_pk_mul_f16__vop3p 38, 32, 413, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 414, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 155, 411, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 156, 412, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 157, 413, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 158, 414, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 415, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 416, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 417, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 418, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 159, 415, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 160, 416, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 161, 417, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 162, 418, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v155, v165, s[72:75], 0 idxen +buffer_store_b16 v159, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v155, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v159, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v156, v165, s[72:75], 0 idxen +buffer_store_b16 v160, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v156, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v160, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v157, v165, s[72:75], 0 idxen +buffer_store_b16 v161, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v157, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v161, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 
+buffer_store_b16 v158, v165, s[72:75], 0 idxen +buffer_store_b16 v162, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v158, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v162, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x45e4 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 147, 403, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 148, 404, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 149, 405, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 150, 406, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 151, 407, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 152, 408, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 153, 409, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 154, 410, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 147, 0xbdc5bdc5, 403, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 148, 0xbdc5bdc5, 404, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 149, 0xbdc5bdc5, 405, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 150, 0xbdc5bdc5, 406, 0x0, 0x3 +v_exp_f16 v147, v147 +v_exp_f16 v148, v148 +v_exp_f16 v149, v149 +v_exp_f16 v150, v150 +_v_exp_f16__vop3 147, 147, 0x9 +_v_exp_f16__vop3 148, 148, 0x9 +_v_exp_f16__vop3 149, 149, 0x9 +_v_exp_f16__vop3 150, 150, 0x9 +_v_pk_add_f16__vop3p_lit 147, 0x3c003c00, 403, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 148, 0x3c003c00, 404, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 149, 0x3c003c00, 405, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 150, 0x3c003c00, 406, 0x0, 0x3 +v_rcp_f16 v147, v147 +v_rcp_f16 v148, v148 +v_rcp_f16 v149, v149 +v_rcp_f16 v150, v150 +_v_rcp_f16__vop3 147, 147, 0x9 +_v_rcp_f16__vop3 148, 148, 0x9 +_v_rcp_f16__vop3 149, 149, 0x9 +_v_rcp_f16__vop3 150, 150, 0x9 +_v_pk_mul_f16__vop3p_lit 151, 0xbdc5bdc5, 407, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 152, 0xbdc5bdc5, 408, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 153, 0xbdc5bdc5, 409, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 154, 0xbdc5bdc5, 410, 
0x0, 0x3 +v_exp_f16 v151, v151 +v_exp_f16 v152, v152 +v_exp_f16 v153, v153 +v_exp_f16 v154, v154 +_v_exp_f16__vop3 151, 151, 0x9 +_v_exp_f16__vop3 152, 152, 0x9 +_v_exp_f16__vop3 153, 153, 0x9 +_v_exp_f16__vop3 154, 154, 0x9 +_v_pk_add_f16__vop3p_lit 151, 0x3c003c00, 407, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 152, 0x3c003c00, 408, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 153, 0x3c003c00, 409, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 154, 0x3c003c00, 410, 0x0, 0x3 +v_rcp_f16 v151, v151 +v_rcp_f16 v152, v152 +v_rcp_f16 v153, v153 +v_rcp_f16 v154, v154 +_v_rcp_f16__vop3 151, 151, 0x9 +_v_rcp_f16__vop3 152, 152, 0x9 +_v_rcp_f16__vop3 153, 153, 0x9 +_v_rcp_f16__vop3 154, 154, 0x9 +buffer_store_b16 v147, v165, s[72:75], 0 idxen +buffer_store_b16 v151, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v147, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v151, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v148, v165, s[72:75], 0 idxen +buffer_store_b16 v152, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v148, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v152, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v149, v165, s[72:75], 0 idxen +buffer_store_b16 v153, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v149, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v153, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v150, v165, s[72:75], 0 idxen +buffer_store_b16 v154, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 
+s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v150, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v154, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x4c88 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 155, 411, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 156, 412, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 157, 413, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 158, 414, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 159, 415, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 160, 416, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 161, 417, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 162, 418, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 155, 0xbdc5bdc5, 411, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 156, 0xbdc5bdc5, 412, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 157, 0xbdc5bdc5, 413, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 158, 0xbdc5bdc5, 414, 0x0, 0x3 +v_exp_f16 v155, v155 +v_exp_f16 v156, v156 +v_exp_f16 v157, v157 +v_exp_f16 v158, v158 +_v_exp_f16__vop3 155, 155, 0x9 +_v_exp_f16__vop3 156, 156, 0x9 +_v_exp_f16__vop3 157, 157, 0x9 +_v_exp_f16__vop3 158, 158, 0x9 +_v_pk_add_f16__vop3p_lit 155, 0x3c003c00, 411, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 156, 0x3c003c00, 412, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 157, 0x3c003c00, 413, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 158, 0x3c003c00, 414, 0x0, 0x3 +v_rcp_f16 v155, v155 +v_rcp_f16 v156, v156 +v_rcp_f16 v157, v157 +v_rcp_f16 v158, v158 +_v_rcp_f16__vop3 155, 155, 0x9 +_v_rcp_f16__vop3 156, 156, 0x9 +_v_rcp_f16__vop3 157, 157, 0x9 +_v_rcp_f16__vop3 158, 158, 0x9 +_v_pk_mul_f16__vop3p_lit 159, 0xbdc5bdc5, 415, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 160, 0xbdc5bdc5, 416, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 161, 0xbdc5bdc5, 417, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 162, 0xbdc5bdc5, 418, 0x0, 0x3 +v_exp_f16 v159, v159 +v_exp_f16 v160, v160 +v_exp_f16 v161, v161 +v_exp_f16 v162, v162 +_v_exp_f16__vop3 159, 
159, 0x9 +_v_exp_f16__vop3 160, 160, 0x9 +_v_exp_f16__vop3 161, 161, 0x9 +_v_exp_f16__vop3 162, 162, 0x9 +_v_pk_add_f16__vop3p_lit 159, 0x3c003c00, 415, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 160, 0x3c003c00, 416, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 161, 0x3c003c00, 417, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 162, 0x3c003c00, 418, 0x0, 0x3 +v_rcp_f16 v159, v159 +v_rcp_f16 v160, v160 +v_rcp_f16 v161, v161 +v_rcp_f16 v162, v162 +_v_rcp_f16__vop3 159, 159, 0x9 +_v_rcp_f16__vop3 160, 160, 0x9 +_v_rcp_f16__vop3 161, 161, 0x9 +_v_rcp_f16__vop3 162, 162, 0x9 +buffer_store_b16 v155, v165, s[72:75], 0 idxen +buffer_store_b16 v159, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v155, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v159, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v156, v165, s[72:75], 0 idxen +buffer_store_b16 v160, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v156, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v160, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v157, v165, s[72:75], 0 idxen +buffer_store_b16 v161, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v157, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v161, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v158, v165, s[72:75], 0 idxen +buffer_store_b16 v162, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v158, v165, 
s[72:75], 0 idxen +buffer_store_d16_hi_b16 v162, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x497c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 147, 403, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 148, 404, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 149, 405, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 150, 406, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 151, 407, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 152, 408, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 153, 409, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 154, 410, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 147, 403, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 148, 404, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 149, 405, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 150, 406, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v147 +v_and_b32 v28, 0x7fff7fff, v148 +v_and_b32 v38, 0x7fff7fff, v149 +v_and_b32 v48, 0x7fff7fff, v150 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 
+_v_pk_mul_f16__vop3p_lit 28, 0x41c541c5, 284, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 294, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 304, 0x0, 0x3 +v_exp_f16 v22, v22 +v_exp_f16 v28, v28 +v_exp_f16 v38, v38 +v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 147, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 148, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 149, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 150, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 147, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 148, 0x38b838b8, 0x3, 0x1 
+_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 149, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 43, 43, 38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 150, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb +v_bfi_b32 v147, 0x7fff7fff, v23, v147 +v_bfi_b32 v148, 0x7fff7fff, v33, v148 +v_bfi_b32 v149, 0x7fff7fff, v43, v149 +v_bfi_b32 v150, 0x7fff7fff, v53, v150 +_v_pk_mul_f16__vop3p 147, 403, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 148, 404, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 149, 405, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 150, 406, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 151, 407, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 152, 408, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 153, 409, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 154, 410, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v151 +v_and_b32 v28, 0x7fff7fff, v152 +v_and_b32 v38, 0x7fff7fff, v153 +v_and_b32 v48, 0x7fff7fff, v154 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 28, 0x41c541c5, 284, 0x0, 0x3 
+_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 294, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 304, 0x0, 0x3 +v_exp_f16 v22, v22 +v_exp_f16 v28, v28 +v_exp_f16 v38, v38 +v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 151, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 152, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 153, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 154, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 151, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 152, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 
153, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 43, 43, 38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 154, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb +v_bfi_b32 v151, 0x7fff7fff, v23, v151 +v_bfi_b32 v152, 0x7fff7fff, v33, v152 +v_bfi_b32 v153, 0x7fff7fff, v43, v153 +v_bfi_b32 v154, 0x7fff7fff, v53, v154 +_v_pk_mul_f16__vop3p 151, 407, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 152, 408, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 153, 409, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 154, 410, 32, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v147, v165, s[72:75], 0 idxen +buffer_store_b16 v151, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v147, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v151, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v148, v165, s[72:75], 0 idxen +buffer_store_b16 v152, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v148, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v152, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v149, v165, s[72:75], 0 idxen +buffer_store_b16 v153, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v149, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v153, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v150, v165, s[72:75], 0 idxen +buffer_store_b16 v154, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v150, v165, s[72:75], 
0 idxen +buffer_store_d16_hi_b16 v154, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x5620 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 155, 411, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 156, 412, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 157, 413, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 158, 414, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 159, 415, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 160, 416, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 161, 417, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 162, 418, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 155, 411, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 156, 412, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 157, 413, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 158, 414, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v155 +v_and_b32 v28, 0x7fff7fff, v156 +v_and_b32 v38, 0x7fff7fff, v157 +v_and_b32 v48, 0x7fff7fff, v158 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 28, 
0x41c541c5, 284, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 294, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 304, 0x0, 0x3 +v_exp_f16 v22, v22 +v_exp_f16 v28, v28 +v_exp_f16 v38, v38 +v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 155, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 156, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 157, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 158, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 155, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 156, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb 
+_v_cmp_gt_f16__vop3_v_lit 106, 157, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 43, 43, 38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 158, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb +v_bfi_b32 v155, 0x7fff7fff, v23, v155 +v_bfi_b32 v156, 0x7fff7fff, v33, v156 +v_bfi_b32 v157, 0x7fff7fff, v43, v157 +v_bfi_b32 v158, 0x7fff7fff, v53, v158 +_v_pk_mul_f16__vop3p 155, 411, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 156, 412, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 157, 413, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 158, 414, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 159, 415, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 160, 416, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 417, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 418, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v159 +v_and_b32 v28, 0x7fff7fff, v160 +v_and_b32 v38, 0x7fff7fff, v161 +v_and_b32 v48, 0x7fff7fff, v162 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 28, 0x41c541c5, 284, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 
294, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 304, 0x0, 0x3 +v_exp_f16 v22, v22 +v_exp_f16 v28, v28 +v_exp_f16 v38, v38 +v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 159, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 160, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 161, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 162, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 159, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 160, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 161, 0x38b838b8, 0x3, 0x1 
+_v_cndmask_b16__vop3 43, 43, 38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 162, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb +v_bfi_b32 v159, 0x7fff7fff, v23, v159 +v_bfi_b32 v160, 0x7fff7fff, v33, v160 +v_bfi_b32 v161, 0x7fff7fff, v43, v161 +v_bfi_b32 v162, 0x7fff7fff, v53, v162 +_v_pk_mul_f16__vop3p 159, 415, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 160, 416, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 417, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 418, 32, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v155, v165, s[72:75], 0 idxen +buffer_store_b16 v159, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v155, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v159, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v156, v165, s[72:75], 0 idxen +buffer_store_b16 v160, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v156, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v160, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v157, v165, s[72:75], 0 idxen +buffer_store_b16 v161, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v157, v165, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v161, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v158, v165, s[72:75], 0 idxen +buffer_store_b16 v162, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v158, v165, s[72:75], 0 idxen 
+buffer_store_d16_hi_b16 v162, v166, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x4f94 +s_setpc_b64 s[86:87] +s_endpgm +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end 
+s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end 
+s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end 
+s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end + diff --git a/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc new file mode 100644 index 0000000000..e1afab3c1d --- /dev/null +++ b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc @@ -0,0 +1,4640 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +/* Hand-encoding helpers: the macros below emit raw 32-bit instruction words with .long instead of using assembler mnemonics. _sop1_lit emits two dwords: word 0 packs the fixed prefix 0b101111101 at bit 23, sdst at bit 16, op at bit 8 and the constant 255 in the low byte; word 1 is the 32-bit value lit. NOTE(review): 255 is presumably the source encoding meaning that a literal dword follows - confirm against the ISA guide. */ +.macro _sop1_lit op:req, sdst:req, lit:req + .long (0b101111101 << 23) | (\sdst << 16) | (\op << 8) | 255 + .long \lit +.endm +/* _s_mov_b32__sop1_lit: _sop1_lit with op 0. sdst is a bare register number (call sites pass e.g. 56). This form always emits two dwords, even for a small value such as 0x4. */ +.macro _s_mov_b32__sop1_lit sdst:req, lit:req + _sop1_lit 0, \sdst, \lit +.endm +/* _vop1: emits one dword: fixed prefix 0b0111111 at bit 25, vdst at bit 17, op at bit 9, src in the low bits. src is used as given, i.e. as an already-encoded operand; the wrappers below form it as 256 plus a VGPR number. */ +.macro _vop1 op:req, vdst:req, src:req + .long (0b0111111 << 25) | (\vdst << 17) | (\op << 9) | \src +.endm +/* v_cvt_f16_i16 through _vop1 (op 81). vdst and vsrc are bare VGPR numbers; call sites may OR 0x80 into them, annotated op_sel. NOTE(review): presumably bit 7 selects the high 16-bit half of the register - confirm. */ +.macro _v_cvt_f16_i16__vop1 vdst:req, vsrc:req + _vop1 81, \vdst, (\vsrc + /*VGPR*/ 256) +.endm +/* v_rcp_f16 through _vop1 (op 84); same operand convention as _v_cvt_f16_i16__vop1. */ +.macro _v_rcp_f16__vop1 vdst:req, vsrc:req + _vop1 84, \vdst, (\vsrc + /*VGPR*/ 256) +.endm +/* v_exp_f16 through _vop1 (op 88); same operand convention as _v_cvt_f16_i16__vop1. */ +.macro _v_exp_f16__vop1 vdst:req, vsrc:req + _vop1 88, \vdst, (\vsrc + /*VGPR*/ 256) +.endm +/* _vop3: emits two dwords. Word 0: fixed prefix 0b110101 at bit 26, op at bit 16, opsel at bit 11, abs at bit 8, vdst in the low bits. Word 1: neg at bit 29, src2 at bit 18, src1 at bit 9, src0 in the low bits. All sources are used as given (already-encoded operands). */ +.macro _vop3 op:req, vdst:req, src0:req, src1:req, src2:req, opsel:req, abs:req, neg:req + .long (0b110101 << 26) | (\op << 16) | (\opsel << 11) | (\abs << 8) | \vdst + .long (\neg << 29) | (\src2 << 18) | (\src1 << 9) | \src0 +.endm +/* _vop3_lit: the same two dwords as _vop3, followed by a third dword holding lit. The wrappers below that use it pass 255 as the source that refers to the literal. */ +.macro _vop3_lit op:req, vdst:req, src0:req, src1:req, src2:req, lit:req, opsel:req, abs:req, neg:req + .long (0b110101 << 26) | (\op << 16) | (\opsel << 11) | (\abs << 8) | \vdst + .long (\neg << 29) | (\src2 << 18) | (\src1 << 9) | \src0 + .long \lit +.endm +/* v_cvt_f16_i16 through _vop3 (op 465, which is 81 plus 384) with an explicit opsel field; src1, src2, abs and neg are fixed to 0. */ +.macro _v_cvt_f16_i16__vop3 vdst:req, vsrc:req, opsel:req + _vop3 465, \vdst, (\vsrc + /*VGPR*/ 256), 0, 0, \opsel, 0, 0 +.endm +/* v_rcp_f16 through _vop3 (op 468, which is 84 plus 384); src1, src2, abs and neg are fixed to 0. */ +.macro _v_rcp_f16__vop3 vdst:req, vsrc:req, opsel:req + _vop3 468, \vdst, (\vsrc + /*VGPR*/ 256), 0, 0, \opsel, 0, 0 +.endm +/* v_exp_f16 through _vop3 (op 472, which is 88 plus 384); src1, src2, abs and neg are fixed to 0. */ +.macro _v_exp_f16__vop3 vdst:req, vsrc:req, opsel:req + _vop3 472, \vdst, (\vsrc + /*VGPR*/ 256), 0, 0, \opsel, 0, 0 +.endm +/* v_cndmask_b16 through _vop3 (op 605). vsrc0 and vsrc1 are bare VGPR numbers; src2 is passed through unchanged (call sites pass 106, the same value they give the preceding compare macro as its destination). abs and neg are fixed to 0. */ +.macro _v_cndmask_b16__vop3 vdst:req, vsrc0:req, vsrc1:req, src2:req, opsel:req + _vop3 605, \vdst, (\vsrc0 + /*VGPR*/ 256), (\vsrc1 +
/*VGPR*/ 256), \src2, \opsel, 0, 0 +.endm +/* v_cmp_gt_f16 through _vop3_lit (op 4) with src1 = 255 and lit as the trailing dword. sdst is placed in the vdst field; ssrc0 is passed through unchanged as an already-encoded operand. src2 and neg are fixed to 0. */ +.macro _v_cmp_gt_f16__vop3_s_lit sdst:req, ssrc0:req, lit:req, opsel:req, abs:req + _vop3_lit 4, \sdst, \ssrc0, 255, 0, \lit, \opsel, \abs, 0 +.endm +/* Same as _v_cmp_gt_f16__vop3_s_lit, but the first source is a bare VGPR number (256 is added here). Call sites issue it in pairs with opsel 0x0 and 0x3. */ +.macro _v_cmp_gt_f16__vop3_v_lit sdst:req, vsrc0:req, lit:req, opsel:req, abs:req + _vop3_lit 4, \sdst, (\vsrc0 + /*VGPR*/ 256), 255, 0, \lit, \opsel, \abs, 0 +.endm +/* v_cmp_lt_u16 through _vop3 (op 57): first source is a bare VGPR number, ssrc1 is passed through unchanged; src2, abs and neg are fixed to 0. */ +.macro _v_cmp_lt_u16__vop3 sdst:req, vsrc0:req, ssrc1:req, opsel:req + _vop3 57, \sdst, (\vsrc0 + /*VGPR*/ 256), \ssrc1, 0, \opsel, 0, 0 +.endm +/* v_cmpx_lt_u32 through _vop3 (op 201): first source is a bare VGPR number, ssrc1 is passed through unchanged; src2, opsel, abs and neg are fixed to 0. */ +.macro _v_cmpx_lt_u32__vop3 sdst:req, vsrc0:req, ssrc1:req + _vop3 201, \sdst, (\vsrc0 + /*VGPR*/ 256), \ssrc1, 0, 0, 0, 0 +.endm +/* _vop3p: emits two dwords. Word 0: fixed prefix 0b11001100 at bit 24, op at bit 16, opsel_hi2 at bit 14, opsel at bit 11, neg_hi at bit 8, vdst in the low bits. Word 1: neg at bit 29, opsel_hi at bit 27, src2 at bit 18, src1 at bit 9, src0 in the low bits. Sources are used as given; call sites pass already-encoded values (e.g. 411, presumably 256 plus VGPR 155). */ +.macro _vop3p op:req, vdst:req, src0:req, src1:req, src2:req, opsel:req, opsel_hi:req, opsel_hi2:req, neg:req, neg_hi:req + .long (0b11001100 << 24) | (\op << 16) | (\opsel_hi2 << 14) | (\opsel << 11) | (\neg_hi << 8) | \vdst + .long (\neg << 29) | (\opsel_hi << 27) | (\src2 << 18) | (\src1 << 9) | \src0 +.endm +/* _vop3p_lit: the same two dwords as _vop3p, followed by a third dword holding lit. */ +.macro _vop3p_lit op:req, vdst:req, src0:req, src1:req, src2:req, lit:req, opsel:req, opsel_hi:req, opsel_hi2:req, neg:req, neg_hi:req + .long (0b11001100 << 24) | (\op << 16) | (\opsel_hi2 << 14) | (\opsel << 11) | (\neg_hi << 8) | \vdst + .long (\neg << 29) | (\opsel_hi << 27) | (\src2 << 18) | (\src1 << 9) | \src0 + .long \lit +.endm +/* v_pk_ashrrev_i16 through _vop3p (op 6); src2 and opsel_hi2 are fixed to 0, src0 and src1 are passed through unchanged. */ +.macro _v_pk_ashrrev_i16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 6, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm +/* v_pk_add_u16 through _vop3p (op 10); src2 and opsel_hi2 are fixed to 0. */ +.macro _v_pk_add_u16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 10, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm +/* v_pk_sub_u16 through _vop3p (op 11); src2 and opsel_hi2 are fixed to 0. */ +.macro _v_pk_sub_u16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 11, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm +/* v_pk_min_u16 through _vop3p (op 13); src2 and opsel_hi2 are fixed to 0. */ +.macro _v_pk_min_u16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 13, \vdst, \src0,
\src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm +/* v_pk_add_f16 through _vop3p (op 15); src2 and opsel_hi2 are fixed to 0. */ +.macro _v_pk_add_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 15, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm +/* v_pk_add_f16 with a literal first source: _vop3p_lit op 15 with src0 = 255 and lit as the trailing dword; src2, opsel_hi2, neg and neg_hi are fixed to 0. */ +.macro _v_pk_add_f16__vop3p_lit vdst:req, lit:req, src1:req, opsel:req, opsel_hi:req + _vop3p_lit 15, \vdst, 255, \src1, 0, \lit, \opsel, \opsel_hi, 0, 0, 0 +.endm +/* v_pk_mul_f16 through _vop3p (op 16); src2 and opsel_hi2 are fixed to 0. */ +.macro _v_pk_mul_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 16, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm +/* v_pk_mul_f16 with a literal first source: _vop3p_lit op 16 with src0 = 255 and lit as the trailing dword; src2, opsel_hi2, neg and neg_hi are fixed to 0. */ +.macro _v_pk_mul_f16__vop3p_lit vdst:req, lit:req, src1:req, opsel:req, opsel_hi:req + _vop3p_lit 16, \vdst, 255, \src1, 0, \lit, \opsel, \opsel_hi, 0, 0, 0 +.endm +/* v_pk_min_f16 through _vop3p (op 17); src2 and opsel_hi2 are fixed to 0. */ +.macro _v_pk_min_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 17, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm +/* v_pk_max_f16 through _vop3p (op 18); src2 and opsel_hi2 are fixed to 0. */ +.macro _v_pk_max_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 18, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm +/* Start of the kernel instruction stream. NOTE(review): later branches use numeric offsets (for example s_cbranch_scc0 14) and constants are added to s_getpc_b64 results, so inserting or removing instructions would presumably invalidate them - confirm before editing. */ +s_version 0x2006 +s_set_inst_prefetch_distance 0x3 +s_mov_b32 s0, 0 +v_lshlrev_b32 v1, 7, v0 +s_getpc_b64 s[8:9] +s_mov_b32 s10, 0x6244 +s_mov_b32 s11, 0x31014000 +buffer_load_b32 v2, v1, s[8:11], 0 offen +s_waitcnt vmcnt(0) +s_getpc_b64 s[6:7] +s_load_b512 s[8:23], s[2:3], null +s_load_b512 s[24:39], s[2:3], 0x40 +s_load_b512 s[40:55], s[2:3], 0x80 +s_load_b256 s[56:63], s[2:3], 0xc0 +s_load_b64 s[64:65], s[2:3], 0xe0 +v_and_b32 v8, 0xff, v0 +v_lshrrev_b32 v9, 1, v8 +v_and_b32 v10, 1, v0 +v_add_nc_u32 v5, v9, 32 +v_bfi_b32 v6, 31, v8, v9 +v_bfe_u32 v4, v8, 5, 1 +v_bfi_b32 v6, 0xbf, v6, v5 +v_and_b32 v2, 31, v8 +v_lshrrev_b32 v6, 5, v6 +v_lshrrev_b32 v7, 6, v8 +v_lshlrev_b32 v2, 4, v2 +v_and_b32 v3, 31, v9 +v_mad_u32_u24 v2, v4, 0x900, v2 +v_lshlrev_b32 v3, 4, v3 +v_xor_b32 v5, 3, v6 +v_mad_u32_u16 v3, 0x480, v7, v3 op_sel:[0,0,0,0] +v_mad_u32_u24 v1,
v5, 0x240, v2 +v_mad_u32_u16 v3, 0x1240, v10, v3 op_sel:[0,0,0,0] +v_mad_u32_u24 v2, v6, 0x240, v2 +s_waitcnt expcnt(0) lgkmcnt(0) vmcnt(0) +s_bitcmp1_b32 s14, 6 +s_cbranch_scc0 14 +s_load_b64 s[16:17], s[16:17], null +s_load_b64 s[20:21], s[20:21], null +s_load_b64 s[18:19], s[18:19], null +s_cmp_eq_u64 0, s[60:61] +s_cbranch_scc1 2 +s_load_b64 s[60:61], s[60:61], null +s_cmp_eq_u64 0, s[30:31] +s_cbranch_scc1 2 +s_load_b64 s[30:31], s[30:31], null +s_bitcmp1_b32 s14, 3 +s_cbranch_scc0 2 +s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xf0 +s_cmp_eq_u32 s13, 0x60 +s_cbranch_scc0 16 +s_mul_i32 s1, s4, 0xab +s_lshr_b32 s1, s1, 10 +s_mul_i32 s23, s1, 6 +s_sub_u32 s23, s4, s23 +s_bfe_u32 s15, s1, 0x20000 +s_bfe_u32 s22, s1, 0x10002 +s_bfe_u32 s5, s1, 0x10003 +s_mov_b32 s45, s23 +s_lshl1_add_u32 s45, s45, s22 +s_lshl2_add_u32 s45, s45, s15 +s_lshl1_add_u32 s45, s45, s5 +s_mov_b32 s4, s45 +s_waitcnt expcnt(0) lgkmcnt(0) vmcnt(0) +s_bitcmp1_b32 s14, 13 +s_cbranch_scc0 10 +s_add_u32 s16, s16, s34 +s_addc_u32 s17, s17, s35 +s_add_u32 s20, s20, s38 +s_addc_u32 s21, s21, s39 +s_add_u32 s18, s18, s36 +s_addc_u32 s19, s19, s37 +s_cmp_eq_u64 0, s[30:31] +s_cselect_b64 s[40:41], 0, s[40:41] +s_add_u32 s30, s30, s40 +s_addc_u32 s31, s31, s41 +s_add_u32 s15, s12, 15 +s_lshr_b32 s15, s15, 4 +v_cvt_f32_u32 v4, s15 +v_rcp_f32 v4, v4 +v_mul_f32 v4, 0x47800000, v4 +v_cvt_floor_i32_f32 v4, v4 +v_mad_u32_u24 v5, v4, s13, s13 +v_lshrrev_b32 v5, 16, v5 +v_cvt_f32_u32 v4, v5 +v_rcp_f32 v4, v4 +v_mul_f32 v4, 0x47800000, v4 +v_cvt_floor_i32_f32 v4, v4 +v_mad_u32_u24 v6, v4, s4, s4 +v_lshrrev_b32 v6, 16, v6 +v_readfirstlane_b32 s1, v5 +v_readfirstlane_b32 s22, v6 +s_mul_i32 s5, s22, s1 +s_sub_u32 s5, s4, s5 +s_cmp_ge_u32 s22, s15 +s_cbranch_scc1 6159 +s_mul_i32 s13, s1, s15 +s_mul_i32 s23, s22, 16 +s_sub_u32 s12, s12, s23 +s_min_u32 s12, s12, 16 +s_mul_i32 s34, s23, s46 +s_mul_hi_u32 s35, s23, s46 +s_lshl_b64 s[34:35], s[34:35], 1 +s_add_u32 s18, s34, s18 +s_addc_u32 s19, s35, s19 +s_lshr_b32 
s35, s23, 0 +s_mul_i32 s34, s35, s51 +s_mul_hi_u32 s35, s35, s51 +s_lshl_b64 s[34:35], s[34:35], 1 +s_add_u32 s20, s34, s20 +s_addc_u32 s21, s35, s21 +s_lshl_b32 s34, s23, 1 +s_cmp_eq_u64 s[30:31], 0 +s_cselect_b32 s34, 0, s34 +s_add_u32 s30, s30, s34 +s_addc_u32 s31, s31, 0 +v_cmp_lt_u32 vcc, v0, 0x100 +s_cbranch_vccz 2749 +v_and_b32 v20, 0xff, v0 +v_lshrrev_b32 v21, 1, v20 +v_bfe_u32 v17, v20, 3, 1 +v_bfe_u32 v16, v20, 2, 1 +v_mad_u32_u16 v17, v17, 16, 0 op_sel:[0,0,0,0] +v_mad_u32_u16 v14, v16, 0x1240, v17 op_sel:[0,0,0,0] +v_bfe_u32 v16, v20, 0, 2 +v_mad_u32_u16 v14, v16, 0x90, v14 op_sel:[0,0,0,0] +v_bfe_u32 v17, v20, 4, 2 +v_mad_u32_u16 v14, v17, 32, v14 op_sel:[0,0,0,0] +v_bfe_u32 v16, v20, 6, 1 +v_mad_u32_u16 v14, v16, 0x480, v14 op_sel:[0,0,0,0] +v_bfe_u32 v16, v20, 7, 1 +v_mad_u32_u16 v14, v16, 0x900, v14 op_sel:[0,0,0,0] +v_bfe_u32 v18, v20, 1, 2 +v_mad_u32_u16 v13, v18, 32, 0 op_sel:[0,0,0,0] +v_bfe_u32 v19, v20, 3, 1 +v_mad_u32_u16 v13, v19, 0x480, v13 op_sel:[0,0,0,0] +v_add_nc_u32 v18, v21, 32 +v_bfi_b32 v18, 0xbf, v20, v18 +v_bfe_u32 v18, v18, 6, 2 +v_mad_u32_u16 v13, v18, 0x90, v13 op_sel:[0,0,0,0] +v_xor_b32 v16, v0, v0 quad_perm:[2,3,2,1] +v_xor_b32 v17, v0, v0 quad_perm:[0,0,3,3] +v_sub_nc_u16 v16, v16, v17 op_sel:[0,0,0] +v_cvt_f16_i16 v15, v16 +_v_cvt_f16_i16__vop1 (15 | /*op_sel*/ 0x80), 17 +_v_pk_mul_f16__vop3p 15, 271, 240, 0x0, 0x1, 0x0, 0x0 +v_bfe_u32 v16, v0, 6, 1 +v_and_b32 v5, 63, v0 +v_cmp_eq_u32 vcc, v16, 1 +v_cndmask_b32 v16, 0, 0x400, vcc +v_cndmask_b32 v17, 0, 0x100, vcc +v_lshl_add_u32 v6, v5, 2, 0 +v_lshl_add_u32 v5, v5, 4, v16 +s_mov_b32 s23, 4 +s_mov_b32 s34, 0 +s_mov_b32 s40, 0xbc00c000 +v_readfirstlane_b32 s74, v0 +s_and_b32 null, 64, s74 +s_cmov_b32 s40, 0x3c00c000 +s_lshl_b32 s49, s43, 1 +s_lshl_b32 s53, s47, 1 +s_lshl_b32 s75, s49, 3 +s_lshl_b32 s76, s53, 3 +s_and_b32 null, 0x80, s74 +s_cselect_b32 s75, s75, 0 +s_cselect_b32 s76, s76, 0 +s_cselect_b32 s22, 8, 0 +s_sub_u32 s22, s9, s22 +s_cmov_b32 s22, 0 +s_mov_b32 s35, 
0x11014000 +s_bitcmp1_b32 s14, 4 +s_cselect_b32 s77, 0, 0x8000000 +s_and_b32 s35, 0xf7ffffff, s35 +s_or_b32 s35, s35, s77 +s_and_b32 s17, s17, 0xffff +s_add_u32 s17, s17, 0x20000 +s_and_b32 s19, s19, 0xffff +s_add_u32 s19, s19, 0x20000 +s_add_u32 s16, s16, s75 +s_addc_u32 s17, s17, 0 +s_add_u32 s18, s18, s76 +s_addc_u32 s19, s19, 0 +s_mov_b64 s[36:37], s[16:17] +s_mov_b32 s38, 0x80000000 +s_mov_b32 s39, 0 +s_getpc_b64 s[64:65] +v_cmp_lt_u32 vcc, v0, 0x80 +s_cmp_gt_u32 vcc_lo, 0 +s_mov_b32 s74, 0x23d8 +s_mov_b32 s76, 0x1a58 +s_cmov_b32 s74, 0x1e98 +s_cmov_b32 s76, 0x1618 +s_mov_b32 s75, 0x2654 +s_mov_b32 s77, 0x1c54 +s_cmov_b32 s75, 0x2114 +s_cmov_b32 s77, 0x1814 +s_add_u32 s66, s64, s74 +s_addc_u32 s67, s65, 0 +s_add_u32 s70, s64, s76 +s_addc_u32 s71, s65, 0 +s_add_u32 s68, s64, s75 +s_addc_u32 s69, s65, 0 +s_add_u32 s72, s64, s77 +s_addc_u32 s73, s65, 0 +s_mov_b32 s45, 0 +v_mov_b32 v4, 0 +s_mov_b32 s56, 0x190 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_add_f16__vop3p 160, 272, 273, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 161, 308, 341, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 162, 360, 377, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 163, 396, 397, 0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v164, v16, s40, v34 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v165, v52, s40, v86 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v166, v104, s40, v122 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v167, v140, s40, v142 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v121, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v104, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v121, 
v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v104, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v141, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v140, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v141, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v140, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5911 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x12c +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_mul_f16__vop3p 160, 273, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 341, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 377, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 397, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v17, v160 quad_perm:[1,0,3,2] +v_mov_b32 v85, v161 quad_perm:[1,0,3,2] +v_mov_b32 v121, v162 quad_perm:[1,0,3,2] +v_mov_b32 v141, v163 quad_perm:[1,0,3,2] +v_pk_fma_f16 v160, v17, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v85, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v121, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v141, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v17, v160 quad_perm:[2,3,0,1] +v_mov_b32 v85, v161 quad_perm:[2,3,0,1] +v_mov_b32 v121, v162 quad_perm:[2,3,0,1] +v_mov_b32 v141, v163 quad_perm:[2,3,0,1] +v_pk_fma_f16 v160, v17, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v85, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v121, 
v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v141, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v121, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v104, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v121, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v104, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v141, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v140, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v141, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v140, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5812 +s_mov_b32 s56, 0x18c +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 +s_setprio 2 +v_pk_fma_f16 v160, v34, s40, v164 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v161, v86, s40, v165 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v162, v122, s40, v166 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v163, v142, s40, v167 op_sel:[0,0,0] op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 164, 290, 291, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 165, 342, 343, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 166, 378, 379, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 167, 398, 399, 0x0, 
0x3, 0x1, 0x1 +buffer_load_d16_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v86, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v87, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v86, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v87, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5738 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x130 +s_setprio 2 +_v_pk_mul_f16__vop3p 160, 290, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 342, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 378, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 398, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v34, v160 quad_perm:[1,0,3,2] +v_mov_b32 v86, v161 quad_perm:[1,0,3,2] +v_mov_b32 v122, v162 quad_perm:[1,0,3,2] +v_mov_b32 v142, v163 quad_perm:[1,0,3,2] +v_pk_fma_f16 v160, v34, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v86, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v122, v15, v162 
op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v142, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v34, v160 quad_perm:[2,3,0,1] +v_mov_b32 v86, v161 quad_perm:[2,3,0,1] +v_mov_b32 v122, v162 quad_perm:[2,3,0,1] +v_mov_b32 v142, v163 quad_perm:[2,3,0,1] +v_pk_fma_f16 v160, v34, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v86, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v122, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v142, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v86, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v87, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v86, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v87, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5640 +s_mov_b32 s56, 0x190 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_add_f16__vop3p 160, 403, 
402, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 161, 407, 406, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 162, 411, 410, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 163, 415, 414, 0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v164, v147, s40, v144 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v165, v151, s40, v148 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v166, v155, s40, v152 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v167, v159, s40, v156 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v154, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v155, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v154, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v155, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v158, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v159, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v158, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v159, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5565 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x12c +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_mul_f16__vop3p 160, 402, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 406, 271, 
0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 410, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 414, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v146, v160 quad_perm:[1,0,3,2] +v_mov_b32 v150, v161 quad_perm:[1,0,3,2] +v_mov_b32 v154, v162 quad_perm:[1,0,3,2] +v_mov_b32 v158, v163 quad_perm:[1,0,3,2] +v_pk_fma_f16 v160, v146, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v150, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v154, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v158, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v146, v160 quad_perm:[2,3,0,1] +v_mov_b32 v150, v161 quad_perm:[2,3,0,1] +v_mov_b32 v154, v162 quad_perm:[2,3,0,1] +v_mov_b32 v158, v163 quad_perm:[2,3,0,1] +v_pk_fma_f16 v160, v146, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v150, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v154, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v158, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v154, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v155, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v154, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v155, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v158, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v159, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 
v151, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v158, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v159, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5466 +s_mov_b32 s56, 0x18c +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 +s_setprio 2 +v_pk_fma_f16 v160, v144, s40, v164 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v161, v148, s40, v165 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v162, v152, s40, v166 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v163, v156, s40, v167 op_sel:[0,0,0] op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 164, 400, 401, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 165, 404, 405, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 166, 408, 409, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 167, 412, 413, 0x0, 0x3, 0x1, 0x1 +buffer_load_d16_b16 v144, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v145, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v144, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v145, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v148, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v149, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v148, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v149, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v156, v8, s[36:39], 0 idxen 
+buffer_load_d16_hi_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5392 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x130 +s_setprio 2 +_v_pk_mul_f16__vop3p 160, 400, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 404, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 408, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 412, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v144, v160 quad_perm:[1,0,3,2] +v_mov_b32 v148, v161 quad_perm:[1,0,3,2] +v_mov_b32 v152, v162 quad_perm:[1,0,3,2] +v_mov_b32 v156, v163 quad_perm:[1,0,3,2] +v_pk_fma_f16 v160, v144, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v148, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v152, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v156, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v144, v160 quad_perm:[2,3,0,1] +v_mov_b32 v148, v161 quad_perm:[2,3,0,1] +v_mov_b32 v152, v162 quad_perm:[2,3,0,1] +v_mov_b32 v156, v163 quad_perm:[2,3,0,1] +v_pk_fma_f16 v160, v144, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v148, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v152, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v156, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v144, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v145, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v144, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v145, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 
1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v148, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v149, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v148, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v149, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5294 +s_mov_b32 s56, 0x190 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_add_f16__vop3p 160, 272, 273, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 161, 308, 341, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 162, 291, 290, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 163, 343, 342, 0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v164, v16, s40, v121 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v165, v52, s40, v141 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v166, v35, s40, v122 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v167, v87, s40, v142 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 
+buffer_load_d16_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v86, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v87, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v86, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v87, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5219 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x12c +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_mul_f16__vop3p 160, 273, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 341, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 290, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 342, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v17, v160 quad_perm:[1,0,3,2] +v_mov_b32 v85, v161 quad_perm:[1,0,3,2] +v_mov_b32 v34, v162 quad_perm:[1,0,3,2] +v_mov_b32 v86, v163 quad_perm:[1,0,3,2] +v_pk_fma_f16 v160, v17, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v85, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v34, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v86, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v17, v160 quad_perm:[2,3,0,1] +v_mov_b32 v85, v161 quad_perm:[2,3,0,1] +v_mov_b32 v34, v162 quad_perm:[2,3,0,1] +v_mov_b32 v86, v163 quad_perm:[2,3,0,1] +v_pk_fma_f16 v160, v17, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v85, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v34, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v86, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, 
v9, s[36:39], 0 idxen +buffer_load_d16_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v86, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v87, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v85, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v86, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v87, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5120 +s_mov_b32 s56, 0x18c +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 +s_setprio 2 +v_pk_fma_f16 v160, v121, s40, v164 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v161, v141, s40, v165 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v162, v122, s40, v166 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v163, v142, s40, v167 op_sel:[0,0,0] op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 164, 377, 360, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 165, 397, 396, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 166, 378, 379, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 167, 398, 399, 0x0, 0x3, 0x1, 0x1 +buffer_load_d16_b16 v121, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v104, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v123, v10, 
s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v121, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v104, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v141, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v140, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v141, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v140, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5046 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x130 +s_setprio 2 +_v_pk_mul_f16__vop3p 160, 377, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 397, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 378, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 398, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v121, v160 quad_perm:[1,0,3,2] +v_mov_b32 v141, v161 quad_perm:[1,0,3,2] +v_mov_b32 v122, v162 quad_perm:[1,0,3,2] +v_mov_b32 v142, v163 quad_perm:[1,0,3,2] +v_pk_fma_f16 v160, v121, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v141, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v122, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v142, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v121, v160 quad_perm:[2,3,0,1] +v_mov_b32 v141, v161 quad_perm:[2,3,0,1] 
+v_mov_b32 v122, v162 quad_perm:[2,3,0,1] +v_mov_b32 v142, v163 quad_perm:[2,3,0,1] +v_pk_fma_f16 v160, v121, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v141, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v122, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v142, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v121, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v104, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v121, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v104, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v122, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v123, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v141, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v140, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v141, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v140, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v142, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v143, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4948 +s_mov_b32 s56, 0x190 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x2b8 +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_add_f16__vop3p 160, 403, 402, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 161, 407, 406, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 162, 401, 400, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 163, 405, 404, 
0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v164, v147, s40, v154 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v165, v151, s40, v158 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v166, v145, s40, v152 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v167, v149, s40, v156 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v144, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v145, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v144, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v145, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v148, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v149, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v148, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v149, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4873 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[64:65], s[66:67], s[70:71] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0x12c +s_setprio 2 +s_waitcnt vmcnt(32) +_v_pk_mul_f16__vop3p 160, 402, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 406, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 400, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 404, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v146, v160 quad_perm:[1,0,3,2] 
+v_mov_b32 v150, v161 quad_perm:[1,0,3,2] +v_mov_b32 v144, v162 quad_perm:[1,0,3,2] +v_mov_b32 v148, v163 quad_perm:[1,0,3,2] +v_pk_fma_f16 v160, v146, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v150, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v144, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v148, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v146, v160 quad_perm:[2,3,0,1] +v_mov_b32 v150, v161 quad_perm:[2,3,0,1] +v_mov_b32 v144, v162 quad_perm:[2,3,0,1] +v_mov_b32 v148, v163 quad_perm:[2,3,0,1] +v_pk_fma_f16 v160, v146, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v150, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v144, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v148, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v144, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v145, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v146, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v147, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v144, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v145, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v148, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v149, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v150, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v151, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v148, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v149, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 
0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4774 +s_mov_b32 s56, 0xffffebec +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0xffffed18 +s_setprio 2 +v_pk_fma_f16 v160, v154, s40, v164 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v161, v158, s40, v165 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v162, v152, s40, v166 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v163, v156, s40, v167 op_sel:[0,0,0] op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 164, 410, 411, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 165, 414, 415, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 166, 408, 409, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 167, 412, 413, 0x0, 0x3, 0x1, 0x1 +buffer_load_d16_b16 v154, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v155, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v154, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v155, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v158, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v159, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v158, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v159, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], 
s[64:65] +s_branch 4700 +s_mov_b32 s56, 0xffffea64 +s_bitcmp1_b32 s45, 4 +s_cselect_b64 s[64:65], s[68:69], s[72:73] +s_bitcmp1_b32 s45, 2 +s_cselect_b32 s56, s56, 0xffffeb90 +s_setprio 2 +_v_pk_mul_f16__vop3p 160, 410, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 161, 414, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 162, 408, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 163, 412, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v154, v160 quad_perm:[1,0,3,2] +v_mov_b32 v158, v161 quad_perm:[1,0,3,2] +v_mov_b32 v152, v162 quad_perm:[1,0,3,2] +v_mov_b32 v156, v163 quad_perm:[1,0,3,2] +v_pk_fma_f16 v160, v154, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v158, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v152, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v156, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v154, v160 quad_perm:[2,3,0,1] +v_mov_b32 v158, v161 quad_perm:[2,3,0,1] +v_mov_b32 v152, v162 quad_perm:[2,3,0,1] +v_mov_b32 v156, v163 quad_perm:[2,3,0,1] +v_pk_fma_f16 v160, v154, v15, v160 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v161, v158, v15, v161 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v162, v152, v15, v162 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v163, v156, v15, v163 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v154, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v155, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v154, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v155, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v152, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v153, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v158, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v159, v9, s[36:39], 0 
idxen +buffer_load_d16_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v158, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v159, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v156, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v157, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 4602 +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +v_readfirstlane_b32 s41, v4 +v_mov_b32 v69, v36 +v_mov_b32 v70, v37 +v_mov_b32 v71, v38 +v_mov_b32 v72, v39 +v_mov_b32 v73, v40 +v_mov_b32 v74, v41 +v_mov_b32 v75, v42 +v_mov_b32 v76, v43 +_v_pk_add_f16__vop3p 88, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 89, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 90, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 91, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 92, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 93, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 94, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 95, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 88, 344, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 89, 345, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 90, 346, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 91, 347, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 92, 348, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 93, 349, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 94, 350, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 95, 351, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v88, v44, 0.5, v88 op_sel:[0,0,0] 
op_sel_hi:[1,0,1] +v_pk_fma_f16 v89, v45, 0.5, v89 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v90, v46, 0.5, v90 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v91, v47, 0.5, v91 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v92, v48, 0.5, v92 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v93, v49, 0.5, v93 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v94, v50, 0.5, v94 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v95, v51, 0.5, v95 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v105, v44, -1.0, v88 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v106, v45, -1.0, v89 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v107, v46, -1.0, v90 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v108, v47, -1.0, v91 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v109, v48, -1.0, v92 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v110, v49, -1.0, v93 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v111, v50, -1.0, v94 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v112, v51, -1.0, v95 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v124, v61 +v_mov_b32 v125, v62 +v_mov_b32 v126, v63 +v_mov_b32 v127, v64 +v_mov_b32 v128, v65 +v_mov_b32 v129, v66 +v_mov_b32 v130, v67 +v_mov_b32 v131, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v1, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 
+s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v77, v36 +v_mov_b32 v78, v37 +v_mov_b32 v79, v38 +v_mov_b32 v80, v39 +v_mov_b32 v81, v40 +v_mov_b32 v82, v41 +v_mov_b32 v83, v42 +v_mov_b32 v84, v43 +_v_pk_add_f16__vop3p 96, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 97, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 98, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 99, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 100, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 101, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 102, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 103, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 96, 352, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 97, 353, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 98, 354, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 99, 355, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 100, 356, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 101, 357, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 102, 358, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 103, 359, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v96, v44, 0.5, v96 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v97, v45, 0.5, v97 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v98, v46, 0.5, v98 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v99, v47, 0.5, v99 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v100, v48, 0.5, v100 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v101, v49, 0.5, v101 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v102, v50, 0.5, v102 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v103, v51, 0.5, v103 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v113, v44, -1.0, v96 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v114, v45, -1.0, v97 op_sel:[0,0,0] op_sel_hi:[1,0,1] 
+v_pk_fma_f16 v115, v46, -1.0, v98 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v116, v47, -1.0, v99 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v117, v48, -1.0, v100 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v118, v49, -1.0, v101 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v119, v50, -1.0, v102 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v120, v51, -1.0, v103 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v132, v61 +v_mov_b32 v133, v62 +v_mov_b32 v134, v63 +v_mov_b32 v135, v64 +v_mov_b32 v136, v65 +v_mov_b32 v137, v66 +v_mov_b32 v138, v67 +v_mov_b32 v139, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v2, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 +s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) 
+v_readfirstlane_b32 s41, v4 +v_mov_b32 v69, v36 +v_mov_b32 v70, v37 +v_mov_b32 v71, v38 +v_mov_b32 v72, v39 +v_mov_b32 v73, v40 +v_mov_b32 v74, v41 +v_mov_b32 v75, v42 +v_mov_b32 v76, v43 +_v_pk_add_f16__vop3p 88, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 89, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 90, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 91, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 92, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 93, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 94, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 95, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 88, 344, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 89, 345, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 90, 346, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 91, 347, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 92, 348, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 93, 349, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 94, 350, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 95, 351, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v88, v44, 0.5, v88 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v89, v45, 0.5, v89 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v90, v46, 0.5, v90 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v91, v47, 0.5, v91 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v92, v48, 0.5, v92 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v93, v49, 0.5, v93 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v94, v50, 0.5, v94 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v95, v51, 0.5, v95 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v105, v44, -1.0, v88 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v106, v45, -1.0, v89 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v107, v46, -1.0, v90 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v108, v47, -1.0, v91 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v109, v48, -1.0, v92 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v110, v49, -1.0, v93 op_sel:[0,0,0] 
op_sel_hi:[1,0,1] +v_pk_fma_f16 v111, v50, -1.0, v94 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v112, v51, -1.0, v95 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v124, v61 +v_mov_b32 v125, v62 +v_mov_b32 v126, v63 +v_mov_b32 v127, v64 +v_mov_b32 v128, v65 +v_mov_b32 v129, v66 +v_mov_b32 v130, v67 +v_mov_b32 v131, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v1, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +v_mov_b32 v77, v36 +v_mov_b32 v78, v37 +v_mov_b32 v79, v38 +v_mov_b32 v80, v39 +v_mov_b32 v81, v40 +v_mov_b32 v82, v41 +v_mov_b32 v83, v42 +v_mov_b32 v84, v43 +_v_pk_add_f16__vop3p 96, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 97, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 98, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 99, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 100, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 101, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 102, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 103, 299, 324, 0x0, 0x3, 0x0, 0x0 
+_v_pk_mul_f16__vop3p 96, 352, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 97, 353, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 98, 354, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 99, 355, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 100, 356, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 101, 357, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 102, 358, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 103, 359, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v96, v44, 0.5, v96 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v97, v45, 0.5, v97 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v98, v46, 0.5, v98 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v99, v47, 0.5, v99 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v100, v48, 0.5, v100 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v101, v49, 0.5, v101 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v102, v50, 0.5, v102 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v103, v51, 0.5, v103 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v113, v44, -1.0, v96 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v114, v45, -1.0, v97 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v115, v46, -1.0, v98 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v116, v47, -1.0, v99 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v117, v48, -1.0, v100 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v118, v49, -1.0, v101 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v119, v50, -1.0, v102 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v120, v51, -1.0, v103 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v132, v61 +v_mov_b32 v133, v62 +v_mov_b32 v134, v63 +v_mov_b32 v135, v64 +v_mov_b32 v136, v65 +v_mov_b32 v137, v66 +v_mov_b32 v138, v67 +v_mov_b32 v139, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v2, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 
+s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +v_readfirstlane_b32 s41, v4 +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 
+_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[124:131], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[132:139], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[88:95], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[96:103], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[105:112], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[113:120], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 
0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v1, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 
+_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[124:131], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[132:139], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[88:95], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[96:103], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[105:112], v[61:68], 
0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[113:120], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v2, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 +s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 
+ds_store_b128 v12, v[164:167] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s41, v4 +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[124:131], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[132:139], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 
0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[88:95], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[96:103], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[105:112], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[113:120], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v1, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_mov_b32 exec_hi, 0 
+ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 vcc, -1, 0 +s_bitcmp1_b32 s45, 3 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], 
v[124:131], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[132:139], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[88:95], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[96:103], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[105:112], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[113:120], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 
284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v2, vcc +v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 +s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[160:163] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[164:167] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], s[64:65] +v_bfe_u32 v21, v0, 6, 1 +v_and_b32 v16, 63, v0 +v_cmp_eq_u32 vcc, v21, 1 +v_cndmask_b32 v23, 0, 0x800, vcc +v_cndmask_b32 v21, 0, 0x400, vcc +v_cndmask_b32 v22, 0, 0x100, vcc +v_lshl_add_u32 v14, v16, 3, v23 +v_lshl_add_u32 v17, v16, 2, v22 +v_lshl_add_u32 v18, v16, 2, 0 +v_lshl_add_u32 v16, v16, 4, v21 +s_cmp_eq_u64 s[30:31], 0 +s_cselect_b32 s91, 0, 0x11014000 +s_and_b32 s31, s31, 0xffff +s_add_u32 s31, s31, 0x20000 +s_mov_b64 s[88:89], s[30:31] +s_mov_b32 s90, 0x80000000 +v_and_b32 v21, v0, 63 +v_lshlrev_b32 v21, 1, v21 +v_cmp_lt_u32 vcc, v21, s12 +v_add_nc_u32 v22, v21, 1 +v_cndmask_b32 v21, 
0x80000000, v21, vcc +v_cmp_lt_u32 vcc, v22, s12 +v_cndmask_b32 v22, 0x80000000, v22, vcc +buffer_load_d16_b16 v23, v21, s[88:91], 0 idxen +buffer_load_d16_hi_b16 v23, v22, s[88:91], 0 idxen +s_waitcnt vmcnt(0) +v_readlane_b32 s56, v23, 0 +v_readlane_b32 s57, v23, 1 +v_readlane_b32 s59, v23, 2 +v_readlane_b32 s64, v23, 3 +v_readlane_b32 s65, v23, 4 +v_readlane_b32 s66, v23, 5 +v_readlane_b32 s67, v23, 6 +v_readlane_b32 s68, v23, 7 +s_bfe_u32 s88, s58, 0x80000 +s_cmp_eq_u32 s88, 2 +s_cbranch_scc1 20 +s_cmp_eq_u32 s88, 0 +s_cselect_b32 s32, 1.0, s32 +v_cvt_f16_f32 v21, s32 +v_readfirstlane_b32 s32, v21 +v_cvt_f16_f32 v21, s33 +v_readfirstlane_b32 s33, v21 +_v_cmp_gt_f16__vop3_s_lit 106, 32, 0x3c00, 0x0, 0x0 +s_pack_ll_b32_b16 s32, s32, s32 +s_pack_ll_b32_b16 s33, s33, s33 +s_cmp_eq_u32 s88, 3 +s_cbranch_scc1 10 +s_cbranch_vccnz 3 +s_mov_b32 s84, 0x47c4 +s_branch 8 +s_mov_b32 s84, 0x4b5c +s_branch 5 +s_mov_b32 s84, 0x4ef4 +s_branch 2 +s_mov_b32 s84, 0x550c +s_add_u32 s86, s6, 0x3c90 +s_addc_u32 s87, s7, 0 +s_mov_b32 s82, 0xbc00c000 +s_mov_b32 s40, 0x10000 +s_mov_b32 s41, 0x30002 +s_mov_b32 s45, 0x10000 +v_readfirstlane_b32 s88, v0 +s_and_b32 null, 64, s88 +s_cmov_b32 s82, 0x3c00c000 +s_cmov_b32 s40, 0x20003 +s_cmov_b32 s41, 1 +s_cmov_b32 s45, 1 +s_and_b32 s21, s21, 0xffff +s_add_u32 s21, s21, 0x20000 +s_lshl_b32 s80, s51, 1 +s_lshl_b32 s81, s52, 1 +s_mov_b64 s[72:73], s[20:21] +s_mov_b32 s74, 0x80000000 +s_mov_b32 s75, 0 +s_sub_u32 s89, s25, 1 +s_bitcmp1_b32 s14, 1 +s_cselect_b32 s89, s89, 0 +s_cselect_b32 s88, -1, 1 +s_sub_u32 s91, s24, 1 +s_bitcmp1_b32 s14, 0 +s_cselect_b32 s91, s91, 0 +s_cselect_b32 s90, -1, 1 +v_bfe_u32 v24, v0, 6, 1 +v_bfe_u32 v25, v0, 4, 1 +v_bfe_u32 v21, v0, 5, 1 +v_lshl_add_u32 v24, v24, 2, 0 +v_lshl_add_u32 v25, v25, 3, v24 +v_bfe_u32 v23, v0, 2, 2 +v_bfe_u32 v24, v0, 3, 1 +v_xor_b32 v22, v0, v0 quad_perm:[0,0,3,1] +v_lshl_add_u32 v21, v21, 1, v25 +v_xor_b32 v23, v23, v24 +v_add_nc_u32 v24, v21, 1 +v_mad_i32_i16 v19, v23, s88, s89 
op_sel:[0,0,0,0] +v_mad_i32_i16 v25, v22, s90, s91 op_sel:[0,0,0,0] +v_mad_u32_u16 v19, v25, s48, v19 op_sel:[0,0,0,0] +v_cmp_lt_u32 vcc, v23, s25 +v_cndmask_b32 v19, 0x80000000, v19, vcc +v_cmp_lt_u32 vcc, v22, s24 +v_cndmask_b32 v19, 0x80000000, v19, vcc +v_mad_u32_u24 v20, v24, s46, v19 +v_mad_u32_u24 v19, v21, s46, v19 +v_cmp_lt_u32 vcc, v24, s12 +v_cndmask_b32 v20, 0x80000000, v20, vcc +v_cmp_lt_u32 vcc, v21, s12 +v_cndmask_b32 v19, 0x80000000, v19, vcc +s_add_u32 s89, s28, 1 +s_lshr_b32 s89, s89, 1 +s_lshl_b32 s90, s89, 1 +s_add_u32 s91, s29, 1 +s_lshr_b32 s91, s91, 1 +s_lshl1_add_u32 s91, s91, 2 +s_pack_ll_b32_b16 s22, s91, s89 +s_pack_ll_b32_b16 s34, s11, s10 +s_sub_u32 s35, s90, s26 +s_sub_u32 s88, s91, s27 +s_pack_ll_b32_b16 s35, s88, s35 +s_pack_ll_b32_b16 s37, s29, s28 +s_sub_u32 s88, s91, 1 +s_pack_ll_b32_b16 s38, s88, s90 +v_lshrrev_b32 v24, 16, s22 +v_bfi_b32 v25, 0xffff, s22, 0 +v_and_b32 v27, 1, v0 +v_bfe_u32 v33, v0, 6, 1 +v_and_b32 v22, 63, v0 +v_mad_u32_u16 v28, 0x7c, s1, 0 op_sel:[0,0,0,0] +v_mad_u32_u16 v33, 2, s5, v33 op_sel:[0,0,0,0] +v_mad_u32_u16 v26, v24, v25, 0 op_sel:[0,0,0,0] +v_cmp_eq_u32 vcc, 0, v27 +v_cndmask_b32 v34, v26, v25, vcc +v_mad_u32_u16 v23, 62, v33, v22 op_sel:[0,0,0,0] +v_cndmask_b32 v23, v28, v23, vcc +v_clz_i32_u32 v40, v34 +v_lshlrev_b32 v41, v40, v34 +v_and_b32 v39, 0xffffff00, v41 +v_cmp_eq_u32 vcc, 0x80000000, v41 +v_cvt_f32_u32 v39, v39 +v_rcp_f32 v35, v39 +v_sub_co_ci_u32 v36, vcc, 32, v40, vcc +v_cvt_f32_ubyte0 v40, v41 +v_fma_f32 v39, v39, v35, -1.0 +v_fma_f32 v39, v40, v35, v39 +v_fmaak_f32 v39, v39, v35, 0x9f000000 +v_mul_f32 v39, 0x5f800000, v39 +v_mov_b32 v40, 0 +v_cvt_floor_i32_f32 v39, -v39 +v_lshl_add_u32 v35, v35, 9, v39 +v_mad_u64_u32 v[40:41], vcc, v41, v35, v[40:41] +v_sub_co_ci_u32 v35, vcc, v35, -1, vcc +v_mov_b32 v38, v36 quad_perm:[1,1,1,1] +v_mov_b32 v36, v36 quad_perm:[0,0,0,0] +v_mov_b32 v37, v35 quad_perm:[1,1,1,1] +v_mov_b32 v35, v35 quad_perm:[0,0,0,0] +v_mul_hi_u32 v39, v23, v35 
+v_add_co_u32 v21, vcc, v39, v23 +v_add_co_ci_u32 v39, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v36 +v_cndmask_b32 v21, v21, v39, vcc +v_alignbit_b32 v21, v39, v21, v36 +v_mul_hi_u32 v39, v23, v37 +v_add_co_u32 v4, vcc, v39, v23 +v_add_co_ci_u32 v39, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v38 +v_cndmask_b32 v4, v4, v39, vcc +v_alignbit_b32 v4, v39, v4, v38 +v_mad_u32_u16 v32, v21, v25, 0 op_sel:[0,0,0,0] +v_mad_u32_u16 v31, v4, v24, 0 op_sel:[0,0,0,0] +v_sub_nc_u32 v32, v23, v32 +v_sub_nc_u32 v31, v21, v31 +v_readlane_b32 s92, v32, 1 +v_sub_nc_u32 v32, v32, v25 +v_readlane_b32 s23, v31, 1 +v_sub_nc_u32 v31, v31, v24 +v_readlane_b32 s15, v4, 1 +v_sub_nc_u32 v4, v4, s8 +s_lshl_b32 s23, s23, 16 +s_and_b32 s92, s92, 0xffff +s_add_u32 s23, s23, s92 +v_mov_b32 v32, v32 quad_perm:[0,0,2,2] +v_mov_b32 v31, v31 quad_perm:[0,0,2,2] +v_mov_b32 v4, v4 quad_perm:[0,0,2,2] +v_add_co_u32 v32, vcc, v32, v27 +v_cndmask_b32 v30, 0, v25, vcc +v_add_co_ci_u32 v31, vcc, v31, 0, vcc +v_cndmask_b32 v29, 0, v24, vcc +v_add_co_ci_u32 v4, vcc, v4, 0, vcc +v_min_u32 v27, v22, 63 +v_sub_nc_u32 v32, v32, v30 +v_sub_nc_u32 v31, v31, v29 +v_cmp_eq_u32 vcc, v22, v27 +v_lshlrev_b32 v5, 16, v31 +v_bfi_b32 v5, 0xffff, v32, v5 +v_add_nc_u32 v42, v4, s8 +v_med3_u32 v27, v22, 1, 62 +v_mul_lo_u32 v6, v42, s42 +v_mul_lo_u32 v11, v42, s50 +s_mul_i32 s36, s15, s42 +s_mul_i32 s39, s15, s50 +v_cndmask_b32 v6, 0x80000000, v6, vcc +v_cmp_eq_u32 vcc, v22, v27 +v_cndmask_b32 v11, 0x80000000, v11, vcc +v_cmp_ge_u32 s[54:55], v42, s8 +v_cndmask_b32 v6, v6, 0x80000000, s[54:55] +v_cndmask_b32 v11, v11, 0x80000000, s[54:55] +s_mov_b32 s49, 3 +s_lshl_b32 s53, s49, 9 +v_add_nc_u32 v15, s53, v14 +s_bfe_u32 s10, s58, 0x80008 +s_bfe_u32 s11, s58, 0x80010 +s_cmp_eq_u32 s11, 0 +s_cmov_b32 s26, 0 +s_cbranch_scc1 108 +s_add_u32 s11, s11, 0xffffff00 +s_add_u32 s60, s60, 0 +s_addc_u32 s61, s61, 0 +s_lshr_b32 s91, s13, 2 +s_or_b32 s91, s91, 0x21010000 +v_cmp_eq_u32 vcc, v0, 0x100 +s_cmp_eq_u64 vcc, 0 +s_cselect_b32 s91, 0, s91 
+s_cselect_b32 s90, 0, 0x1010101 +s_sub_u32 s10, 0, s10 +s_mov_b64 s[88:89], s[60:61] +s_and_b32 s89, s89, 0xffff +s_or_b32 s89, s89, 0x40000 +s_and_b32 s29, s22, 0xffff +s_lshr_b32 s28, s22, 16 +s_lshr_b32 s29, s29, 1 +s_mul_i32 s27, s29, s28 +s_mul_i32 s27, s27, s8 +s_add_u32 s27, s27, 61 +v_writelane_b32 v22, 62, 0 +v_writelane_b32 v22, s1, 1 +v_writelane_b32 v22, 10, 2 +v_clz_i32_u32 v26, v22 +v_lshlrev_b32 v27, v26, v22 +v_and_b32 v28, 0xffffff00, v27 +v_cmp_eq_u32 vcc, 0x80000000, v27 +v_cvt_f32_u32 v28, v28 +v_rcp_f32 v24, v28 +v_sub_co_ci_u32 v25, vcc, 32, v26, vcc +v_cvt_f32_ubyte0 v26, v27 +v_fma_f32 v28, v28, v24, -1.0 +v_fma_f32 v28, v26, v24, v28 +v_fmaak_f32 v28, v28, v24, 0x9f000000 +v_mul_f32 v28, 0x5f800000, v28 +v_mov_b32 v26, 0 +v_cvt_floor_i32_f32 v28, -v28 +v_lshl_add_u32 v24, v24, 9, v28 +v_mad_u64_u32 v[26:27], vcc, v27, v24, v[26:27] +v_sub_co_ci_u32 v24, vcc, v24, -1, vcc +v_mul_hi_u32 v26, s27, v24 +v_add_co_u32 v23, vcc, v26, s27 +v_add_co_ci_u32 v26, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v25 +v_cndmask_b32 v23, v23, v26, vcc +v_alignbit_b32 v23, v26, v23, v25 +v_mov_b32 v23, v23 quad_perm:[0,0,0,0] +v_mul_hi_u32 v26, v23, v24 +v_add_co_u32 v22, vcc, v26, v23 +v_add_co_ci_u32 v26, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v25 +v_cndmask_b32 v22, v22, v26, vcc +v_alignbit_b32 v22, v26, v22, v25 +v_mov_b32 v22, v22 quad_perm:[1,1,1,1] +v_add_nc_u32 v23, v22, 9 +v_mul_hi_u32 v26, v23, v24 +v_add_co_u32 v23, vcc, v26, v23 +v_add_co_ci_u32 v26, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v25 +v_cndmask_b32 v23, v23, v26, vcc +v_alignbit_b32 v23, v26, v23, v25 +v_readlane_b32 s28, v22, 1 +v_readlane_b32 s29, v23, 2 +s_add_u32 s27, s9, 15 +s_lshr_b32 s27, s27, 4 +s_cmp_eq_u32 s27, 1 +s_cmov_b32 s29, 1 +s_add_u32 s26, s28, s29 +s_mul_i32 s26, s27, s26 +s_add_u32 s26, 6, s26 +s_sub_u32 s26, s26, 1 +s_mov_b32 s92, 0 +s_mov_b32 s93, 0 +s_mov_b32 s94, 0 +s_mov_b32 s95, 0 +s_mov_b32 s96, 0 +s_mov_b32 s97, 0 +s_mov_b32 s28, 0 +s_mov_b32 s27, 8 +s_cmp_gt_u32 s28, 
0 +s_cbranch_scc1 4 +v_mov_b32 v58, v4 +v_mov_b32 v63, v5 +v_mov_b32 v225, v6 +v_mov_b32 v226, v11 +v_mov_b32 v4, v58 +v_mov_b32 v5, v63 +v_mov_b32 v6, v225 +v_mov_b32 v11, v226 +s_add_u32 s28, s28, 16 +s_cmp_ge_u32 s28, s9 +s_cmov_b32 s28, 0 +s_cselect_b32 s29, 6, 2 +s_cselect_b32 s98, 9, 0 +s_pack_lh_b32_b16 s29, s29, s27 +s_pack_ll_b32_b16 s98, s98, s28 +v_mov_b32 v224, s29 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v44, v49, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v45, v50, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v46, v51, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v47, v52, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v7, v19 +v_mov_b32 v8, v20 +v_mov_b32 v9, 0x80000000 +v_mov_b32 v10, 0x80000000 +v_mov_b32 v12, 0x80000000 +v_mov_b32 v13, 0x80000000 +s_setprio 0 +ds_load_b128 v[34:37], v3 +ds_store_b128 v16, v[7:10] offset:37120 +ds_load_b128 v[39:42], v3 offset:576 +ds_store_b32 v17, v224 offset:39168 +s_setprio 2 +s_sub_u32 s26, s26, 1 +s_cselect_b32 s91, 0x21010000, s91 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x3c90 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_add_nc_u32 v15, s53, v14 +v_mov_b32 v245, v243 +v_mov_b32 v246, v244 +v_pk_fma_f16 v227, v34, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v228, v35, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v229, v36, s82, v26 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v230, v37, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 34, 285, 290, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 35, 286, 291, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 36, 287, 292, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 288, 293, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 231, 290, 295, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 291, 296, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 292, 297, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 293, 
298, 0x0, 0x3, 0x0, 0x0 +s_setprio 0 +ds_load_b64 v[243:244], v15 offset:39680 +ds_load_b128 v[54:57], v3 offset:2304 +ds_load_b128 v[59:62], v3 offset:2880 +s_setprio 2 +s_mov_b32 s92, s93 +s_mov_b32 s93, s94 +s_mov_b32 s94, s95 +s_mov_b32 s95, s96 +s_mov_b32 s96, s97 +s_mov_b32 s97, s27 +s_bitcmp1_b32 s92, 0 +s_cbranch_scc1 2823 +s_sub_u32 s49, s49, 1 +s_cselect_b32 s49, 3, s49 +s_lshl_b32 s53, s49, 9 +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x3c94 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s75, 0x11014000, 0 +s_sub_u32 s69, s12, 1 +s_cselect_b32 s75, 0, s75 +s_mov_b64 s[72:73], s[20:21] +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v235, v54, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v236, v55, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v237, v56, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v238, v57, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 54, 305, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 55, 306, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 56, 307, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 57, 308, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 239, 310, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 311, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 312, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 313, 318, 0x0, 0x3, 0x0, 0x0 +s_add_u32 s11, s11, 0x100 +s_cbranch_scc0 7 +s_bitset0_b32 s91, 23 +s_lshl_b64 exec, 1, s90 +buffer_store_b8 v0, off, s[88:91], s4 +s_mov_b64 exec, -1 +s_mul_i32 s11, s11, 0xffffff01 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 3 +s_bitset1_b32 s91, 23 +buffer_load_b32 v21, off, s[88:91], null glc +s_setprio 0 +s_nop 1 +ds_load_b128 v[24:27], v3 offset:9280 +ds_store_b64 v15, v[12:13] offset:39680 +ds_load_b128 v[29:32], v3 offset:9856 +ds_load_b32 v224, v18 offset:39168 +s_setprio 2 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x3c90 +s_add_u32 s86, s6, s86 
+s_addc_u32 s87, s7, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s27, v224 +v_pk_fma_f16 v24, v29, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v25, v30, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v26, v31, s82, v26 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v27, v32, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +s_setprio 0 +ds_load_b128 v[44:47], v3 offset:11584 +ds_load_b128 v[49:52], v3 offset:12160 +s_setprio 2 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 25 +s_pack_ll_b32_b16 s10, s10, s10 +s_bfm_b64 exec, s91, 0 +v_cmp_ne_u32 vcc, v21, s90 +s_cbranch_vccz 12 +buffer_load_b32 v21, off, s[88:91], null glc +s_cmp_eq_u32 s10, 0 +s_cselect_b32 vcc_lo, 0, 0x10000 +s_add_u32 s10, s10, vcc_lo +s_cbranch_scc1 2 +s_waitcnt vmcnt(0) +s_branch 65524 +s_and_b32 s91, 0xffff0000, s91 +s_mov_b32 s10, 0 +s_mov_b64 exec, -1 +s_mul_i32 s90, s90, 3 +s_and_b32 s90, s90, 0x3f3f3f3f +s_add_u32 s88, s88, 0x100 +s_and_b32 s88, s88, 0xfffff7ff +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x3df4 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_cmp_le_u32 s9, 16 +s_cselect_b32 s99, -1, 9 +s_sub_u32 s99, s99, 1 +s_cselect_b32 s29, s98, s29 +s_bitset0_b32 s29, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v44, v49, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v45, v50, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v46, v51, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v47, v52, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v224, s29 +v_add_co_u32 v33, vcc, v5, s23 +v_pk_mad_u16 v23, v5, 0x20001, s35 +v_pk_mad_u16 v28, v5, 0x20001, s38 +_v_pk_min_u16__vop3p 22, 289, 261, 0x0, 0x3, 0x0, 0x0 +v_cndmask_b32 v43, 0, s42, vcc +v_cndmask_b32 v247, 0, s50, vcc +v_mad_u32_u16 v7, v23, 1, v6 op_sel:[0,0,0,0] +v_mad_u32_u16 v12, v28, 1, v11 op_sel:[0,0,0,0] +v_add3_u32 v6, v6, s36, v43 +v_add3_u32 v11, v11, s39, v247 +_v_pk_sub_u16__vop3p 22, 261, 278, 
0x0, 0x3, 0x0, 0x0 +v_add_co_ci_u32 v4, s[54:55], v4, s15, vcc +v_cndmask_b32 v6, v6, 0x80000000, s[54:55] +v_cndmask_b32 v11, v11, 0x80000000, s[54:55] +v_cmp_lt_u16 vcc, v23, s34 +v_cndmask_b32 v7, 0x80000000, v7, vcc +v_cmp_lt_u16 vcc, v28, s37 +v_cndmask_b32 v12, 0x80000000, v12, vcc +_v_pk_ashrrev_i16__vop3p 22, 143, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_u16__vop3p 53, 279, 41, 0x1, 0x3, 0x0, 0x0 +_v_pk_add_u16__vop3p 48, 279, 40, 0x1, 0x3, 0x0, 0x0 +v_mad_u32_u16 v10, v53, s44, v7 op_sel:[1,0,0,0] +v_mad_u32_u16 v8, v48, s44, v7 op_sel:[1,0,0,0] +_v_pk_add_u16__vop3p 38, 284, 45, 0x1, 0x3, 0x0, 0x0 +_v_cmp_lt_u16__vop3 106, 53, 34, 0x3 +v_cndmask_b32 v10, 0x80000000, v10, vcc +_v_cmp_lt_u16__vop3 106, 48, 34, 0x3 +v_cndmask_b32 v8, 0x80000000, v8, vcc +v_mad_u32_u16 v13, v38, s52, v12 op_sel:[1,0,0,0] +v_mad_u32_u16 v9, v53, s44, v7 op_sel:[0,0,0,0] +v_mad_u32_u16 v7, v48, s44, v7 op_sel:[0,0,0,0] +_v_cmp_lt_u16__vop3 106, 38, 37, 0x3 +v_cndmask_b32 v13, 0x80000000, v13, vcc +_v_cmp_lt_u16__vop3 106, 53, 34, 0x2 +v_cndmask_b32 v9, 0x80000000, v9, vcc +_v_cmp_lt_u16__vop3 106, 48, 34, 0x2 +v_cndmask_b32 v7, 0x80000000, v7, vcc +v_mad_u32_u16 v12, v38, s52, v12 op_sel:[0,0,0,0] +v_pk_mad_u16 v5, v22, s22, v33 +_v_cmp_lt_u16__vop3 106, 38, 37, 0x2 +v_cndmask_b32 v12, 0x80000000, v12, vcc +v_add_co_u32 v22, vcc, v4, s8 +v_cndmask_b32 v224, s98, v224, vcc +s_setprio 0 +ds_load_b128 v[34:37], v3 +ds_store_b128 v16, v[7:10] offset:37120 +ds_load_b128 v[39:42], v3 offset:576 +ds_store_b32 v17, v224 offset:39168 +s_setprio 2 +s_sub_u32 s26, s26, 1 +s_cselect_b32 s91, 0x21010000, s91 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x3c90 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_add_nc_u32 v15, s53, v14 +v_mov_b32 v245, v243 +v_mov_b32 v246, v244 +v_pk_fma_f16 v227, v34, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v228, v35, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v229, v36, s82, v26 
op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v230, v37, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 34, 285, 290, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 35, 286, 291, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 36, 287, 292, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 288, 293, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 231, 290, 295, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 291, 296, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 292, 297, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 293, 298, 0x0, 0x3, 0x0, 0x0 +s_setprio 0 +ds_load_b64 v[243:244], v15 offset:39680 +ds_load_b128 v[54:57], v3 offset:2304 +ds_load_b128 v[59:62], v3 offset:2880 +s_setprio 2 +s_mov_b32 s92, s93 +s_mov_b32 s93, s94 +s_mov_b32 s94, s95 +s_mov_b32 s95, s96 +s_mov_b32 s96, s97 +s_mov_b32 s97, s27 +s_bitcmp1_b32 s92, 0 +s_cbranch_scc1 2531 +s_sub_u32 s49, s49, 1 +s_cselect_b32 s49, 3, s49 +s_lshl_b32 s53, s49, 9 +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x3c94 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s75, 0x11014000, 0 +s_sub_u32 s69, s12, 1 +s_cselect_b32 s75, 0, s75 +s_mov_b64 s[72:73], s[20:21] +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v235, v54, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v236, v55, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v237, v56, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v238, v57, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 54, 305, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 55, 306, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 56, 307, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 57, 308, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 239, 310, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 311, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 312, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 313, 318, 0x0, 0x3, 0x0, 0x0 +s_add_u32 s11, s11, 0x100 +s_cbranch_scc0 7 
+s_bitset0_b32 s91, 23 +s_lshl_b64 exec, 1, s90 +buffer_store_b8 v0, off, s[88:91], s4 +s_mov_b64 exec, -1 +s_mul_i32 s11, s11, 0xffffff01 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 3 +s_bitset1_b32 s91, 23 +buffer_load_b32 v21, off, s[88:91], null glc +s_setprio 0 +s_nop 1 +ds_load_b128 v[24:27], v3 offset:9280 +ds_store_b64 v15, v[12:13] offset:39680 +ds_load_b128 v[29:32], v3 offset:9856 +ds_load_b32 v224, v18 offset:39168 +s_setprio 2 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x3c90 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s27, v224 +v_pk_fma_f16 v24, v29, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v25, v30, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v26, v31, s82, v26 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v27, v32, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +s_setprio 0 +ds_load_b128 v[44:47], v3 offset:11584 +ds_load_b128 v[49:52], v3 offset:12160 +s_setprio 2 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 25 +s_pack_ll_b32_b16 s10, s10, s10 +s_bfm_b64 exec, s91, 0 +v_cmp_ne_u32 vcc, v21, s90 +s_cbranch_vccz 12 +buffer_load_b32 v21, off, s[88:91], null glc +s_cmp_eq_u32 s10, 0 +s_cselect_b32 vcc_lo, 0, 0x10000 +s_add_u32 s10, s10, vcc_lo +s_cbranch_scc1 2 +s_waitcnt vmcnt(0) +s_branch 65524 +s_and_b32 s91, 0xffff0000, s91 +s_mov_b32 s10, 0 +s_mov_b64 exec, -1 +s_mul_i32 s90, s90, 3 +s_and_b32 s90, s90, 0x3f3f3f3f +s_add_u32 s88, s88, 0x100 +s_and_b32 s88, s88, 0xfffff7ff +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x3df4 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_bitcmp1_b32 s27, 1 +s_cbranch_scc1 65244 +s_branch 65016 +s_setpc_b64 s[86:87] +s_bitcmp1_b32 s92, 3 +s_cbranch_scc0 80 +v_mov_b32 v64, 0 +v_mov_b32 v68, 0 +v_mov_b32 v65, 0 +v_mov_b32 v69, 0 +v_mov_b32 v66, 0 +v_mov_b32 v70, 0 +v_mov_b32 v67, 0 +v_mov_b32 v71, 0 +v_mov_b32 v80, 0 +v_mov_b32 v84, 0 +v_mov_b32 v81, 0 +v_mov_b32 v85, 0 +v_mov_b32 v82, 0 
+v_mov_b32 v86, 0 +v_mov_b32 v83, 0 +v_mov_b32 v87, 0 +v_mov_b32 v96, 0 +v_mov_b32 v100, 0 +v_mov_b32 v97, 0 +v_mov_b32 v101, 0 +v_mov_b32 v98, 0 +v_mov_b32 v102, 0 +v_mov_b32 v99, 0 +v_mov_b32 v103, 0 +v_mov_b32 v112, 0 +v_mov_b32 v116, 0 +v_mov_b32 v113, 0 +v_mov_b32 v117, 0 +v_mov_b32 v114, 0 +v_mov_b32 v118, 0 +v_mov_b32 v115, 0 +v_mov_b32 v119, 0 +v_mov_b32 v128, 0 +v_mov_b32 v132, 0 +v_mov_b32 v129, 0 +v_mov_b32 v133, 0 +v_mov_b32 v130, 0 +v_mov_b32 v134, 0 +v_mov_b32 v131, 0 +v_mov_b32 v135, 0 +v_mov_b32 v144, 0 +v_mov_b32 v148, 0 +v_mov_b32 v145, 0 +v_mov_b32 v149, 0 +v_mov_b32 v146, 0 +v_mov_b32 v150, 0 +v_mov_b32 v147, 0 +v_mov_b32 v151, 0 +v_mov_b32 v160, 0 +v_mov_b32 v164, 0 +v_mov_b32 v161, 0 +v_mov_b32 v165, 0 +v_mov_b32 v162, 0 +v_mov_b32 v166, 0 +v_mov_b32 v163, 0 +v_mov_b32 v167, 0 +v_mov_b32 v176, 0 +v_mov_b32 v180, 0 +v_mov_b32 v177, 0 +v_mov_b32 v181, 0 +v_mov_b32 v178, 0 +v_mov_b32 v182, 0 +v_mov_b32 v179, 0 +v_mov_b32 v183, 0 +v_mov_b32 v192, 0 +v_mov_b32 v196, 0 +v_mov_b32 v193, 0 +v_mov_b32 v197, 0 +v_mov_b32 v194, 0 +v_mov_b32 v198, 0 +v_mov_b32 v195, 0 +v_mov_b32 v199, 0 +v_mov_b32 v208, 0 +v_mov_b32 v212, 0 +v_mov_b32 v209, 0 +v_mov_b32 v213, 0 +v_mov_b32 v210, 0 +v_mov_b32 v214, 0 +v_mov_b32 v211, 0 +v_mov_b32 v215, 0 +s_mov_b32 s85, 0x3f54 +s_cmp_le_u32 s9, 16 +s_cmov_b32 s85, 0x3c90 +s_setpc_b64 s[86:87] +s_bitcmp1_b32 s92, 3 +s_cbranch_scc0 80 +v_mov_b32 v72, 0 +v_mov_b32 v76, 0 +v_mov_b32 v73, 0 +v_mov_b32 v77, 0 +v_mov_b32 v74, 0 +v_mov_b32 v78, 0 +v_mov_b32 v75, 0 +v_mov_b32 v79, 0 +v_mov_b32 v88, 0 +v_mov_b32 v92, 0 +v_mov_b32 v89, 0 +v_mov_b32 v93, 0 +v_mov_b32 v90, 0 +v_mov_b32 v94, 0 +v_mov_b32 v91, 0 +v_mov_b32 v95, 0 +v_mov_b32 v104, 0 +v_mov_b32 v108, 0 +v_mov_b32 v105, 0 +v_mov_b32 v109, 0 +v_mov_b32 v106, 0 +v_mov_b32 v110, 0 +v_mov_b32 v107, 0 +v_mov_b32 v111, 0 +v_mov_b32 v120, 0 +v_mov_b32 v124, 0 +v_mov_b32 v121, 0 +v_mov_b32 v125, 0 +v_mov_b32 v122, 0 +v_mov_b32 v126, 0 +v_mov_b32 v123, 0 +v_mov_b32 v127, 0 +v_mov_b32 
v136, 0 +v_mov_b32 v140, 0 +v_mov_b32 v137, 0 +v_mov_b32 v141, 0 +v_mov_b32 v138, 0 +v_mov_b32 v142, 0 +v_mov_b32 v139, 0 +v_mov_b32 v143, 0 +v_mov_b32 v152, 0 +v_mov_b32 v156, 0 +v_mov_b32 v153, 0 +v_mov_b32 v157, 0 +v_mov_b32 v154, 0 +v_mov_b32 v158, 0 +v_mov_b32 v155, 0 +v_mov_b32 v159, 0 +v_mov_b32 v168, 0 +v_mov_b32 v172, 0 +v_mov_b32 v169, 0 +v_mov_b32 v173, 0 +v_mov_b32 v170, 0 +v_mov_b32 v174, 0 +v_mov_b32 v171, 0 +v_mov_b32 v175, 0 +v_mov_b32 v184, 0 +v_mov_b32 v188, 0 +v_mov_b32 v185, 0 +v_mov_b32 v189, 0 +v_mov_b32 v186, 0 +v_mov_b32 v190, 0 +v_mov_b32 v187, 0 +v_mov_b32 v191, 0 +v_mov_b32 v200, 0 +v_mov_b32 v204, 0 +v_mov_b32 v201, 0 +v_mov_b32 v205, 0 +v_mov_b32 v202, 0 +v_mov_b32 v206, 0 +v_mov_b32 v203, 0 +v_mov_b32 v207, 0 +v_mov_b32 v216, 0 +v_mov_b32 v220, 0 +v_mov_b32 v217, 0 +v_mov_b32 v221, 0 +v_mov_b32 v218, 0 +v_mov_b32 v222, 0 +v_mov_b32 v219, 0 +v_mov_b32 v223, 0 +s_mov_b32 s85, 0x3f54 +s_cmp_le_u32 s9, 16 +s_cmov_b32 s85, 0x3c90 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 325, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 326, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 327, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v64, v227 +v_mov_b32 v65, v228 +v_mov_b32 v66, v229 +v_mov_b32 v67, v230 +v_mov_b32 v68, v231 +v_mov_b32 v69, v232 +v_mov_b32 v70, v233 +v_mov_b32 v71, v234 +s_mov_b32 s85, 0x3fc0 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 328, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 329, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 330, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 331, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 332, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 333, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 241, 497, 334, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 335, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v72, v235 +v_mov_b32 v73, v236 +v_mov_b32 v74, v237 +v_mov_b32 v75, v238 +v_mov_b32 v76, v239 +v_mov_b32 v77, v240 +v_mov_b32 v78, v241 +v_mov_b32 v79, v242 +s_mov_b32 s85, 0x402c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 336, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 337, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 338, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 339, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 340, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 341, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 342, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 343, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v80, v227 +v_mov_b32 v81, v228 +v_mov_b32 v82, v229 +v_mov_b32 v83, v230 +v_mov_b32 v84, v231 +v_mov_b32 v85, v232 +v_mov_b32 v86, v233 +v_mov_b32 v87, v234 +s_mov_b32 s85, 0x4098 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 344, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 345, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 346, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 347, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 348, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 349, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 350, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 351, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v88, v235 +v_mov_b32 v89, v236 +v_mov_b32 v90, v237 +v_mov_b32 v91, v238 +v_mov_b32 v92, v239 +v_mov_b32 v93, v240 +v_mov_b32 v94, v241 +v_mov_b32 v95, v242 +s_mov_b32 s85, 0x4104 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 352, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 353, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 354, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 355, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 356, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 357, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 233, 489, 358, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 359, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v96, v227 +v_mov_b32 v97, v228 +v_mov_b32 v98, v229 +v_mov_b32 v99, v230 +v_mov_b32 v100, v231 +v_mov_b32 v101, v232 +v_mov_b32 v102, v233 +v_mov_b32 v103, v234 +s_mov_b32 s85, 0x4170 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 360, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 361, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 362, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 363, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 364, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 365, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 366, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 367, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v104, v235 +v_mov_b32 v105, v236 +v_mov_b32 v106, v237 +v_mov_b32 v107, v238 +v_mov_b32 v108, v239 +v_mov_b32 v109, v240 +v_mov_b32 v110, v241 +v_mov_b32 v111, v242 +s_mov_b32 s85, 0x41dc +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 368, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 369, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 370, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 371, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 372, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 373, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 374, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 375, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v112, v227 +v_mov_b32 v113, v228 +v_mov_b32 v114, v229 +v_mov_b32 v115, v230 +v_mov_b32 v116, v231 +v_mov_b32 v117, v232 +v_mov_b32 v118, v233 +v_mov_b32 v119, v234 +s_mov_b32 s85, 0x4248 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 376, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 377, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 378, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 379, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 380, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 381, 0x0, 0x3, 0x0, 
0x0 +_v_pk_add_f16__vop3p 241, 497, 382, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 383, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v120, v235 +v_mov_b32 v121, v236 +v_mov_b32 v122, v237 +v_mov_b32 v123, v238 +v_mov_b32 v124, v239 +v_mov_b32 v125, v240 +v_mov_b32 v126, v241 +v_mov_b32 v127, v242 +s_mov_b32 s85, 0x42b4 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 384, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 385, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 386, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 387, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 388, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 389, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 390, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 391, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v128, v227 +v_mov_b32 v129, v228 +v_mov_b32 v130, v229 +v_mov_b32 v131, v230 +v_mov_b32 v132, v231 +v_mov_b32 v133, v232 +v_mov_b32 v134, v233 +v_mov_b32 v135, v234 +s_mov_b32 s85, 0x4320 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 392, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 393, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 394, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 395, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 396, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 397, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 398, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 399, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v136, v235 +v_mov_b32 v137, v236 +v_mov_b32 v138, v237 +v_mov_b32 v139, v238 +v_mov_b32 v140, v239 +v_mov_b32 v141, v240 +v_mov_b32 v142, v241 +v_mov_b32 v143, v242 +s_mov_b32 s85, 0x438c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 400, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 401, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 402, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 403, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 404, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 405, 0x0, 
0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 406, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 407, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v144, v227 +v_mov_b32 v145, v228 +v_mov_b32 v146, v229 +v_mov_b32 v147, v230 +v_mov_b32 v148, v231 +v_mov_b32 v149, v232 +v_mov_b32 v150, v233 +v_mov_b32 v151, v234 +s_mov_b32 s85, 0x43f8 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 408, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 409, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 410, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 411, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 412, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 413, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 414, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 415, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v152, v235 +v_mov_b32 v153, v236 +v_mov_b32 v154, v237 +v_mov_b32 v155, v238 +v_mov_b32 v156, v239 +v_mov_b32 v157, v240 +v_mov_b32 v158, v241 +v_mov_b32 v159, v242 +s_mov_b32 s85, 0x4464 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 416, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 417, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 418, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 419, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 420, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 421, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 422, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 423, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v160, v227 +v_mov_b32 v161, v228 +v_mov_b32 v162, v229 +v_mov_b32 v163, v230 +v_mov_b32 v164, v231 +v_mov_b32 v165, v232 +v_mov_b32 v166, v233 +v_mov_b32 v167, v234 +s_mov_b32 s85, 0x44d0 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 424, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 425, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 426, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 427, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 428, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 
429, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 430, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 431, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v168, v235 +v_mov_b32 v169, v236 +v_mov_b32 v170, v237 +v_mov_b32 v171, v238 +v_mov_b32 v172, v239 +v_mov_b32 v173, v240 +v_mov_b32 v174, v241 +v_mov_b32 v175, v242 +s_mov_b32 s85, 0x453c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 432, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 433, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 434, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 435, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 436, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 437, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 438, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 439, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v176, v227 +v_mov_b32 v177, v228 +v_mov_b32 v178, v229 +v_mov_b32 v179, v230 +v_mov_b32 v180, v231 +v_mov_b32 v181, v232 +v_mov_b32 v182, v233 +v_mov_b32 v183, v234 +s_mov_b32 s85, 0x45a8 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 440, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 441, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 442, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 443, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 444, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 445, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 446, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 447, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v184, v235 +v_mov_b32 v185, v236 +v_mov_b32 v186, v237 +v_mov_b32 v187, v238 +v_mov_b32 v188, v239 +v_mov_b32 v189, v240 +v_mov_b32 v190, v241 +v_mov_b32 v191, v242 +s_mov_b32 s85, 0x4614 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 448, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 449, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 450, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 451, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 452, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 
232, 488, 453, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 454, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 455, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v192, v227 +v_mov_b32 v193, v228 +v_mov_b32 v194, v229 +v_mov_b32 v195, v230 +v_mov_b32 v196, v231 +v_mov_b32 v197, v232 +v_mov_b32 v198, v233 +v_mov_b32 v199, v234 +s_mov_b32 s85, 0x4680 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 456, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 457, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 458, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 459, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 460, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 461, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 462, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 463, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v200, v235 +v_mov_b32 v201, v236 +v_mov_b32 v202, v237 +v_mov_b32 v203, v238 +v_mov_b32 v204, v239 +v_mov_b32 v205, v240 +v_mov_b32 v206, v241 +v_mov_b32 v207, v242 +s_mov_b32 s85, 0x46ec +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 464, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 465, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 466, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 467, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 468, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 469, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 470, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 471, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v208, v227 +v_mov_b32 v209, v228 +v_mov_b32 v210, v229 +v_mov_b32 v211, v230 +v_mov_b32 v212, v231 +v_mov_b32 v213, v232 +v_mov_b32 v214, v233 +v_mov_b32 v215, v234 +s_mov_b32 s85, 0x4758 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 472, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 473, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 474, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 475, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 476, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 240, 496, 477, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 478, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 479, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v216, v235 +v_mov_b32 v217, v236 +v_mov_b32 v218, v237 +v_mov_b32 v219, v238 +v_mov_b32 v220, v239 +v_mov_b32 v221, v240 +v_mov_b32 v222, v241 +v_mov_b32 v223, v242 +s_mov_b32 s85, 0x3f54 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 483, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 484, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 485, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 486, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 227, 483, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 228, 484, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 229, 485, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 230, 486, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 487, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 488, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 489, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 490, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 231, 487, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 232, 488, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 233, 489, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 234, 490, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v227, v245, s[72:75], 0 idxen +buffer_store_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v227, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v231, v246, 
s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v228, v245, s[72:75], 0 idxen +buffer_store_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v228, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v229, v245, s[72:75], 0 idxen +buffer_store_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v229, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v230, v245, s[72:75], 0 idxen +buffer_store_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v230, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x4990 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 491, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 492, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 493, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 
48, 32, 494, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 235, 491, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 236, 492, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 237, 493, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 238, 494, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 495, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 496, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 497, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 498, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 239, 495, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 240, 496, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 241, 497, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 242, 498, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v235, v245, s[72:75], 0 idxen +buffer_store_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v235, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v236, v245, s[72:75], 0 idxen +buffer_store_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v236, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v237, v245, s[72:75], 0 idxen +buffer_store_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v237, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v238, v245, s[72:75], 0 idxen +buffer_store_b16 v242, v246, s[72:75], 0 
idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v238, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x47c4 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 483, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 484, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 485, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 486, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 227, 483, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 228, 484, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 229, 485, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 230, 486, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 487, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 488, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 489, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 490, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 231, 487, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 232, 488, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 233, 489, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 234, 490, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v227, v245, s[72:75], 0 idxen +buffer_store_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v227, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, 
s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v228, v245, s[72:75], 0 idxen +buffer_store_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v228, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v229, v245, s[72:75], 0 idxen +buffer_store_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v229, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v230, v245, s[72:75], 0 idxen +buffer_store_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v230, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x4d28 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 491, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 492, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 493, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 494, 0x0, 0x3, 0x0, 0x0 
+_v_pk_min_f16__vop3p 235, 491, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 236, 492, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 237, 493, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 238, 494, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 495, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 496, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 497, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 498, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 239, 495, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 240, 496, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 241, 497, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 242, 498, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v235, v245, s[72:75], 0 idxen +buffer_store_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v235, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v236, v245, s[72:75], 0 idxen +buffer_store_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v236, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v237, v245, s[72:75], 0 idxen +buffer_store_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v237, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v238, v245, s[72:75], 0 idxen +buffer_store_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 
+s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v238, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x4b5c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 227, 0xbdc5bdc5, 483, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 228, 0xbdc5bdc5, 484, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 229, 0xbdc5bdc5, 485, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 230, 0xbdc5bdc5, 486, 0x0, 0x3 +v_exp_f16 v227, v227 +v_exp_f16 v228, v228 +v_exp_f16 v229, v229 +v_exp_f16 v230, v230 +_v_exp_f16__vop3 227, 227, 0x9 +_v_exp_f16__vop3 228, 228, 0x9 +_v_exp_f16__vop3 229, 229, 0x9 +_v_exp_f16__vop3 230, 230, 0x9 +_v_pk_add_f16__vop3p_lit 227, 0x3c003c00, 483, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 228, 0x3c003c00, 484, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 229, 0x3c003c00, 485, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 230, 0x3c003c00, 486, 0x0, 0x3 +v_rcp_f16 v227, v227 +v_rcp_f16 v228, v228 +v_rcp_f16 v229, v229 +v_rcp_f16 v230, v230 +_v_rcp_f16__vop3 227, 227, 0x9 +_v_rcp_f16__vop3 228, 228, 0x9 +_v_rcp_f16__vop3 229, 229, 0x9 +_v_rcp_f16__vop3 230, 230, 0x9 +_v_pk_mul_f16__vop3p_lit 231, 0xbdc5bdc5, 487, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 232, 0xbdc5bdc5, 488, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 233, 0xbdc5bdc5, 489, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 234, 0xbdc5bdc5, 490, 0x0, 0x3 +v_exp_f16 v231, v231 +v_exp_f16 v232, v232 +v_exp_f16 v233, v233 +v_exp_f16 v234, v234 +_v_exp_f16__vop3 231, 
231, 0x9 +_v_exp_f16__vop3 232, 232, 0x9 +_v_exp_f16__vop3 233, 233, 0x9 +_v_exp_f16__vop3 234, 234, 0x9 +_v_pk_add_f16__vop3p_lit 231, 0x3c003c00, 487, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 232, 0x3c003c00, 488, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 233, 0x3c003c00, 489, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 234, 0x3c003c00, 490, 0x0, 0x3 +v_rcp_f16 v231, v231 +v_rcp_f16 v232, v232 +v_rcp_f16 v233, v233 +v_rcp_f16 v234, v234 +_v_rcp_f16__vop3 231, 231, 0x9 +_v_rcp_f16__vop3 232, 232, 0x9 +_v_rcp_f16__vop3 233, 233, 0x9 +_v_rcp_f16__vop3 234, 234, 0x9 +buffer_store_b16 v227, v245, s[72:75], 0 idxen +buffer_store_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v227, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v228, v245, s[72:75], 0 idxen +buffer_store_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v228, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v229, v245, s[72:75], 0 idxen +buffer_store_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v229, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v230, v245, s[72:75], 0 idxen +buffer_store_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v230, v245, 
s[72:75], 0 idxen +buffer_store_d16_hi_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x5200 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 235, 0xbdc5bdc5, 491, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 236, 0xbdc5bdc5, 492, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 237, 0xbdc5bdc5, 493, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 238, 0xbdc5bdc5, 494, 0x0, 0x3 +v_exp_f16 v235, v235 +v_exp_f16 v236, v236 +v_exp_f16 v237, v237 +v_exp_f16 v238, v238 +_v_exp_f16__vop3 235, 235, 0x9 +_v_exp_f16__vop3 236, 236, 0x9 +_v_exp_f16__vop3 237, 237, 0x9 +_v_exp_f16__vop3 238, 238, 0x9 +_v_pk_add_f16__vop3p_lit 235, 0x3c003c00, 491, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 236, 0x3c003c00, 492, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 237, 0x3c003c00, 493, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 238, 0x3c003c00, 494, 0x0, 0x3 +v_rcp_f16 v235, v235 +v_rcp_f16 v236, v236 +v_rcp_f16 v237, v237 +v_rcp_f16 v238, v238 +_v_rcp_f16__vop3 235, 235, 0x9 +_v_rcp_f16__vop3 236, 236, 0x9 +_v_rcp_f16__vop3 237, 237, 0x9 +_v_rcp_f16__vop3 238, 238, 0x9 +_v_pk_mul_f16__vop3p_lit 239, 0xbdc5bdc5, 495, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 240, 0xbdc5bdc5, 496, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 241, 0xbdc5bdc5, 497, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 242, 0xbdc5bdc5, 498, 0x0, 0x3 +v_exp_f16 v239, v239 +v_exp_f16 v240, v240 +v_exp_f16 v241, v241 +v_exp_f16 v242, v242 +_v_exp_f16__vop3 239, 239, 0x9 +_v_exp_f16__vop3 240, 240, 0x9 +_v_exp_f16__vop3 241, 241, 0x9 +_v_exp_f16__vop3 242, 242, 0x9 
+_v_pk_add_f16__vop3p_lit 239, 0x3c003c00, 495, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 240, 0x3c003c00, 496, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 241, 0x3c003c00, 497, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 242, 0x3c003c00, 498, 0x0, 0x3 +v_rcp_f16 v239, v239 +v_rcp_f16 v240, v240 +v_rcp_f16 v241, v241 +v_rcp_f16 v242, v242 +_v_rcp_f16__vop3 239, 239, 0x9 +_v_rcp_f16__vop3 240, 240, 0x9 +_v_rcp_f16__vop3 241, 241, 0x9 +_v_rcp_f16__vop3 242, 242, 0x9 +buffer_store_b16 v235, v245, s[72:75], 0 idxen +buffer_store_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v235, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v236, v245, s[72:75], 0 idxen +buffer_store_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v236, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v237, v245, s[72:75], 0 idxen +buffer_store_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v237, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v238, v245, s[72:75], 0 idxen +buffer_store_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v238, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 
s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x4ef4 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 227, 483, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 228, 484, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 229, 485, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 230, 486, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v227 +v_and_b32 v28, 0x7fff7fff, v228 +v_and_b32 v38, 0x7fff7fff, v229 +v_and_b32 v48, 0x7fff7fff, v230 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 28, 0x41c541c5, 284, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 294, 0x0, 0x3 
+_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 304, 0x0, 0x3 +v_exp_f16 v22, v22 +v_exp_f16 v28, v28 +v_exp_f16 v38, v38 +v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 227, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 228, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 229, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 230, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 227, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 228, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 229, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 43, 43, 
38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 230, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb +v_bfi_b32 v227, 0x7fff7fff, v23, v227 +v_bfi_b32 v228, 0x7fff7fff, v33, v228 +v_bfi_b32 v229, 0x7fff7fff, v43, v229 +v_bfi_b32 v230, 0x7fff7fff, v53, v230 +_v_pk_mul_f16__vop3p 227, 483, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 228, 484, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 229, 485, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 230, 486, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 231, 487, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 232, 488, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 233, 489, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 234, 490, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v231 +v_and_b32 v28, 0x7fff7fff, v232 +v_and_b32 v38, 0x7fff7fff, v233 +v_and_b32 v48, 0x7fff7fff, v234 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 28, 0x41c541c5, 284, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 294, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 304, 0x0, 0x3 +v_exp_f16 v22, v22 
+v_exp_f16 v28, v28 +v_exp_f16 v38, v38 +v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 231, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 232, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 233, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 234, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 231, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 232, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 233, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 43, 43, 38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 234, 0x38b838b8, 0x3, 0x1 
+_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb +v_bfi_b32 v231, 0x7fff7fff, v23, v231 +v_bfi_b32 v232, 0x7fff7fff, v33, v232 +v_bfi_b32 v233, 0x7fff7fff, v43, v233 +v_bfi_b32 v234, 0x7fff7fff, v53, v234 +_v_pk_mul_f16__vop3p 231, 487, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 232, 488, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 233, 489, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 234, 490, 32, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v227, v245, s[72:75], 0 idxen +buffer_store_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v227, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v228, v245, s[72:75], 0 idxen +buffer_store_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v228, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v229, v245, s[72:75], 0 idxen +buffer_store_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v229, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v230, v245, s[72:75], 0 idxen +buffer_store_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v230, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 
s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x5b98 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 235, 491, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 236, 492, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 237, 493, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 238, 494, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v235 +v_and_b32 v28, 0x7fff7fff, v236 +v_and_b32 v38, 0x7fff7fff, v237 +v_and_b32 v48, 0x7fff7fff, v238 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 28, 0x41c541c5, 284, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 294, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 
304, 0x0, 0x3 +v_exp_f16 v22, v22 +v_exp_f16 v28, v28 +v_exp_f16 v38, v38 +v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 235, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 236, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 237, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 238, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 235, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 236, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 237, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 43, 43, 38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 
106, 238, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb +v_bfi_b32 v235, 0x7fff7fff, v23, v235 +v_bfi_b32 v236, 0x7fff7fff, v33, v236 +v_bfi_b32 v237, 0x7fff7fff, v43, v237 +v_bfi_b32 v238, 0x7fff7fff, v53, v238 +_v_pk_mul_f16__vop3p 235, 491, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 236, 492, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 237, 493, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 238, 494, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 239, 495, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 240, 496, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 241, 497, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 242, 498, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v239 +v_and_b32 v28, 0x7fff7fff, v240 +v_and_b32 v38, 0x7fff7fff, v241 +v_and_b32 v48, 0x7fff7fff, v242 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 28, 0x41c541c5, 284, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 294, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 304, 0x0, 0x3 +v_exp_f16 v22, v22 +v_exp_f16 v28, v28 +v_exp_f16 v38, v38 
+v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 239, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 240, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 241, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 242, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 239, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 240, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 241, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 43, 43, 38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 242, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb 
+v_bfi_b32 v239, 0x7fff7fff, v23, v239 +v_bfi_b32 v240, 0x7fff7fff, v33, v240 +v_bfi_b32 v241, 0x7fff7fff, v43, v241 +v_bfi_b32 v242, 0x7fff7fff, v53, v242 +_v_pk_mul_f16__vop3p 239, 495, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 240, 496, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 241, 497, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 242, 498, 32, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v235, v245, s[72:75], 0 idxen +buffer_store_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v235, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v236, v245, s[72:75], 0 idxen +buffer_store_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v236, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v237, v245, s[72:75], 0 idxen +buffer_store_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v237, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v238, v245, s[72:75], 0 idxen +buffer_store_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v238, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 
+s_mov_b32 s84, 0x550c +s_setpc_b64 s[86:87] +s_endpgm +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end 
+s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end 
+s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end 
+s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end + diff --git a/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc new file mode 100644 index 0000000000..4934d4900f --- /dev/null +++ b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc @@ -0,0 +1,5160 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +.macro _sop1_lit op:req, sdst:req, lit:req + .long (0b101111101 << 23) | (\sdst << 16) | (\op << 8) | 255 + .long \lit +.endm + +.macro _s_mov_b32__sop1_lit sdst:req, lit:req + _sop1_lit 0, \sdst, \lit +.endm + +.macro _vop1 op:req, vdst:req, src:req + .long (0b0111111 << 25) | (\vdst << 17) | (\op << 9) | \src +.endm + +.macro _v_cvt_f16_i16__vop1 vdst:req, vsrc:req + _vop1 81, \vdst, (\vsrc + /*VGPR*/ 256) +.endm + +.macro _v_rcp_f16__vop1 vdst:req, vsrc:req + _vop1 84, \vdst, (\vsrc + /*VGPR*/ 256) +.endm + +.macro _v_exp_f16__vop1 vdst:req, vsrc:req + _vop1 88, \vdst, (\vsrc + /*VGPR*/ 256) +.endm + +.macro _vop3 op:req, vdst:req, src0:req, src1:req, src2:req, opsel:req, abs:req, neg:req + .long (0b110101 << 26) | (\op << 16) | (\opsel << 11) | (\abs << 8) | \vdst + .long (\neg << 29) | (\src2 << 18) | (\src1 << 9) | \src0 +.endm + +.macro _vop3_lit op:req, vdst:req, src0:req, src1:req, src2:req, lit:req, opsel:req, abs:req, neg:req + .long (0b110101 << 26) | (\op << 16) | (\opsel << 11) | (\abs << 8) | \vdst + .long (\neg << 29) | (\src2 << 18) | (\src1 << 9) | \src0 + .long \lit +.endm + +.macro _v_cvt_f16_i16__vop3 vdst:req, vsrc:req, opsel:req + _vop3 465, \vdst, (\vsrc + /*VGPR*/ 256), 0, 0, \opsel, 0, 0 +.endm + +.macro _v_rcp_f16__vop3 vdst:req, vsrc:req, opsel:req + _vop3 468, \vdst, (\vsrc + /*VGPR*/ 256), 0, 0, \opsel, 0, 0 +.endm + +.macro _v_exp_f16__vop3 vdst:req, vsrc:req, 
opsel:req + _vop3 472, \vdst, (\vsrc + /*VGPR*/ 256), 0, 0, \opsel, 0, 0 +.endm + +.macro _v_cndmask_b16__vop3 vdst:req, vsrc0:req, vsrc1:req, src2:req, opsel:req + _vop3 605, \vdst, (\vsrc0 + /*VGPR*/ 256), (\vsrc1 + /*VGPR*/ 256), \src2, \opsel, 0, 0 +.endm + +.macro _v_cmp_gt_f16__vop3_s_lit sdst:req, ssrc0:req, lit:req, opsel:req, abs:req + _vop3_lit 4, \sdst, \ssrc0, 255, 0, \lit, \opsel, \abs, 0 +.endm + +.macro _v_cmp_gt_f16__vop3_v_lit sdst:req, vsrc0:req, lit:req, opsel:req, abs:req + _vop3_lit 4, \sdst, (\vsrc0 + /*VGPR*/ 256), 255, 0, \lit, \opsel, \abs, 0 +.endm + +.macro _v_cmp_lt_u16__vop3 sdst:req, vsrc0:req, ssrc1:req, opsel:req + _vop3 57, \sdst, (\vsrc0 + /*VGPR*/ 256), \ssrc1, 0, \opsel, 0, 0 +.endm + +.macro _v_cmpx_lt_u32__vop3 sdst:req, vsrc0:req, ssrc1:req + _vop3 201, \sdst, (\vsrc0 + /*VGPR*/ 256), \ssrc1, 0, 0, 0, 0 +.endm + +.macro _vop3p op:req, vdst:req, src0:req, src1:req, src2:req, opsel:req, opsel_hi:req, opsel_hi2:req, neg:req, neg_hi:req + .long (0b11001100 << 24) | (\op << 16) | (\opsel_hi2 << 14) | (\opsel << 11) | (\neg_hi << 8) | \vdst + .long (\neg << 29) | (\opsel_hi << 27) | (\src2 << 18) | (\src1 << 9) | \src0 +.endm + +.macro _vop3p_lit op:req, vdst:req, src0:req, src1:req, src2:req, lit:req, opsel:req, opsel_hi:req, opsel_hi2:req, neg:req, neg_hi:req + .long (0b11001100 << 24) | (\op << 16) | (\opsel_hi2 << 14) | (\opsel << 11) | (\neg_hi << 8) | \vdst + .long (\neg << 29) | (\opsel_hi << 27) | (\src2 << 18) | (\src1 << 9) | \src0 + .long \lit +.endm + +.macro _v_pk_ashrrev_i16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 6, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_add_u16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 10, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_sub_u16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req 
+ _vop3p 11, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_min_u16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 13, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_add_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 15, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_add_f16__vop3p_lit vdst:req, lit:req, src1:req, opsel:req, opsel_hi:req + _vop3p_lit 15, \vdst, 255, \src1, 0, \lit, \opsel, \opsel_hi, 0, 0, 0 +.endm + +.macro _v_pk_mul_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 16, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_mul_f16__vop3p_lit vdst:req, lit:req, src1:req, opsel:req, opsel_hi:req + _vop3p_lit 16, \vdst, 255, \src1, 0, \lit, \opsel, \opsel_hi, 0, 0, 0 +.endm + +.macro _v_pk_min_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 17, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +.macro _v_pk_max_f16__vop3p vdst:req, src0:req, src1:req, opsel:req, opsel_hi:req, neg:req, neg_hi:req + _vop3p 18, \vdst, \src0, \src1, 0, \opsel, \opsel_hi, 0, \neg, \neg_hi +.endm + +s_version 0x2006 +s_set_inst_prefetch_distance 0x3 +s_mov_b32 s0, 0 +v_lshlrev_b32 v1, 7, v0 +s_getpc_b64 s[8:9] +s_mov_b32 s10, 0x70cc +s_mov_b32 s11, 0x31014000 +buffer_load_b32 v2, v1, s[8:11], 0 offen +s_waitcnt vmcnt(0) +s_getpc_b64 s[6:7] +s_load_b512 s[8:23], s[2:3], null +s_load_b512 s[24:39], s[2:3], 0x40 +s_load_b512 s[40:55], s[2:3], 0x80 +s_load_b256 s[56:63], s[2:3], 0xc0 +s_load_b64 s[64:65], s[2:3], 0xe0 +v_and_b32 v8, 0xff, v0 +v_lshrrev_b32 v9, 1, v8 +v_and_b32 v10, 1, v0 +v_add_nc_u32 v5, v9, 32 +v_bfi_b32 v6, 31, v8, v9 +v_bfe_u32 v4, v8, 5, 1 +v_bfi_b32 v6, 0xbf, v6, v5 +v_and_b32 v2, 31, v8 +v_lshrrev_b32 v6, 5, v6 
+v_lshrrev_b32 v7, 6, v8 +v_lshlrev_b32 v2, 4, v2 +v_and_b32 v3, 31, v9 +v_mad_u32_u24 v2, v4, 0x900, v2 +v_lshlrev_b32 v3, 4, v3 +v_xor_b32 v5, 3, v6 +v_mad_u32_u16 v3, 0x480, v7, v3 op_sel:[0,0,0,0] +v_mad_u32_u24 v1, v5, 0x240, v2 +v_mad_u32_u16 v3, 0x1240, v10, v3 op_sel:[0,0,0,0] +v_mad_u32_u24 v2, v6, 0x240, v2 +s_waitcnt expcnt(0) lgkmcnt(0) vmcnt(0) +s_bitcmp1_b32 s14, 6 +s_cbranch_scc0 14 +s_load_b64 s[16:17], s[16:17], null +s_load_b64 s[20:21], s[20:21], null +s_load_b64 s[18:19], s[18:19], null +s_cmp_eq_u64 0, s[60:61] +s_cbranch_scc1 2 +s_load_b64 s[60:61], s[60:61], null +s_cmp_eq_u64 0, s[30:31] +s_cbranch_scc1 2 +s_load_b64 s[30:31], s[30:31], null +s_bitcmp1_b32 s14, 3 +s_cbranch_scc0 2 +s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xf0 +s_cmp_eq_u32 s13, 0x60 +s_cbranch_scc0 16 +s_mul_i32 s1, s4, 0xab +s_lshr_b32 s1, s1, 10 +s_mul_i32 s23, s1, 6 +s_sub_u32 s23, s4, s23 +s_bfe_u32 s15, s1, 0x20000 +s_bfe_u32 s22, s1, 0x10002 +s_bfe_u32 s5, s1, 0x10003 +s_mov_b32 s45, s23 +s_lshl1_add_u32 s45, s45, s22 +s_lshl2_add_u32 s45, s45, s15 +s_lshl1_add_u32 s45, s45, s5 +s_mov_b32 s4, s45 +s_waitcnt expcnt(0) lgkmcnt(0) vmcnt(0) +s_bitcmp1_b32 s14, 13 +s_cbranch_scc0 10 +s_add_u32 s16, s16, s34 +s_addc_u32 s17, s17, s35 +s_add_u32 s20, s20, s38 +s_addc_u32 s21, s21, s39 +s_add_u32 s18, s18, s36 +s_addc_u32 s19, s19, s37 +s_cmp_eq_u64 0, s[30:31] +s_cselect_b64 s[40:41], 0, s[40:41] +s_add_u32 s30, s30, s40 +s_addc_u32 s31, s31, s41 +s_add_u32 s15, s12, 15 +s_lshr_b32 s15, s15, 4 +v_cvt_f32_u32 v4, s15 +v_rcp_f32 v4, v4 +v_mul_f32 v4, 0x47800000, v4 +v_cvt_floor_i32_f32 v4, v4 +v_mad_u32_u24 v5, v4, s13, s13 +v_lshrrev_b32 v5, 16, v5 +v_cvt_f32_u32 v4, v5 +v_rcp_f32 v4, v4 +v_mul_f32 v4, 0x47800000, v4 +v_cvt_floor_i32_f32 v4, v4 +v_mad_u32_u24 v6, v4, s4, s4 +v_lshrrev_b32 v6, 16, v6 +v_readfirstlane_b32 s1, v5 +v_readfirstlane_b32 s22, v6 +s_mul_i32 s5, s22, s1 +s_sub_u32 s5, s4, s5 +s_cmp_ge_u32 s22, s15 +s_cbranch_scc1 7089 +s_mul_i32 s13, s1, s15 
+s_mul_i32 s23, s22, 16 +s_sub_u32 s12, s12, s23 +s_min_u32 s12, s12, 16 +s_mul_i32 s34, s23, s46 +s_mul_hi_u32 s35, s23, s46 +s_lshl_b64 s[34:35], s[34:35], 1 +s_add_u32 s18, s34, s18 +s_addc_u32 s19, s35, s19 +s_lshr_b32 s35, s23, 0 +s_mul_i32 s34, s35, s51 +s_mul_hi_u32 s35, s35, s51 +s_lshl_b64 s[34:35], s[34:35], 1 +s_add_u32 s20, s34, s20 +s_addc_u32 s21, s35, s21 +s_lshl_b32 s34, s23, 1 +s_cmp_eq_u64 s[30:31], 0 +s_cselect_b32 s34, 0, s34 +s_add_u32 s30, s30, s34 +s_addc_u32 s31, s31, 0 +v_cmp_lt_u32 vcc, v0, 0x100 +s_cbranch_vccz 3677 +v_and_b32 v20, 0xff, v0 +v_lshrrev_b32 v21, 1, v20 +v_bfe_u32 v17, v20, 3, 1 +v_bfe_u32 v16, v20, 2, 1 +v_mad_u32_u16 v17, v17, 16, 0 op_sel:[0,0,0,0] +v_mad_u32_u16 v14, v16, 0x1240, v17 op_sel:[0,0,0,0] +v_bfe_u32 v16, v20, 0, 2 +v_mad_u32_u16 v14, v16, 0x90, v14 op_sel:[0,0,0,0] +v_bfe_u32 v17, v20, 4, 2 +v_mad_u32_u16 v14, v17, 32, v14 op_sel:[0,0,0,0] +v_bfe_u32 v16, v20, 6, 1 +v_mad_u32_u16 v14, v16, 0x480, v14 op_sel:[0,0,0,0] +v_bfe_u32 v16, v20, 7, 1 +v_mad_u32_u16 v14, v16, 0x900, v14 op_sel:[0,0,0,0] +v_bfe_u32 v18, v20, 1, 2 +v_mad_u32_u16 v13, v18, 32, 0 op_sel:[0,0,0,0] +v_bfe_u32 v19, v20, 3, 1 +v_mad_u32_u16 v13, v19, 0x480, v13 op_sel:[0,0,0,0] +v_add_nc_u32 v18, v21, 32 +v_bfi_b32 v18, 0xbf, v20, v18 +v_bfe_u32 v18, v18, 6, 2 +v_mad_u32_u16 v13, v18, 0x90, v13 op_sel:[0,0,0,0] +v_xor_b32 v16, v0, v0 quad_perm:[2,3,2,1] +v_xor_b32 v17, v0, v0 quad_perm:[0,0,3,3] +v_sub_nc_u16 v16, v16, v17 op_sel:[0,0,0] +v_cvt_f16_i16 v15, v16 +_v_cvt_f16_i16__vop1 (15 | /*op_sel*/ 0x80), 17 +_v_pk_mul_f16__vop3p 15, 271, 240, 0x0, 0x1, 0x0, 0x0 +v_bfe_u32 v16, v0, 6, 1 +v_and_b32 v5, 63, v0 +v_cmp_eq_u32 vcc, v16, 1 +v_cndmask_b32 v16, 0, 0x400, vcc +v_cndmask_b32 v17, 0, 0x100, vcc +v_lshl_add_u32 v6, v5, 2, 0 +v_lshl_add_u32 v5, v5, 4, v16 +s_mov_b32 s23, 2 +s_mov_b32 s34, 0 +s_mov_b32 s40, 0xbc00c000 +v_readfirstlane_b32 s82, v0 +s_and_b32 null, 64, s82 +s_cmov_b32 s40, 0x3c00c000 +s_lshl_b32 s49, s43, 1 +s_lshl_b32 s53, 
s47, 1 +s_lshl_b32 s83, s49, 4 +s_lshl_b32 s84, s53, 4 +s_and_b32 null, 0x80, s82 +s_cselect_b32 s83, s83, 0 +s_cselect_b32 s84, s84, 0 +s_cselect_b32 s22, 16, 0 +s_sub_u32 s22, s9, s22 +s_cmov_b32 s22, 0 +s_mov_b32 s35, 0x11014000 +s_bitcmp1_b32 s14, 4 +s_cselect_b32 s85, 0, 0x8000000 +s_and_b32 s35, 0xf7ffffff, s35 +s_or_b32 s35, s35, s85 +s_and_b32 s17, s17, 0xffff +s_add_u32 s17, s17, 0x20000 +s_and_b32 s19, s19, 0xffff +s_add_u32 s19, s19, 0x20000 +s_add_u32 s16, s16, s83 +s_addc_u32 s17, s17, 0 +s_add_u32 s18, s18, s84 +s_addc_u32 s19, s19, 0 +s_mov_b64 s[36:37], s[16:17] +s_mov_b32 s38, 0x80000000 +s_mov_b32 s39, 0 +s_getpc_b64 s[64:65] +v_cmp_lt_u32 vcc, v0, 0x80 +s_cmp_gt_u32 vcc_lo, 0 +s_mov_b32 s82, 0x2ed0 +s_mov_b32 s86, 0x1e40 +s_cmov_b32 s82, 0x2608 +s_cmov_b32 s86, 0x1678 +s_mov_b32 s83, 0x30b4 +s_mov_b32 s87, 0x2024 +s_cmov_b32 s83, 0x27ec +s_cmov_b32 s87, 0x185c +s_mov_b32 s84, 0x3364 +s_mov_b32 s88, 0x2254 +s_cmov_b32 s84, 0x2a9c +s_cmov_b32 s88, 0x1a8c +s_mov_b32 s85, 0x3548 +s_mov_b32 s89, 0x2438 +s_cmov_b32 s85, 0x2c80 +s_cmov_b32 s89, 0x1c70 +s_add_u32 s66, s64, s82 +s_addc_u32 s67, s65, 0 +s_add_u32 s74, s64, s86 +s_addc_u32 s75, s65, 0 +s_add_u32 s68, s64, s83 +s_addc_u32 s69, s65, 0 +s_add_u32 s76, s64, s87 +s_addc_u32 s77, s65, 0 +s_add_u32 s70, s64, s84 +s_addc_u32 s71, s65, 0 +s_add_u32 s78, s64, s88 +s_addc_u32 s79, s65, 0 +s_add_u32 s72, s64, s85 +s_addc_u32 s73, s65, 0 +s_add_u32 s80, s64, s89 +s_addc_u32 s81, s65, 0 +s_mov_b32 s45, 0 +v_mov_b32 v4, 0 +s_mov_b32 s56, 0x18c +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[64:65], s[66:67], s[74:75] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0x2b4 +s_setprio 2 +v_pk_fma_f16 v224, v34, s40, v228 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v225, v102, s40, v229 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v226, v170, s40, v230 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v227, v206, s40, v231 op_sel:[0,0,0] op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 228, 290, 291, 0x0, 0x3, 0x1, 0x1 
+_v_pk_add_f16__vop3p 229, 358, 359, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 230, 426, 427, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 231, 462, 463, 0x0, 0x3, 0x1, 0x1 +buffer_load_d16_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v170, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v171, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v170, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v171, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v102, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v103, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v206, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v207, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v102, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v103, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v206, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v207, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 6818 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[64:65], s[66:67], s[74:75] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0x12c +s_setprio 2 +_v_pk_mul_f16__vop3p 224, 290, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 225, 358, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 226, 426, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 227, 462, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v34, v224 quad_perm:[1,0,3,2] +v_mov_b32 v102, v225 quad_perm:[1,0,3,2] +v_mov_b32 v170, v226 quad_perm:[1,0,3,2] +v_mov_b32 v206, v227 quad_perm:[1,0,3,2] +v_pk_fma_f16 v224, v34, 
v15, v224 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v225, v102, v15, v225 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v226, v170, v15, v226 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v227, v206, v15, v227 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v34, v224 quad_perm:[2,3,0,1] +v_mov_b32 v102, v225 quad_perm:[2,3,0,1] +v_mov_b32 v170, v226 quad_perm:[2,3,0,1] +v_mov_b32 v206, v227 quad_perm:[2,3,0,1] +v_pk_fma_f16 v224, v34, v15, v224 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v225, v102, v15, v225 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v226, v170, v15, v226 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v227, v206, v15, v227 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v170, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v171, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v34, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v170, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v171, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v102, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v103, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v206, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v207, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v102, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v103, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v206, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v207, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 6720 +s_mov_b32 s56, 0x18c +s_bitcmp1_b32 s45, 1 
+s_cselect_b64 s[64:65], s[68:69], s[76:77] +s_bitcmp1_b32 s45, 0 +s_cselect_b32 s56, s56, 0x2b8 +s_setprio 2 +v_pk_fma_f16 v232, v208, s40, v236 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v233, v212, s40, v237 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v234, v216, s40, v238 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v235, v220, s40, v239 op_sel:[0,0,0] op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 236, 464, 465, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 237, 468, 469, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 238, 472, 473, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 239, 476, 477, 0x0, 0x3, 0x1, 0x1 +buffer_load_d16_b16 v208, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v209, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v216, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v217, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v208, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v209, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v216, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v217, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v212, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v213, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v220, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v221, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v212, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v213, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v220, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v221, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 6646 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[64:65], s[68:69], s[76:77] +s_bitcmp1_b32 s45, 0 +s_cselect_b32 
s56, s56, 0x130 +s_setprio 2 +_v_pk_mul_f16__vop3p 232, 464, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 233, 468, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 234, 472, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 235, 476, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v208, v232 quad_perm:[1,0,3,2] +v_mov_b32 v212, v233 quad_perm:[1,0,3,2] +v_mov_b32 v216, v234 quad_perm:[1,0,3,2] +v_mov_b32 v220, v235 quad_perm:[1,0,3,2] +v_pk_fma_f16 v232, v208, v15, v232 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v233, v212, v15, v233 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v234, v216, v15, v234 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v235, v220, v15, v235 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v208, v232 quad_perm:[2,3,0,1] +v_mov_b32 v212, v233 quad_perm:[2,3,0,1] +v_mov_b32 v216, v234 quad_perm:[2,3,0,1] +v_mov_b32 v220, v235 quad_perm:[2,3,0,1] +v_pk_fma_f16 v232, v208, v15, v232 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v233, v212, v15, v233 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v234, v216, v15, v234 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v235, v220, v15, v235 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v208, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v209, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v216, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v217, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v208, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v209, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v216, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v217, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v212, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v213, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v220, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v221, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 
+s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v212, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v213, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v220, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v221, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 6548 +s_mov_b32 s56, 0x190 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[64:65], s[70:71], s[78:79] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0x2bc +s_setprio 2 +s_waitcnt vmcnt(16) +_v_pk_add_f16__vop3p 224, 272, 273, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 225, 308, 357, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 226, 291, 290, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 227, 359, 358, 0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v228, v16, s40, v169 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v229, v52, s40, v205 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v230, v35, s40, v170 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v231, v103, s40, v206 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v101, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v102, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v103, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v101, 
v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v102, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v103, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 6473 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[64:65], s[70:71], s[78:79] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0x130 +s_setprio 2 +s_waitcnt vmcnt(16) +_v_pk_mul_f16__vop3p 224, 273, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 225, 357, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 226, 290, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 227, 358, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v17, v224 quad_perm:[1,0,3,2] +v_mov_b32 v101, v225 quad_perm:[1,0,3,2] +v_mov_b32 v34, v226 quad_perm:[1,0,3,2] +v_mov_b32 v102, v227 quad_perm:[1,0,3,2] +v_pk_fma_f16 v224, v17, v15, v224 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v225, v101, v15, v225 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v226, v34, v15, v226 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v227, v102, v15, v227 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v17, v224 quad_perm:[2,3,0,1] +v_mov_b32 v101, v225 quad_perm:[2,3,0,1] +v_mov_b32 v34, v226 quad_perm:[2,3,0,1] +v_mov_b32 v102, v227 quad_perm:[2,3,0,1] +v_pk_fma_f16 v224, v17, v15, v224 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v225, v101, v15, v225 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v226, v34, v15, v226 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v227, v102, v15, v227 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v16, v9, s[36:39], 0 
idxen +buffer_load_d16_hi_b16 v34, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v35, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v101, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v102, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v103, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v101, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v102, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v103, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 6374 +s_mov_b32 s56, 0x190 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[64:65], s[72:73], s[80:81] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0x2b8 +s_setprio 2 +s_waitcnt vmcnt(16) +_v_pk_add_f16__vop3p 232, 467, 466, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 233, 471, 470, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 234, 465, 464, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 235, 469, 468, 0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v236, v211, s40, v218 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v237, v215, s40, v222 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v238, v209, s40, v216 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v239, v213, s40, v220 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v210, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v211, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v208, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v209, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v210, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v211, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v208, v8, s[36:39], 0 idxen 
+buffer_load_d16_hi_b16 v209, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v214, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v215, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v212, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v213, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v214, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v215, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v212, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v213, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 6299 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[64:65], s[72:73], s[80:81] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0x12c +s_setprio 2 +s_waitcnt vmcnt(16) +_v_pk_mul_f16__vop3p 232, 466, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 233, 470, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 234, 464, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 235, 468, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v210, v232 quad_perm:[1,0,3,2] +v_mov_b32 v214, v233 quad_perm:[1,0,3,2] +v_mov_b32 v208, v234 quad_perm:[1,0,3,2] +v_mov_b32 v212, v235 quad_perm:[1,0,3,2] +v_pk_fma_f16 v232, v210, v15, v232 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v233, v214, v15, v233 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v234, v208, v15, v234 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v235, v212, v15, v235 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v210, v232 quad_perm:[2,3,0,1] +v_mov_b32 v214, v233 quad_perm:[2,3,0,1] +v_mov_b32 v208, v234 quad_perm:[2,3,0,1] +v_mov_b32 v212, v235 quad_perm:[2,3,0,1] +v_pk_fma_f16 v232, v210, v15, v232 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v233, v214, v15, v233 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v234, v208, v15, v234 
op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v235, v212, v15, v235 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v210, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v211, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v208, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v209, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v210, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v211, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v208, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v209, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v214, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v215, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v212, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v213, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v214, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v215, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v212, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v213, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 6200 +s_mov_b32 s56, 0x18c +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[64:65], s[66:67], s[74:75] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0x2b4 +s_setprio 2 +v_pk_fma_f16 v224, v169, s40, v228 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v225, v205, s40, v229 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v226, v170, s40, v230 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v227, v206, s40, v231 op_sel:[0,0,0] op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 228, 425, 392, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 229, 461, 460, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 230, 426, 427, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 231, 462, 463, 0x0, 
0x3, 0x1, 0x1 +buffer_load_d16_b16 v169, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v136, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v170, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v171, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v169, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v136, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v170, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v171, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v205, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v204, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v206, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v207, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v205, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v204, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v206, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v207, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 6126 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[64:65], s[66:67], s[74:75] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0x12c +s_setprio 2 +_v_pk_mul_f16__vop3p 224, 425, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 225, 461, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 226, 426, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 227, 462, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v169, v224 quad_perm:[1,0,3,2] +v_mov_b32 v205, v225 quad_perm:[1,0,3,2] +v_mov_b32 v170, v226 quad_perm:[1,0,3,2] +v_mov_b32 v206, v227 quad_perm:[1,0,3,2] +v_pk_fma_f16 v224, v169, v15, v224 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v225, v205, v15, v225 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v226, v170, v15, v226 
op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v227, v206, v15, v227 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v169, v224 quad_perm:[2,3,0,1] +v_mov_b32 v205, v225 quad_perm:[2,3,0,1] +v_mov_b32 v170, v226 quad_perm:[2,3,0,1] +v_mov_b32 v206, v227 quad_perm:[2,3,0,1] +v_pk_fma_f16 v224, v169, v15, v224 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v225, v205, v15, v225 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v226, v170, v15, v226 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v227, v206, v15, v227 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v169, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v136, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v170, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v171, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v169, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v136, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v170, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v171, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v205, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v204, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v206, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v207, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v205, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v204, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v206, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v207, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 6028 +s_mov_b32 s56, 0x18c +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[64:65], s[68:69], s[76:77] +s_bitcmp1_b32 s45, 0 +s_cselect_b32 s56, s56, 0x2b8 +s_setprio 2 +v_pk_fma_f16 v232, v218, s40, v236 
op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v233, v222, s40, v237 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v234, v216, s40, v238 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v235, v220, s40, v239 op_sel:[0,0,0] op_sel_hi:[1,0,1] +_v_pk_add_f16__vop3p 236, 474, 475, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 237, 478, 479, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 238, 472, 473, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 239, 476, 477, 0x0, 0x3, 0x1, 0x1 +buffer_load_d16_b16 v218, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v219, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v216, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v217, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v218, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v219, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v216, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v217, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v222, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v223, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v220, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v221, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v222, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v223, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v220, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v221, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5954 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[64:65], s[68:69], s[76:77] +s_bitcmp1_b32 s45, 0 +s_cselect_b32 s56, s56, 0x130 +s_setprio 2 +_v_pk_mul_f16__vop3p 232, 474, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 233, 478, 271, 0x0, 0x1, 0x0, 0x0 
+_v_pk_mul_f16__vop3p 234, 472, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 235, 476, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v218, v232 quad_perm:[1,0,3,2] +v_mov_b32 v222, v233 quad_perm:[1,0,3,2] +v_mov_b32 v216, v234 quad_perm:[1,0,3,2] +v_mov_b32 v220, v235 quad_perm:[1,0,3,2] +v_pk_fma_f16 v232, v218, v15, v232 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v233, v222, v15, v233 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v234, v216, v15, v234 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v235, v220, v15, v235 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v218, v232 quad_perm:[2,3,0,1] +v_mov_b32 v222, v233 quad_perm:[2,3,0,1] +v_mov_b32 v216, v234 quad_perm:[2,3,0,1] +v_mov_b32 v220, v235 quad_perm:[2,3,0,1] +v_pk_fma_f16 v232, v218, v15, v232 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v233, v222, v15, v233 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v234, v216, v15, v234 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v235, v220, v15, v235 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v218, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v219, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v216, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v217, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v218, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v219, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v216, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v217, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v222, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v223, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v220, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v221, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v222, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v223, v9, s[36:39], 0 
idxen +buffer_load_d16_hi_b16 v220, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v221, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5856 +s_mov_b32 s56, 0x190 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[64:65], s[70:71], s[78:79] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0x2bc +s_setprio 2 +s_waitcnt vmcnt(16) +_v_pk_add_f16__vop3p 224, 272, 273, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 225, 308, 357, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 226, 392, 425, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 227, 460, 461, 0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v228, v16, s40, v34 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v229, v52, s40, v102 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v230, v136, s40, v170 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v231, v204, s40, v206 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v169, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v136, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v169, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v136, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v101, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v205, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v204, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v101, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v205, v8, s[36:39], 0 idxen 
+buffer_load_d16_hi_b16 v204, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5781 +_s_mov_b32__sop1_lit 56, 0x4 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[64:65], s[70:71], s[78:79] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0x130 +s_setprio 2 +s_waitcnt vmcnt(16) +_v_pk_mul_f16__vop3p 224, 273, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 225, 357, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 226, 425, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 227, 461, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v17, v224 quad_perm:[1,0,3,2] +v_mov_b32 v101, v225 quad_perm:[1,0,3,2] +v_mov_b32 v169, v226 quad_perm:[1,0,3,2] +v_mov_b32 v205, v227 quad_perm:[1,0,3,2] +v_pk_fma_f16 v224, v17, v15, v224 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v225, v101, v15, v225 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v226, v169, v15, v226 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v227, v205, v15, v227 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v17, v224 quad_perm:[2,3,0,1] +v_mov_b32 v101, v225 quad_perm:[2,3,0,1] +v_mov_b32 v169, v226 quad_perm:[2,3,0,1] +v_mov_b32 v205, v227 quad_perm:[2,3,0,1] +v_pk_fma_f16 v224, v17, v15, v224 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v225, v101, v15, v225 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v226, v169, v15, v226 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v227, v205, v15, v227 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v169, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v136, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v17, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v16, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v169, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v136, v10, s[36:39], 0 idxen 
+s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v101, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v205, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v204, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v101, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v52, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v205, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v204, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5682 +s_mov_b32 s56, 0xffffebf0 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[64:65], s[72:73], s[80:81] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0xffffed18 +s_setprio 2 +s_waitcnt vmcnt(16) +_v_pk_add_f16__vop3p 232, 467, 466, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 233, 471, 470, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 234, 475, 474, 0x0, 0x3, 0x1, 0x1 +_v_pk_add_f16__vop3p 235, 479, 478, 0x0, 0x3, 0x1, 0x1 +v_pk_fma_f16 v236, v211, s40, v208 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v237, v215, s40, v212 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v238, v219, s40, v216 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v239, v223, s40, v220 op_sel:[0,1,0] op_sel_hi:[1,1,1] +buffer_load_d16_b16 v210, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v211, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v218, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v219, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v210, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v211, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v218, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v219, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 
s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v214, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v215, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v222, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v223, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v214, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v215, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v222, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v223, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5607 +s_mov_b32 s56, 0xffffea64 +s_bitcmp1_b32 s45, 2 +s_cselect_b64 s[64:65], s[72:73], s[80:81] +s_bitcmp1_b32 s45, 1 +s_cselect_b32 s56, s56, 0xffffeb8c +s_setprio 2 +s_waitcnt vmcnt(16) +_v_pk_mul_f16__vop3p 232, 466, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 233, 470, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 234, 474, 271, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 235, 478, 271, 0x0, 0x1, 0x0, 0x0 +v_mov_b32 v210, v232 quad_perm:[1,0,3,2] +v_mov_b32 v214, v233 quad_perm:[1,0,3,2] +v_mov_b32 v218, v234 quad_perm:[1,0,3,2] +v_mov_b32 v222, v235 quad_perm:[1,0,3,2] +v_pk_fma_f16 v232, v210, v15, v232 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v233, v214, v15, v233 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v234, v218, v15, v234 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v235, v222, v15, v235 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v210, v232 quad_perm:[2,3,0,1] +v_mov_b32 v214, v233 quad_perm:[2,3,0,1] +v_mov_b32 v218, v234 quad_perm:[2,3,0,1] +v_mov_b32 v222, v235 quad_perm:[2,3,0,1] +v_pk_fma_f16 v232, v210, v15, v232 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v233, v214, v15, v233 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v234, v218, v15, v234 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v235, v222, v15, v235 op_sel:[0,1,0] op_sel_hi:[1,1,1] 
+buffer_load_d16_b16 v210, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v211, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v218, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v219, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v210, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v211, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v218, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v219, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_b16 v214, v7, s[36:39], 0 idxen +buffer_load_d16_b16 v215, v9, s[36:39], 0 idxen +buffer_load_d16_b16 v222, v8, s[36:39], 0 idxen +buffer_load_d16_b16 v223, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +buffer_load_d16_hi_b16 v214, v7, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v215, v9, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v222, v8, s[36:39], 0 idxen +buffer_load_d16_hi_b16 v223, v10, s[36:39], 0 idxen +s_sub_u32 s15, s15, 1 +s_cselect_b32 s39, 0, s39 +s_add_u32 s36, s36, s59 +s_addc_u32 s37, s37, 0 +s_swappc_b64 s[64:65], s[64:65] +s_branch 5508 +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v69, v36 +v_mov_b32 v70, v37 +v_mov_b32 v71, v38 +v_mov_b32 v72, v39 +v_mov_b32 v73, v40 +v_mov_b32 v74, v41 +v_mov_b32 v75, v42 +v_mov_b32 v76, v43 +_v_pk_add_f16__vop3p 104, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 105, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 106, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 107, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 108, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 109, 
297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 110, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 111, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 104, 360, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 105, 361, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 106, 362, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 107, 363, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 108, 364, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 109, 365, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 110, 366, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 111, 367, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v104, v44, 0.5, v104 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v105, v45, 0.5, v105 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v106, v46, 0.5, v106 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v107, v47, 0.5, v107 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v108, v48, 0.5, v108 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v109, v49, 0.5, v109 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v110, v50, 0.5, v110 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v111, v51, 0.5, v111 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v137, v44, -1.0, v104 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v138, v45, -1.0, v105 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v139, v46, -1.0, v106 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v140, v47, -1.0, v107 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v141, v48, -1.0, v108 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v142, v49, -1.0, v109 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v143, v50, -1.0, v110 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v144, v51, -1.0, v111 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v172, v61 +v_mov_b32 v173, v62 +v_mov_b32 v174, v63 +v_mov_b32 v175, v64 +v_mov_b32 v176, v65 +v_mov_b32 v177, v66 +v_mov_b32 v178, v67 +v_mov_b32 v179, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v12, v14, v3, s[54:55] +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 
v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[224:227] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[228:231] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 vcc, -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +v_readfirstlane_b32 s41, v4 +v_mov_b32 v85, v36 +v_mov_b32 v86, v37 +v_mov_b32 v87, v38 +v_mov_b32 v88, v39 +v_mov_b32 v89, v40 +v_mov_b32 v90, v41 +v_mov_b32 v91, v42 +v_mov_b32 v92, v43 +_v_pk_add_f16__vop3p 120, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 121, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 122, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 123, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 124, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 125, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 126, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 127, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 120, 376, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 121, 377, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 122, 378, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 123, 379, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 124, 380, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 125, 381, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 126, 382, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 127, 383, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v120, v44, 0.5, v120 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v121, v45, 0.5, v121 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v122, v46, 0.5, v122 op_sel:[0,0,0] 
op_sel_hi:[1,0,1] +v_pk_fma_f16 v123, v47, 0.5, v123 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v124, v48, 0.5, v124 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v125, v49, 0.5, v125 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v126, v50, 0.5, v126 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v127, v51, 0.5, v127 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v153, v44, -1.0, v120 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v154, v45, -1.0, v121 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v155, v46, -1.0, v122 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v156, v47, -1.0, v123 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v157, v48, -1.0, v124 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v158, v49, -1.0, v125 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v159, v50, -1.0, v126 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v160, v51, -1.0, v127 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v188, v61 +v_mov_b32 v189, v62 +v_mov_b32 v190, v63 +v_mov_b32 v191, v64 +v_mov_b32 v192, v65 +v_mov_b32 v193, v66 +v_mov_b32 v194, v67 +v_mov_b32 v195, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v1, vcc +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 +s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[232:235] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 
v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[236:239] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v77, v36 +v_mov_b32 v78, v37 +v_mov_b32 v79, v38 +v_mov_b32 v80, v39 +v_mov_b32 v81, v40 +v_mov_b32 v82, v41 +v_mov_b32 v83, v42 +v_mov_b32 v84, v43 +_v_pk_add_f16__vop3p 112, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 113, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 114, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 115, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 116, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 117, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 118, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 119, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 112, 368, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 113, 369, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 114, 370, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 115, 371, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 116, 372, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 117, 373, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 118, 374, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 119, 375, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v112, v44, 0.5, v112 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v113, v45, 0.5, v113 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v114, v46, 0.5, v114 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v115, v47, 0.5, v115 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v116, v48, 0.5, v116 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v117, v49, 0.5, v117 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v118, v50, 0.5, v118 op_sel:[0,0,0] 
op_sel_hi:[1,0,1] +v_pk_fma_f16 v119, v51, 0.5, v119 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v145, v44, -1.0, v112 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v146, v45, -1.0, v113 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v147, v46, -1.0, v114 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v148, v47, -1.0, v115 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v149, v48, -1.0, v116 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v150, v49, -1.0, v117 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v151, v50, -1.0, v118 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v152, v51, -1.0, v119 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v180, v61 +v_mov_b32 v181, v62 +v_mov_b32 v182, v63 +v_mov_b32 v183, v64 +v_mov_b32 v184, v65 +v_mov_b32 v185, v66 +v_mov_b32 v186, v67 +v_mov_b32 v187, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v12, v14, v3, s[54:55] +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[224:227] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[228:231] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 vcc, -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v93, v36 +v_mov_b32 v94, v37 +v_mov_b32 v95, v38 +v_mov_b32 v96, v39 +v_mov_b32 v97, v40 +v_mov_b32 v98, v41 +v_mov_b32 v99, v42 +v_mov_b32 v100, v43 +_v_pk_add_f16__vop3p 128, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 129, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 130, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 131, 295, 320, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 132, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 133, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 134, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 135, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 128, 384, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 129, 385, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 130, 386, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 131, 387, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 132, 388, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 133, 389, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 134, 390, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 135, 391, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v128, v44, 0.5, v128 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v129, v45, 0.5, v129 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v130, v46, 0.5, v130 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v131, v47, 0.5, v131 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v132, v48, 0.5, v132 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v133, v49, 0.5, v133 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v134, v50, 0.5, v134 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v135, v51, 0.5, v135 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v161, v44, -1.0, v128 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v162, v45, -1.0, v129 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v163, v46, -1.0, v130 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v164, v47, -1.0, v131 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v165, v48, -1.0, v132 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v166, v49, -1.0, v133 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v167, v50, -1.0, v134 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v168, v51, -1.0, v135 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v196, v61 +v_mov_b32 v197, v62 +v_mov_b32 v198, v63 +v_mov_b32 v199, v64 +v_mov_b32 v200, v65 +v_mov_b32 v201, v66 +v_mov_b32 v202, v67 +v_mov_b32 v203, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v2, vcc 
+s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[232:235] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[236:239] offset:28416 +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +v_mov_b32 v69, v36 +v_mov_b32 v70, v37 +v_mov_b32 v71, v38 +v_mov_b32 v72, v39 +v_mov_b32 v73, v40 +v_mov_b32 v74, v41 +v_mov_b32 v75, v42 +v_mov_b32 v76, v43 +_v_pk_add_f16__vop3p 104, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 105, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 106, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 107, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 108, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 109, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 110, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 111, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 104, 360, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 105, 361, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 106, 362, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 107, 363, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 108, 364, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 109, 365, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 110, 366, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 111, 367, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v104, v44, 0.5, v104 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v105, v45, 0.5, v105 op_sel:[0,0,0] op_sel_hi:[1,0,1] 
+v_pk_fma_f16 v106, v46, 0.5, v106 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v107, v47, 0.5, v107 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v108, v48, 0.5, v108 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v109, v49, 0.5, v109 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v110, v50, 0.5, v110 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v111, v51, 0.5, v111 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v137, v44, -1.0, v104 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v138, v45, -1.0, v105 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v139, v46, -1.0, v106 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v140, v47, -1.0, v107 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v141, v48, -1.0, v108 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v142, v49, -1.0, v109 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v143, v50, -1.0, v110 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v144, v51, -1.0, v111 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v172, v61 +v_mov_b32 v173, v62 +v_mov_b32 v174, v63 +v_mov_b32 v175, v64 +v_mov_b32 v176, v65 +v_mov_b32 v177, v66 +v_mov_b32 v178, v67 +v_mov_b32 v179, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[224:227] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[228:231] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 vcc, -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) 
+v_readfirstlane_b32 s41, v4 +v_mov_b32 v85, v36 +v_mov_b32 v86, v37 +v_mov_b32 v87, v38 +v_mov_b32 v88, v39 +v_mov_b32 v89, v40 +v_mov_b32 v90, v41 +v_mov_b32 v91, v42 +v_mov_b32 v92, v43 +_v_pk_add_f16__vop3p 120, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 121, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 122, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 123, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 124, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 125, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 126, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 127, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 120, 376, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 121, 377, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 122, 378, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 123, 379, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 124, 380, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 125, 381, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 126, 382, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 127, 383, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v120, v44, 0.5, v120 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v121, v45, 0.5, v121 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v122, v46, 0.5, v122 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v123, v47, 0.5, v123 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v124, v48, 0.5, v124 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v125, v49, 0.5, v125 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v126, v50, 0.5, v126 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v127, v51, 0.5, v127 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v153, v44, -1.0, v120 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v154, v45, -1.0, v121 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v155, v46, -1.0, v122 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v156, v47, -1.0, v123 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v157, v48, -1.0, v124 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v158, v49, 
-1.0, v125 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v159, v50, -1.0, v126 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v160, v51, -1.0, v127 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v188, v61 +v_mov_b32 v189, v62 +v_mov_b32 v190, v63 +v_mov_b32 v191, v64 +v_mov_b32 v192, v65 +v_mov_b32 v193, v66 +v_mov_b32 v194, v67 +v_mov_b32 v195, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v1, vcc +s_barrier +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 +s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[232:235] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[236:239] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +v_mov_b32 v77, v36 +v_mov_b32 v78, v37 +v_mov_b32 v79, v38 +v_mov_b32 v80, v39 +v_mov_b32 v81, v40 +v_mov_b32 v82, v41 +v_mov_b32 v83, v42 +v_mov_b32 v84, v43 +_v_pk_add_f16__vop3p 112, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 113, 293, 318, 0x0, 
0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 114, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 115, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 116, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 117, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 118, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 119, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 112, 368, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 113, 369, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 114, 370, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 115, 371, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 116, 372, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 117, 373, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 118, 374, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 119, 375, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v112, v44, 0.5, v112 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v113, v45, 0.5, v113 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v114, v46, 0.5, v114 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v115, v47, 0.5, v115 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v116, v48, 0.5, v116 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v117, v49, 0.5, v117 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v118, v50, 0.5, v118 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v119, v51, 0.5, v119 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v145, v44, -1.0, v112 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v146, v45, -1.0, v113 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v147, v46, -1.0, v114 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v148, v47, -1.0, v115 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v149, v48, -1.0, v116 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v150, v49, -1.0, v117 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v151, v50, -1.0, v118 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v152, v51, -1.0, v119 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v180, v61 +v_mov_b32 v181, v62 +v_mov_b32 v182, v63 +v_mov_b32 v183, v64 +v_mov_b32 v184, v65 
+v_mov_b32 v185, v66 +v_mov_b32 v186, v67 +v_mov_b32 v187, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[224:227] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[228:231] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 vcc, -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +v_mov_b32 v93, v36 +v_mov_b32 v94, v37 +v_mov_b32 v95, v38 +v_mov_b32 v96, v39 +v_mov_b32 v97, v40 +v_mov_b32 v98, v41 +v_mov_b32 v99, v42 +v_mov_b32 v100, v43 +_v_pk_add_f16__vop3p 128, 292, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 129, 293, 318, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 130, 294, 319, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 131, 295, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 132, 296, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 133, 297, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 134, 298, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 135, 299, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 128, 384, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 129, 385, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 130, 386, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 131, 387, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 132, 388, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 133, 389, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 134, 390, 240, 0x0, 0x1, 0x0, 0x0 +_v_pk_mul_f16__vop3p 135, 391, 240, 0x0, 0x1, 0x0, 0x0 +v_pk_fma_f16 v128, v44, 0.5, v128 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v129, 
v45, 0.5, v129 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v130, v46, 0.5, v130 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v131, v47, 0.5, v131 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v132, v48, 0.5, v132 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v133, v49, 0.5, v133 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v134, v50, 0.5, v134 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v135, v51, 0.5, v135 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v161, v44, -1.0, v128 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v162, v45, -1.0, v129 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v163, v46, -1.0, v130 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v164, v47, -1.0, v131 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v165, v48, -1.0, v132 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v166, v49, -1.0, v133 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v167, v50, -1.0, v134 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_pk_fma_f16 v168, v51, -1.0, v135 op_sel:[0,0,0] op_sel_hi:[1,0,1] +v_mov_b32 v196, v61 +v_mov_b32 v197, v62 +v_mov_b32 v198, v63 +v_mov_b32 v199, v64 +v_mov_b32 v200, v65 +v_mov_b32 v201, v66 +v_mov_b32 v202, v67 +v_mov_b32 v203, v68 +s_mov_b32 exec_hi, -1 +v_cndmask_b32 v11, v13, v2, vcc +s_barrier +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[232:235] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[236:239] offset:28416 +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[54:55], -1, 
0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[172:179], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[180:187], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 
0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[104:111], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[112:119], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[137:144], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[145:152], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_cndmask_b32 v12, v14, v3, s[54:55] +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[224:227] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[228:231] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 vcc, -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +v_readfirstlane_b32 s41, v4 +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 
+_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[85:92], v[36:43], v[18:21] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[93:100], v[36:43], v[18:21] op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[188:195], v[61:68], v[30:33] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[196:203], v[61:68], v[30:33] op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[120:127], v[36:43], v[22:25] op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[128:135], v[36:43], v[22:25] op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[153:160], v[61:68], v[26:29] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[161:168], v[61:68], v[26:29] 
op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v1, vcc +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 +s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[232:235] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[236:239] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], 
s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[172:179], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[180:187], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 
309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[104:111], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[112:119], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[137:144], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[145:152], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_cndmask_b32 v12, v14, v3, s[54:55] +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[224:227] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[228:231] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 vcc, -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +s_barrier +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 
0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[85:92], v[36:43], v[18:21] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[93:100], v[36:43], v[18:21] op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[188:195], v[61:68], v[30:33] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[196:203], v[61:68], v[30:33] op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[120:127], v[36:43], v[22:25] op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[128:135], v[36:43], v[22:25] op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[153:160], 
v[61:68], v[26:29] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[161:168], v[61:68], v[26:29] op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v2, vcc +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[232:235] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[236:239] offset:28416 +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v1, v[18:21] offset:4672 +ds_store_b128 v1, v[30:33] offset:16 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 
38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[172:179], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[180:187], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 
+v_wmma_f16_16x16x16_f16 v[22:25], v[104:111], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[112:119], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[137:144], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[145:152], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[224:227] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[228:231] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_sub_u32 s23, s23, s34 +s_cselect_b64 s[56:57], 0, s[56:57] +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 vcc, -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s41, v4 +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 
305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[85:92], v[36:43], v[18:21] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[93:100], v[36:43], v[18:21] op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[188:195], v[61:68], v[30:33] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[196:203], v[61:68], v[30:33] op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[120:127], v[36:43], v[22:25] op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[128:135], v[36:43], v[22:25] op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[153:160], v[61:68], v[26:29] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[161:168], v[61:68], v[26:29] op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v1, vcc +s_barrier +s_bitcmp1_b32 s41, 1 +s_addc_u32 s45, s45, s45 +s_bitcmp1_b32 s41, 0 +s_cselect_b32 s35, 0, s35 +s_cselect_b32 s34, 1, s34 +s_lshr_b32 s39, s41, 16 +ds_load_b128 v[7:10], v5 offset:37120 +ds_load_b32 v4, v6 offset:39168 +s_bitcmp1_b32 s41, 1 +s_cselect_b32 s59, s49, s53 +s_cselect_b64 s[36:37], s[16:17], s[18:19] +s_mul_i32 s56, s39, s59 +s_mul_hi_u32 s57, s39, s59 +s_add_u32 s15, s39, 1 +s_sub_u32 s15, s22, s15 +s_cselect_b32 s39, 0, s35 +s_add_u32 s36, s36, s56 +s_addc_u32 s37, s37, s57 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[232:235] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[236:239] offset:28416 +s_waitcnt lgkmcnt(10) +s_swappc_b64 s[64:65], s[64:65] +ds_store_b128 v2, v[18:21] offset:13952 +ds_store_b128 v2, v[30:33] offset:9296 +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 
+s_bitcmp1_b32 s45, 1 +s_cselect_b64 s[54:55], -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[69:76], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[77:84], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[172:179], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[180:187], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 
0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[104:111], v[36:43], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[112:119], v[36:43], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[137:144], v[61:68], 0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[145:152], v[61:68], 0 op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_cndmask_b32 v12, v14, v3, s[54:55] +s_barrier +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:27840 +ds_load_b128 v[40:43], v11 offset:30144 +ds_load_b128 v[44:47], v11 offset:32512 +ds_load_b128 v[48:51], v11 offset:34816 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[224:227] offset:18560 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:27856 +ds_load_b128 v[57:60], v11 offset:30160 +ds_load_b128 v[61:64], v11 offset:32528 +ds_load_b128 v[65:68], v11 offset:34832 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[228:231] offset:19136 +s_swappc_b64 s[64:65], s[64:65] +s_setprio 1 +s_ashr_i32 s57, s56, 31 +s_add_u32 s64, s64, s56 +s_addc_u32 s65, s65, s57 +s_bitcmp1_b32 s45, 1 +s_cselect_b64 vcc, -1, 0 +s_mov_b32 exec_hi, 0 +s_waitcnt lgkmcnt(0) +_v_pk_add_f16__vop3p 36, 292, 309, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 293, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 38, 294, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 39, 295, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 40, 296, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 41, 297, 314, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 42, 298, 315, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 43, 299, 316, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 61, 317, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 318, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 319, 302, 0x0, 0x3, 0x2, 0x2 
+_v_pk_add_f16__vop3p 64, 320, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 321, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 322, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 323, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 324, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[18:21], v[85:92], v[36:43], v[18:21] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[18:21], v[93:100], v[36:43], v[18:21] op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[30:33], v[188:195], v[61:68], v[30:33] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[30:33], v[196:203], v[61:68], v[30:33] op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 36, 300, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 37, 301, 310, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 302, 311, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 39, 303, 312, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 40, 304, 313, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 41, 305, 314, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 42, 306, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 43, 307, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 61, 309, 300, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 62, 310, 301, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 63, 311, 302, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 64, 312, 303, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 65, 313, 304, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 66, 314, 305, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 67, 315, 306, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 68, 316, 307, 0x0, 0x3, 0x2, 0x2 +v_wmma_f16_16x16x16_f16 v[22:25], v[120:127], v[36:43], v[22:25] op_sel:[0,0,0] op_sel_hi:[1,1,0] +s_mov_b32 exec_hi, -1 +v_wmma_f16_16x16x16_f16 v[22:25], v[128:135], v[36:43], v[22:25] op_sel:[0,0,1] op_sel_hi:[1,1,1] +v_wmma_f16_16x16x16_f16 v[26:29], v[153:160], v[61:68], v[26:29] op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_wmma_f16_16x16x16_f16 v[26:29], v[161:168], v[61:68], v[26:29] op_sel:[0,0,1] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 18, 274, 
278, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 280, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 281, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 278, 286, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 31, 279, 287, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 32, 280, 288, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 33, 281, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 18, 274, 282, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 19, 275, 283, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 20, 276, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 21, 277, 285, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 30, 286, 282, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 31, 287, 283, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 32, 288, 284, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 33, 289, 285, 0x0, 0x3, 0x2, 0x2 +v_cndmask_b32 v11, v13, v2, vcc +s_barrier +s_mov_b32 exec_hi, 0 +ds_load_b128 v[36:39], v11 offset:18560 +ds_load_b128 v[40:43], v11 offset:20864 +ds_load_b128 v[44:47], v11 offset:23232 +ds_load_b128 v[48:51], v11 offset:25536 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[232:235] offset:27840 +s_mov_b32 exec_hi, 0 +ds_load_b128 v[53:56], v11 offset:18576 +ds_load_b128 v[57:60], v11 offset:20880 +ds_load_b128 v[61:64], v11 offset:23248 +ds_load_b128 v[65:68], v11 offset:25552 +s_mov_b32 exec_hi, -1 +ds_store_b128 v12, v[236:239] offset:28416 +s_swappc_b64 s[64:65], s[64:65] +v_bfe_u32 v21, v0, 6, 1 +v_and_b32 v16, 63, v0 +v_cmp_eq_u32 vcc, v21, 1 +v_cndmask_b32 v23, 0, 0x400, vcc +v_cndmask_b32 v21, 0, 0x400, vcc +v_cndmask_b32 v22, 0, 0x100, vcc +v_lshl_add_u32 v14, v16, 3, v23 +v_lshl_add_u32 v17, v16, 2, v22 +v_lshl_add_u32 v18, v16, 2, 0 +v_lshl_add_u32 v16, v16, 4, v21 +s_cmp_eq_u64 s[30:31], 0 +s_cselect_b32 s91, 0, 0x11014000 +s_and_b32 s31, s31, 0xffff +s_add_u32 s31, s31, 0x20000 +s_mov_b64 s[88:89], s[30:31] +s_mov_b32 s90, 0x80000000 +v_and_b32 v21, v0, 63 +v_lshlrev_b32 v21, 1, v21 +v_cmp_lt_u32 vcc, v21, s12 
+v_add_nc_u32 v22, v21, 1 +v_cndmask_b32 v21, 0x80000000, v21, vcc +v_cmp_lt_u32 vcc, v22, s12 +v_cndmask_b32 v22, 0x80000000, v22, vcc +buffer_load_d16_b16 v23, v21, s[88:91], 0 idxen +buffer_load_d16_hi_b16 v23, v22, s[88:91], 0 idxen +s_waitcnt vmcnt(0) +v_readlane_b32 s56, v23, 0 +v_readlane_b32 s57, v23, 1 +v_readlane_b32 s59, v23, 2 +v_readlane_b32 s64, v23, 3 +v_readlane_b32 s65, v23, 4 +v_readlane_b32 s66, v23, 5 +v_readlane_b32 s67, v23, 6 +v_readlane_b32 s68, v23, 7 +s_bfe_u32 s88, s58, 0x80000 +s_cmp_eq_u32 s88, 2 +s_cbranch_scc1 20 +s_cmp_eq_u32 s88, 0 +s_cselect_b32 s32, 1.0, s32 +v_cvt_f16_f32 v21, s32 +v_readfirstlane_b32 s32, v21 +v_cvt_f16_f32 v21, s33 +v_readfirstlane_b32 s33, v21 +_v_cmp_gt_f16__vop3_s_lit 106, 32, 0x3c00, 0x0, 0x0 +s_pack_ll_b32_b16 s32, s32, s32 +s_pack_ll_b32_b16 s33, s33, s33 +s_cmp_eq_u32 s88, 3 +s_cbranch_scc1 10 +s_cbranch_vccnz 3 +s_mov_b32 s84, 0x564c +s_branch 8 +s_mov_b32 s84, 0x59e4 +s_branch 5 +s_mov_b32 s84, 0x5d7c +s_branch 2 +s_mov_b32 s84, 0x6394 +s_add_u32 s86, s6, 0x4b18 +s_addc_u32 s87, s7, 0 +s_mov_b32 s82, 0xbc00c000 +s_mov_b32 s40, 0x10000 +s_mov_b32 s41, 0x30002 +s_mov_b32 s45, 0x10000 +v_readfirstlane_b32 s88, v0 +s_and_b32 null, 64, s88 +s_cmov_b32 s82, 0x3c00c000 +s_cmov_b32 s40, 0x20003 +s_cmov_b32 s41, 1 +s_cmov_b32 s45, 1 +s_and_b32 s21, s21, 0xffff +s_add_u32 s21, s21, 0x20000 +s_lshl_b32 s80, s51, 1 +s_lshl_b32 s81, s52, 1 +s_mov_b64 s[72:73], s[20:21] +s_mov_b32 s74, 0x80000000 +s_mov_b32 s75, 0 +s_sub_u32 s89, s25, 1 +s_bitcmp1_b32 s14, 1 +s_cselect_b32 s89, s89, 0 +s_cselect_b32 s88, -1, 1 +s_sub_u32 s91, s24, 1 +s_bitcmp1_b32 s14, 0 +s_cselect_b32 s91, s91, 0 +s_cselect_b32 s90, -1, 1 +v_bfe_u32 v24, v0, 6, 1 +v_bfe_u32 v25, v0, 4, 1 +v_bfe_u32 v21, v0, 5, 1 +v_lshl_add_u32 v24, v24, 2, 0 +v_lshl_add_u32 v25, v25, 3, v24 +v_bfe_u32 v23, v0, 2, 2 +v_bfe_u32 v24, v0, 3, 1 +v_xor_b32 v22, v0, v0 quad_perm:[0,0,3,1] +v_lshl_add_u32 v21, v21, 1, v25 +v_xor_b32 v23, v23, v24 +v_add_nc_u32 v24, v21, 1 
+v_mad_i32_i16 v19, v23, s88, s89 op_sel:[0,0,0,0] +v_mad_i32_i16 v25, v22, s90, s91 op_sel:[0,0,0,0] +v_mad_u32_u16 v19, v25, s48, v19 op_sel:[0,0,0,0] +v_cmp_lt_u32 vcc, v23, s25 +v_cndmask_b32 v19, 0x80000000, v19, vcc +v_cmp_lt_u32 vcc, v22, s24 +v_cndmask_b32 v19, 0x80000000, v19, vcc +v_mad_u32_u24 v20, v24, s46, v19 +v_mad_u32_u24 v19, v21, s46, v19 +v_cmp_lt_u32 vcc, v24, s12 +v_cndmask_b32 v20, 0x80000000, v20, vcc +v_cmp_lt_u32 vcc, v21, s12 +v_cndmask_b32 v19, 0x80000000, v19, vcc +s_add_u32 s89, s28, 1 +s_lshr_b32 s89, s89, 1 +s_lshl_b32 s90, s89, 1 +s_add_u32 s91, s29, 1 +s_lshr_b32 s91, s91, 1 +s_lshl1_add_u32 s91, s91, 2 +s_pack_ll_b32_b16 s22, s91, s89 +s_pack_ll_b32_b16 s34, s11, s10 +s_sub_u32 s35, s90, s26 +s_sub_u32 s88, s91, s27 +s_pack_ll_b32_b16 s35, s88, s35 +s_pack_ll_b32_b16 s37, s29, s28 +s_sub_u32 s88, s91, 1 +s_pack_ll_b32_b16 s38, s88, s90 +v_lshrrev_b32 v24, 16, s22 +v_bfi_b32 v25, 0xffff, s22, 0 +v_and_b32 v27, 1, v0 +v_bfe_u32 v33, v0, 6, 1 +v_and_b32 v22, 63, v0 +v_mad_u32_u16 v28, 0x7c, s1, 0 op_sel:[0,0,0,0] +v_mad_u32_u16 v33, 2, s5, v33 op_sel:[0,0,0,0] +v_mad_u32_u16 v26, v24, v25, 0 op_sel:[0,0,0,0] +v_cmp_eq_u32 vcc, 0, v27 +v_cndmask_b32 v34, v26, v25, vcc +v_mad_u32_u16 v23, 62, v33, v22 op_sel:[0,0,0,0] +v_cndmask_b32 v23, v28, v23, vcc +v_clz_i32_u32 v40, v34 +v_lshlrev_b32 v41, v40, v34 +v_and_b32 v39, 0xffffff00, v41 +v_cmp_eq_u32 vcc, 0x80000000, v41 +v_cvt_f32_u32 v39, v39 +v_rcp_f32 v35, v39 +v_sub_co_ci_u32 v36, vcc, 32, v40, vcc +v_cvt_f32_ubyte0 v40, v41 +v_fma_f32 v39, v39, v35, -1.0 +v_fma_f32 v39, v40, v35, v39 +v_fmaak_f32 v39, v39, v35, 0x9f000000 +v_mul_f32 v39, 0x5f800000, v39 +v_mov_b32 v40, 0 +v_cvt_floor_i32_f32 v39, -v39 +v_lshl_add_u32 v35, v35, 9, v39 +v_mad_u64_u32 v[40:41], vcc, v41, v35, v[40:41] +v_sub_co_ci_u32 v35, vcc, v35, -1, vcc +v_mov_b32 v38, v36 quad_perm:[1,1,1,1] +v_mov_b32 v36, v36 quad_perm:[0,0,0,0] +v_mov_b32 v37, v35 quad_perm:[1,1,1,1] +v_mov_b32 v35, v35 quad_perm:[0,0,0,0] 
+v_mul_hi_u32 v39, v23, v35 +v_add_co_u32 v21, vcc, v39, v23 +v_add_co_ci_u32 v39, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v36 +v_cndmask_b32 v21, v21, v39, vcc +v_alignbit_b32 v21, v39, v21, v36 +v_mul_hi_u32 v39, v23, v37 +v_add_co_u32 v4, vcc, v39, v23 +v_add_co_ci_u32 v39, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v38 +v_cndmask_b32 v4, v4, v39, vcc +v_alignbit_b32 v4, v39, v4, v38 +v_mad_u32_u16 v32, v21, v25, 0 op_sel:[0,0,0,0] +v_mad_u32_u16 v31, v4, v24, 0 op_sel:[0,0,0,0] +v_sub_nc_u32 v32, v23, v32 +v_sub_nc_u32 v31, v21, v31 +v_readlane_b32 s92, v32, 1 +v_sub_nc_u32 v32, v32, v25 +v_readlane_b32 s23, v31, 1 +v_sub_nc_u32 v31, v31, v24 +v_readlane_b32 s15, v4, 1 +v_sub_nc_u32 v4, v4, s8 +s_lshl_b32 s23, s23, 16 +s_and_b32 s92, s92, 0xffff +s_add_u32 s23, s23, s92 +v_mov_b32 v32, v32 quad_perm:[0,0,2,2] +v_mov_b32 v31, v31 quad_perm:[0,0,2,2] +v_mov_b32 v4, v4 quad_perm:[0,0,2,2] +v_add_co_u32 v32, vcc, v32, v27 +v_cndmask_b32 v30, 0, v25, vcc +v_add_co_ci_u32 v31, vcc, v31, 0, vcc +v_cndmask_b32 v29, 0, v24, vcc +v_add_co_ci_u32 v4, vcc, v4, 0, vcc +v_min_u32 v27, v22, 63 +v_sub_nc_u32 v32, v32, v30 +v_sub_nc_u32 v31, v31, v29 +v_cmp_eq_u32 vcc, v22, v27 +v_lshlrev_b32 v5, 16, v31 +v_bfi_b32 v5, 0xffff, v32, v5 +v_add_nc_u32 v42, v4, s8 +v_med3_u32 v27, v22, 1, 62 +v_mul_lo_u32 v6, v42, s42 +v_mul_lo_u32 v11, v42, s50 +s_mul_i32 s36, s15, s42 +s_mul_i32 s39, s15, s50 +v_cndmask_b32 v6, 0x80000000, v6, vcc +v_cmp_eq_u32 vcc, v22, v27 +v_cndmask_b32 v11, 0x80000000, v11, vcc +v_cmp_ge_u32 s[54:55], v42, s8 +v_cndmask_b32 v6, v6, 0x80000000, s[54:55] +v_cndmask_b32 v11, v11, 0x80000000, s[54:55] +s_mov_b32 s49, 1 +s_lshl_b32 s53, s49, 9 +v_add_nc_u32 v15, s53, v14 +s_bfe_u32 s10, s58, 0x80008 +s_bfe_u32 s11, s58, 0x80010 +s_cmp_eq_u32 s11, 0 +s_cmov_b32 s26, 0 +s_cbranch_scc1 108 +s_add_u32 s11, s11, 0xffffff00 +s_add_u32 s60, s60, 0 +s_addc_u32 s61, s61, 0 +s_lshr_b32 s91, s13, 2 +s_or_b32 s91, s91, 0x21010000 +v_cmp_eq_u32 vcc, v0, 0x100 +s_cmp_eq_u64 vcc, 0 
+s_cselect_b32 s91, 0, s91 +s_cselect_b32 s90, 0, 0x1010101 +s_sub_u32 s10, 0, s10 +s_mov_b64 s[88:89], s[60:61] +s_and_b32 s89, s89, 0xffff +s_or_b32 s89, s89, 0x40000 +s_and_b32 s29, s22, 0xffff +s_lshr_b32 s28, s22, 16 +s_lshr_b32 s29, s29, 1 +s_mul_i32 s27, s29, s28 +s_mul_i32 s27, s27, s8 +s_add_u32 s27, s27, 61 +v_writelane_b32 v22, 62, 0 +v_writelane_b32 v22, s1, 1 +v_writelane_b32 v22, 10, 2 +v_clz_i32_u32 v26, v22 +v_lshlrev_b32 v27, v26, v22 +v_and_b32 v28, 0xffffff00, v27 +v_cmp_eq_u32 vcc, 0x80000000, v27 +v_cvt_f32_u32 v28, v28 +v_rcp_f32 v24, v28 +v_sub_co_ci_u32 v25, vcc, 32, v26, vcc +v_cvt_f32_ubyte0 v26, v27 +v_fma_f32 v28, v28, v24, -1.0 +v_fma_f32 v28, v26, v24, v28 +v_fmaak_f32 v28, v28, v24, 0x9f000000 +v_mul_f32 v28, 0x5f800000, v28 +v_mov_b32 v26, 0 +v_cvt_floor_i32_f32 v28, -v28 +v_lshl_add_u32 v24, v24, 9, v28 +v_mad_u64_u32 v[26:27], vcc, v27, v24, v[26:27] +v_sub_co_ci_u32 v24, vcc, v24, -1, vcc +v_mul_hi_u32 v26, s27, v24 +v_add_co_u32 v23, vcc, v26, s27 +v_add_co_ci_u32 v26, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v25 +v_cndmask_b32 v23, v23, v26, vcc +v_alignbit_b32 v23, v26, v23, v25 +v_mov_b32 v23, v23 quad_perm:[0,0,0,0] +v_mul_hi_u32 v26, v23, v24 +v_add_co_u32 v22, vcc, v26, v23 +v_add_co_ci_u32 v26, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v25 +v_cndmask_b32 v22, v22, v26, vcc +v_alignbit_b32 v22, v26, v22, v25 +v_mov_b32 v22, v22 quad_perm:[1,1,1,1] +v_add_nc_u32 v23, v22, 9 +v_mul_hi_u32 v26, v23, v24 +v_add_co_u32 v23, vcc, v26, v23 +v_add_co_ci_u32 v26, vcc, 0, 0, vcc +v_cmp_eq_u32 vcc, 32, v25 +v_cndmask_b32 v23, v23, v26, vcc +v_alignbit_b32 v23, v26, v23, v25 +v_readlane_b32 s28, v22, 1 +v_readlane_b32 s29, v23, 2 +s_add_u32 s27, s9, 31 +s_lshr_b32 s27, s27, 5 +s_cmp_eq_u32 s27, 1 +s_cmov_b32 s29, 1 +s_add_u32 s26, s28, s29 +s_mul_i32 s26, s27, s26 +s_add_u32 s26, 4, s26 +s_sub_u32 s26, s26, 1 +s_mov_b32 s92, 0 +s_mov_b32 s93, 0 +s_mov_b32 s94, 0 +s_mov_b32 s95, 0 +s_mov_b32 s28, 0 +s_mov_b32 s27, 8 +s_cmp_gt_u32 s28, 0 
+s_cbranch_scc1 4 +v_mov_b32 v58, v4 +v_mov_b32 v63, v5 +v_mov_b32 v225, v6 +v_mov_b32 v226, v11 +v_mov_b32 v4, v58 +v_mov_b32 v5, v63 +v_mov_b32 v6, v225 +v_mov_b32 v11, v226 +s_add_u32 s28, s28, 32 +s_cmp_ge_u32 s28, s9 +s_cmov_b32 s28, 0 +s_cselect_b32 s29, 6, 2 +s_cselect_b32 s96, 9, 0 +s_pack_lh_b32_b16 s29, s29, s27 +s_pack_ll_b32_b16 s96, s96, s28 +v_mov_b32 v224, s29 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v44, v49, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v45, v50, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v46, v51, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v47, v52, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v7, v19 +v_mov_b32 v8, v20 +v_mov_b32 v9, 0x80000000 +v_mov_b32 v10, 0x80000000 +v_mov_b32 v12, 0x80000000 +v_mov_b32 v13, 0x80000000 +s_setprio 0 +ds_load_b128 v[34:37], v3 +ds_store_b128 v16, v[7:10] offset:37120 +ds_load_b128 v[39:42], v3 offset:576 +ds_store_b32 v17, v224 offset:39168 +s_setprio 2 +s_sub_u32 s26, s26, 1 +s_cselect_b32 s91, 0x21010000, s91 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x4b18 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_add_nc_u32 v15, s53, v14 +v_mov_b32 v245, v243 +v_mov_b32 v246, v244 +v_pk_fma_f16 v227, v34, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v228, v35, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v229, v36, s82, v26 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v230, v37, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 34, 285, 290, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 35, 286, 291, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 36, 287, 292, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 288, 293, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 231, 290, 295, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 291, 296, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 292, 297, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 293, 
298, 0x0, 0x3, 0x0, 0x0 +s_setprio 0 +ds_load_b64 v[243:244], v15 offset:39680 +ds_load_b128 v[54:57], v3 offset:2304 +ds_load_b128 v[59:62], v3 offset:2880 +s_barrier +s_nop 15 +s_setprio 2 +s_mov_b32 s92, s93 +s_mov_b32 s93, s94 +s_mov_b32 s94, s95 +s_mov_b32 s95, s27 +s_bitcmp1_b32 s92, 0 +s_cbranch_scc1 2827 +s_sub_u32 s49, s49, 1 +s_cselect_b32 s49, 1, s49 +s_lshl_b32 s53, s49, 9 +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x4b1c +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s75, 0x11014000, 0 +s_sub_u32 s69, s12, 1 +s_cselect_b32 s75, 0, s75 +s_mov_b64 s[72:73], s[20:21] +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v235, v54, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v236, v55, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v237, v56, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v238, v57, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 54, 305, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 55, 306, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 56, 307, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 57, 308, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 239, 310, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 311, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 312, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 313, 318, 0x0, 0x3, 0x0, 0x0 +s_add_u32 s11, s11, 0x100 +s_cbranch_scc0 7 +s_bitset0_b32 s91, 23 +s_lshl_b64 exec, 1, s90 +buffer_store_b8 v0, off, s[88:91], s4 +s_mov_b64 exec, -1 +s_mul_i32 s11, s11, 0xffffff01 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 3 +s_bitset1_b32 s91, 23 +buffer_load_b32 v21, off, s[88:91], null glc +s_setprio 0 +s_nop 1 +ds_load_b128 v[24:27], v3 offset:9280 +ds_store_b64 v15, v[12:13] offset:39680 +ds_load_b128 v[29:32], v3 offset:9856 +ds_load_b32 v224, v18 offset:39168 +s_setprio 2 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x4b18 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, 
s7, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s27, v224 +v_pk_fma_f16 v24, v29, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v25, v30, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v26, v31, s82, v26 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v27, v32, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +s_setprio 0 +ds_load_b128 v[44:47], v3 offset:11584 +ds_load_b128 v[49:52], v3 offset:12160 +s_barrier +s_nop 15 +s_setprio 2 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 25 +s_pack_ll_b32_b16 s10, s10, s10 +s_bfm_b64 exec, s91, 0 +v_cmp_ne_u32 vcc, v21, s90 +s_cbranch_vccz 12 +buffer_load_b32 v21, off, s[88:91], null glc +s_cmp_eq_u32 s10, 0 +s_cselect_b32 vcc_lo, 0, 0x10000 +s_add_u32 s10, s10, vcc_lo +s_cbranch_scc1 2 +s_waitcnt vmcnt(0) +s_branch 65524 +s_and_b32 s91, 0xffff0000, s91 +s_mov_b32 s10, 0 +s_mov_b64 exec, -1 +s_mul_i32 s90, s90, 3 +s_and_b32 s90, s90, 0x3f3f3f3f +s_add_u32 s88, s88, 0x100 +s_and_b32 s88, s88, 0xfffff7ff +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x4c7c +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_cmp_le_u32 s9, 32 +s_cselect_b32 s97, -1, 9 +s_sub_u32 s97, s97, 1 +s_cselect_b32 s29, s96, s29 +s_bitset0_b32 s29, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v44, v49, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v45, v50, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v46, v51, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v47, v52, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_mov_b32 v224, s29 +v_add_co_u32 v33, vcc, v5, s23 +v_pk_mad_u16 v23, v5, 0x20001, s35 +v_pk_mad_u16 v28, v5, 0x20001, s38 +_v_pk_min_u16__vop3p 22, 289, 261, 0x0, 0x3, 0x0, 0x0 +v_cndmask_b32 v43, 0, s42, vcc +v_cndmask_b32 v247, 0, s50, vcc +v_mad_u32_u16 v7, v23, 1, v6 op_sel:[0,0,0,0] +v_mad_u32_u16 v12, v28, 1, v11 op_sel:[0,0,0,0] +v_add3_u32 v6, v6, s36, v43 +v_add3_u32 v11, v11, s39, v247 +_v_pk_sub_u16__vop3p 22, 261, 
278, 0x0, 0x3, 0x0, 0x0 +v_add_co_ci_u32 v4, s[54:55], v4, s15, vcc +v_cndmask_b32 v6, v6, 0x80000000, s[54:55] +v_cndmask_b32 v11, v11, 0x80000000, s[54:55] +v_cmp_lt_u16 vcc, v23, s34 +v_cndmask_b32 v7, 0x80000000, v7, vcc +v_cmp_lt_u16 vcc, v28, s37 +v_cndmask_b32 v12, 0x80000000, v12, vcc +_v_pk_ashrrev_i16__vop3p 22, 143, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_u16__vop3p 53, 279, 41, 0x1, 0x3, 0x0, 0x0 +_v_pk_add_u16__vop3p 48, 279, 40, 0x1, 0x3, 0x0, 0x0 +v_mad_u32_u16 v10, v53, s44, v7 op_sel:[1,0,0,0] +v_mad_u32_u16 v8, v48, s44, v7 op_sel:[1,0,0,0] +_v_pk_add_u16__vop3p 38, 284, 45, 0x1, 0x3, 0x0, 0x0 +_v_cmp_lt_u16__vop3 106, 53, 34, 0x3 +v_cndmask_b32 v10, 0x80000000, v10, vcc +_v_cmp_lt_u16__vop3 106, 48, 34, 0x3 +v_cndmask_b32 v8, 0x80000000, v8, vcc +v_mad_u32_u16 v13, v38, s52, v12 op_sel:[1,0,0,0] +v_mad_u32_u16 v9, v53, s44, v7 op_sel:[0,0,0,0] +v_mad_u32_u16 v7, v48, s44, v7 op_sel:[0,0,0,0] +_v_cmp_lt_u16__vop3 106, 38, 37, 0x3 +v_cndmask_b32 v13, 0x80000000, v13, vcc +_v_cmp_lt_u16__vop3 106, 53, 34, 0x2 +v_cndmask_b32 v9, 0x80000000, v9, vcc +_v_cmp_lt_u16__vop3 106, 48, 34, 0x2 +v_cndmask_b32 v7, 0x80000000, v7, vcc +v_mad_u32_u16 v12, v38, s52, v12 op_sel:[0,0,0,0] +v_pk_mad_u16 v5, v22, s22, v33 +_v_cmp_lt_u16__vop3 106, 38, 37, 0x2 +v_cndmask_b32 v12, 0x80000000, v12, vcc +v_add_co_u32 v22, vcc, v4, s8 +v_cndmask_b32 v224, s96, v224, vcc +s_setprio 0 +ds_load_b128 v[34:37], v3 +ds_store_b128 v16, v[7:10] offset:37120 +ds_load_b128 v[39:42], v3 offset:576 +ds_store_b32 v17, v224 offset:39168 +s_setprio 2 +s_sub_u32 s26, s26, 1 +s_cselect_b32 s91, 0x21010000, s91 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x4b18 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_add_nc_u32 v15, s53, v14 +v_mov_b32 v245, v243 +v_mov_b32 v246, v244 +v_pk_fma_f16 v227, v34, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v228, v35, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v229, v36, 
s82, v26 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v230, v37, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 34, 285, 290, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 35, 286, 291, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 36, 287, 292, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 37, 288, 293, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 231, 290, 295, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 291, 296, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 292, 297, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 293, 298, 0x0, 0x3, 0x0, 0x0 +s_setprio 0 +ds_load_b64 v[243:244], v15 offset:39680 +ds_load_b128 v[54:57], v3 offset:2304 +ds_load_b128 v[59:62], v3 offset:2880 +s_barrier +s_nop 15 +s_setprio 2 +s_mov_b32 s92, s93 +s_mov_b32 s93, s94 +s_mov_b32 s94, s95 +s_mov_b32 s95, s27 +s_bitcmp1_b32 s92, 0 +s_cbranch_scc1 2533 +s_sub_u32 s49, s49, 1 +s_cselect_b32 s49, 1, s49 +s_lshl_b32 s53, s49, 9 +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x4b1c +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s75, 0x11014000, 0 +s_sub_u32 s69, s12, 1 +s_cselect_b32 s75, 0, s75 +s_mov_b64 s[72:73], s[20:21] +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +s_barrier +v_pk_fma_f16 v235, v54, s82, v44 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v236, v55, s82, v45 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v237, v56, s82, v46 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v238, v57, s82, v47 op_sel:[0,1,0] op_sel_hi:[1,1,1] +_v_pk_add_f16__vop3p 54, 305, 310, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 55, 306, 311, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 56, 307, 312, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 57, 308, 313, 0x0, 0x3, 0x2, 0x2 +_v_pk_add_f16__vop3p 239, 310, 315, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 311, 316, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 312, 317, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 313, 318, 0x0, 0x3, 0x0, 0x0 +s_add_u32 s11, s11, 0x100 +s_cbranch_scc0 7 +s_bitset0_b32 s91, 23 
+s_lshl_b64 exec, 1, s90 +buffer_store_b8 v0, off, s[88:91], s4 +s_mov_b64 exec, -1 +s_mul_i32 s11, s11, 0xffffff01 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 3 +s_bitset1_b32 s91, 23 +buffer_load_b32 v21, off, s[88:91], null glc +s_setprio 0 +s_nop 1 +ds_load_b128 v[24:27], v3 offset:9280 +ds_store_b64 v15, v[12:13] offset:39680 +ds_load_b128 v[29:32], v3 offset:9856 +ds_load_b32 v224, v18 offset:39168 +s_setprio 2 +s_bitcmp1_b32 s92, 2 +s_cselect_b32 s86, s84, 0x4b18 +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_swappc_b64 s[86:87], s[86:87] +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s27, v224 +v_pk_fma_f16 v24, v29, s82, v24 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v25, v30, s82, v25 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v26, v31, s82, v26 op_sel:[0,1,0] op_sel_hi:[1,1,1] +v_pk_fma_f16 v27, v32, s82, v27 op_sel:[0,1,0] op_sel_hi:[1,1,1] +s_setprio 0 +ds_load_b128 v[44:47], v3 offset:11584 +ds_load_b128 v[49:52], v3 offset:12160 +s_barrier +s_nop 15 +s_setprio 2 +s_and_not1_b32 null, 0xffffff00, s11 +s_cbranch_scc1 25 +s_pack_ll_b32_b16 s10, s10, s10 +s_bfm_b64 exec, s91, 0 +v_cmp_ne_u32 vcc, v21, s90 +s_cbranch_vccz 12 +buffer_load_b32 v21, off, s[88:91], null glc +s_cmp_eq_u32 s10, 0 +s_cselect_b32 vcc_lo, 0, 0x10000 +s_add_u32 s10, s10, vcc_lo +s_cbranch_scc1 2 +s_waitcnt vmcnt(0) +s_branch 65524 +s_and_b32 s91, 0xffff0000, s91 +s_mov_b32 s10, 0 +s_mov_b64 exec, -1 +s_mul_i32 s90, s90, 3 +s_and_b32 s90, s90, 0x3f3f3f3f +s_add_u32 s88, s88, 0x100 +s_and_b32 s88, s88, 0xfffff7ff +s_bitcmp1_b32 s92, 1 +s_cselect_b32 s86, s85, 0x4c7c +s_add_u32 s86, s6, s86 +s_addc_u32 s87, s7, 0 +s_bitcmp1_b32 s27, 1 +s_cbranch_scc1 65242 +s_branch 65012 +s_setpc_b64 s[86:87] +s_bitcmp1_b32 s92, 3 +s_cbranch_scc0 80 +v_mov_b32 v64, 0 +v_mov_b32 v68, 0 +v_mov_b32 v65, 0 +v_mov_b32 v69, 0 +v_mov_b32 v66, 0 +v_mov_b32 v70, 0 +v_mov_b32 v67, 0 +v_mov_b32 v71, 0 +v_mov_b32 v80, 0 +v_mov_b32 v84, 0 +v_mov_b32 v81, 0 +v_mov_b32 v85, 0 +v_mov_b32 v82, 0 
+v_mov_b32 v86, 0 +v_mov_b32 v83, 0 +v_mov_b32 v87, 0 +v_mov_b32 v96, 0 +v_mov_b32 v100, 0 +v_mov_b32 v97, 0 +v_mov_b32 v101, 0 +v_mov_b32 v98, 0 +v_mov_b32 v102, 0 +v_mov_b32 v99, 0 +v_mov_b32 v103, 0 +v_mov_b32 v112, 0 +v_mov_b32 v116, 0 +v_mov_b32 v113, 0 +v_mov_b32 v117, 0 +v_mov_b32 v114, 0 +v_mov_b32 v118, 0 +v_mov_b32 v115, 0 +v_mov_b32 v119, 0 +v_mov_b32 v128, 0 +v_mov_b32 v132, 0 +v_mov_b32 v129, 0 +v_mov_b32 v133, 0 +v_mov_b32 v130, 0 +v_mov_b32 v134, 0 +v_mov_b32 v131, 0 +v_mov_b32 v135, 0 +v_mov_b32 v144, 0 +v_mov_b32 v148, 0 +v_mov_b32 v145, 0 +v_mov_b32 v149, 0 +v_mov_b32 v146, 0 +v_mov_b32 v150, 0 +v_mov_b32 v147, 0 +v_mov_b32 v151, 0 +v_mov_b32 v160, 0 +v_mov_b32 v164, 0 +v_mov_b32 v161, 0 +v_mov_b32 v165, 0 +v_mov_b32 v162, 0 +v_mov_b32 v166, 0 +v_mov_b32 v163, 0 +v_mov_b32 v167, 0 +v_mov_b32 v176, 0 +v_mov_b32 v180, 0 +v_mov_b32 v177, 0 +v_mov_b32 v181, 0 +v_mov_b32 v178, 0 +v_mov_b32 v182, 0 +v_mov_b32 v179, 0 +v_mov_b32 v183, 0 +v_mov_b32 v192, 0 +v_mov_b32 v196, 0 +v_mov_b32 v193, 0 +v_mov_b32 v197, 0 +v_mov_b32 v194, 0 +v_mov_b32 v198, 0 +v_mov_b32 v195, 0 +v_mov_b32 v199, 0 +v_mov_b32 v208, 0 +v_mov_b32 v212, 0 +v_mov_b32 v209, 0 +v_mov_b32 v213, 0 +v_mov_b32 v210, 0 +v_mov_b32 v214, 0 +v_mov_b32 v211, 0 +v_mov_b32 v215, 0 +s_mov_b32 s85, 0x4ddc +s_cmp_le_u32 s9, 32 +s_cmov_b32 s85, 0x4b18 +s_setpc_b64 s[86:87] +s_bitcmp1_b32 s92, 3 +s_cbranch_scc0 80 +v_mov_b32 v72, 0 +v_mov_b32 v76, 0 +v_mov_b32 v73, 0 +v_mov_b32 v77, 0 +v_mov_b32 v74, 0 +v_mov_b32 v78, 0 +v_mov_b32 v75, 0 +v_mov_b32 v79, 0 +v_mov_b32 v88, 0 +v_mov_b32 v92, 0 +v_mov_b32 v89, 0 +v_mov_b32 v93, 0 +v_mov_b32 v90, 0 +v_mov_b32 v94, 0 +v_mov_b32 v91, 0 +v_mov_b32 v95, 0 +v_mov_b32 v104, 0 +v_mov_b32 v108, 0 +v_mov_b32 v105, 0 +v_mov_b32 v109, 0 +v_mov_b32 v106, 0 +v_mov_b32 v110, 0 +v_mov_b32 v107, 0 +v_mov_b32 v111, 0 +v_mov_b32 v120, 0 +v_mov_b32 v124, 0 +v_mov_b32 v121, 0 +v_mov_b32 v125, 0 +v_mov_b32 v122, 0 +v_mov_b32 v126, 0 +v_mov_b32 v123, 0 +v_mov_b32 v127, 0 +v_mov_b32 
v136, 0 +v_mov_b32 v140, 0 +v_mov_b32 v137, 0 +v_mov_b32 v141, 0 +v_mov_b32 v138, 0 +v_mov_b32 v142, 0 +v_mov_b32 v139, 0 +v_mov_b32 v143, 0 +v_mov_b32 v152, 0 +v_mov_b32 v156, 0 +v_mov_b32 v153, 0 +v_mov_b32 v157, 0 +v_mov_b32 v154, 0 +v_mov_b32 v158, 0 +v_mov_b32 v155, 0 +v_mov_b32 v159, 0 +v_mov_b32 v168, 0 +v_mov_b32 v172, 0 +v_mov_b32 v169, 0 +v_mov_b32 v173, 0 +v_mov_b32 v170, 0 +v_mov_b32 v174, 0 +v_mov_b32 v171, 0 +v_mov_b32 v175, 0 +v_mov_b32 v184, 0 +v_mov_b32 v188, 0 +v_mov_b32 v185, 0 +v_mov_b32 v189, 0 +v_mov_b32 v186, 0 +v_mov_b32 v190, 0 +v_mov_b32 v187, 0 +v_mov_b32 v191, 0 +v_mov_b32 v200, 0 +v_mov_b32 v204, 0 +v_mov_b32 v201, 0 +v_mov_b32 v205, 0 +v_mov_b32 v202, 0 +v_mov_b32 v206, 0 +v_mov_b32 v203, 0 +v_mov_b32 v207, 0 +v_mov_b32 v216, 0 +v_mov_b32 v220, 0 +v_mov_b32 v217, 0 +v_mov_b32 v221, 0 +v_mov_b32 v218, 0 +v_mov_b32 v222, 0 +v_mov_b32 v219, 0 +v_mov_b32 v223, 0 +s_mov_b32 s85, 0x4ddc +s_cmp_le_u32 s9, 32 +s_cmov_b32 s85, 0x4b18 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 320, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 321, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 322, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 323, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 324, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 325, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 326, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 327, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v64, v227 +v_mov_b32 v65, v228 +v_mov_b32 v66, v229 +v_mov_b32 v67, v230 +v_mov_b32 v68, v231 +v_mov_b32 v69, v232 +v_mov_b32 v70, v233 +v_mov_b32 v71, v234 +s_mov_b32 s85, 0x4e48 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 328, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 329, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 330, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 331, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 332, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 333, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 241, 497, 334, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 335, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v72, v235 +v_mov_b32 v73, v236 +v_mov_b32 v74, v237 +v_mov_b32 v75, v238 +v_mov_b32 v76, v239 +v_mov_b32 v77, v240 +v_mov_b32 v78, v241 +v_mov_b32 v79, v242 +s_mov_b32 s85, 0x4eb4 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 336, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 337, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 338, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 339, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 340, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 341, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 342, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 343, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v80, v227 +v_mov_b32 v81, v228 +v_mov_b32 v82, v229 +v_mov_b32 v83, v230 +v_mov_b32 v84, v231 +v_mov_b32 v85, v232 +v_mov_b32 v86, v233 +v_mov_b32 v87, v234 +s_mov_b32 s85, 0x4f20 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 344, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 345, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 346, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 347, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 348, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 349, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 350, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 351, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v88, v235 +v_mov_b32 v89, v236 +v_mov_b32 v90, v237 +v_mov_b32 v91, v238 +v_mov_b32 v92, v239 +v_mov_b32 v93, v240 +v_mov_b32 v94, v241 +v_mov_b32 v95, v242 +s_mov_b32 s85, 0x4f8c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 352, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 353, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 354, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 355, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 356, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 357, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 233, 489, 358, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 359, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v96, v227 +v_mov_b32 v97, v228 +v_mov_b32 v98, v229 +v_mov_b32 v99, v230 +v_mov_b32 v100, v231 +v_mov_b32 v101, v232 +v_mov_b32 v102, v233 +v_mov_b32 v103, v234 +s_mov_b32 s85, 0x4ff8 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 360, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 361, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 362, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 363, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 364, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 365, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 366, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 367, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v104, v235 +v_mov_b32 v105, v236 +v_mov_b32 v106, v237 +v_mov_b32 v107, v238 +v_mov_b32 v108, v239 +v_mov_b32 v109, v240 +v_mov_b32 v110, v241 +v_mov_b32 v111, v242 +s_mov_b32 s85, 0x5064 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 368, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 369, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 370, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 371, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 372, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 373, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 374, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 375, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v112, v227 +v_mov_b32 v113, v228 +v_mov_b32 v114, v229 +v_mov_b32 v115, v230 +v_mov_b32 v116, v231 +v_mov_b32 v117, v232 +v_mov_b32 v118, v233 +v_mov_b32 v119, v234 +s_mov_b32 s85, 0x50d0 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 376, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 377, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 378, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 379, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 380, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 381, 0x0, 0x3, 0x0, 
0x0 +_v_pk_add_f16__vop3p 241, 497, 382, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 383, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v120, v235 +v_mov_b32 v121, v236 +v_mov_b32 v122, v237 +v_mov_b32 v123, v238 +v_mov_b32 v124, v239 +v_mov_b32 v125, v240 +v_mov_b32 v126, v241 +v_mov_b32 v127, v242 +s_mov_b32 s85, 0x513c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 384, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 385, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 386, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 387, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 388, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 389, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 390, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 391, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v128, v227 +v_mov_b32 v129, v228 +v_mov_b32 v130, v229 +v_mov_b32 v131, v230 +v_mov_b32 v132, v231 +v_mov_b32 v133, v232 +v_mov_b32 v134, v233 +v_mov_b32 v135, v234 +s_mov_b32 s85, 0x51a8 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 392, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 393, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 394, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 395, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 396, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 397, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 398, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 399, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v136, v235 +v_mov_b32 v137, v236 +v_mov_b32 v138, v237 +v_mov_b32 v139, v238 +v_mov_b32 v140, v239 +v_mov_b32 v141, v240 +v_mov_b32 v142, v241 +v_mov_b32 v143, v242 +s_mov_b32 s85, 0x5214 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 400, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 401, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 402, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 403, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 404, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 405, 0x0, 
0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 406, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 407, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v144, v227 +v_mov_b32 v145, v228 +v_mov_b32 v146, v229 +v_mov_b32 v147, v230 +v_mov_b32 v148, v231 +v_mov_b32 v149, v232 +v_mov_b32 v150, v233 +v_mov_b32 v151, v234 +s_mov_b32 s85, 0x5280 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 408, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 409, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 410, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 411, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 412, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 413, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 414, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 415, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v152, v235 +v_mov_b32 v153, v236 +v_mov_b32 v154, v237 +v_mov_b32 v155, v238 +v_mov_b32 v156, v239 +v_mov_b32 v157, v240 +v_mov_b32 v158, v241 +v_mov_b32 v159, v242 +s_mov_b32 s85, 0x52ec +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 416, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 417, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 418, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 419, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 420, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 421, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 422, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 423, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v160, v227 +v_mov_b32 v161, v228 +v_mov_b32 v162, v229 +v_mov_b32 v163, v230 +v_mov_b32 v164, v231 +v_mov_b32 v165, v232 +v_mov_b32 v166, v233 +v_mov_b32 v167, v234 +s_mov_b32 s85, 0x5358 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 424, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 425, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 426, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 427, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 428, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 
429, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 430, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 431, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v168, v235 +v_mov_b32 v169, v236 +v_mov_b32 v170, v237 +v_mov_b32 v171, v238 +v_mov_b32 v172, v239 +v_mov_b32 v173, v240 +v_mov_b32 v174, v241 +v_mov_b32 v175, v242 +s_mov_b32 s85, 0x53c4 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 432, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 433, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 434, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 435, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 436, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 437, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 438, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 439, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v176, v227 +v_mov_b32 v177, v228 +v_mov_b32 v178, v229 +v_mov_b32 v179, v230 +v_mov_b32 v180, v231 +v_mov_b32 v181, v232 +v_mov_b32 v182, v233 +v_mov_b32 v183, v234 +s_mov_b32 s85, 0x5430 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 440, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 441, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 442, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 443, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 444, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 445, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 446, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 447, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v184, v235 +v_mov_b32 v185, v236 +v_mov_b32 v186, v237 +v_mov_b32 v187, v238 +v_mov_b32 v188, v239 +v_mov_b32 v189, v240 +v_mov_b32 v190, v241 +v_mov_b32 v191, v242 +s_mov_b32 s85, 0x549c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 448, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 449, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 450, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 451, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 452, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 
232, 488, 453, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 454, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 455, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v192, v227 +v_mov_b32 v193, v228 +v_mov_b32 v194, v229 +v_mov_b32 v195, v230 +v_mov_b32 v196, v231 +v_mov_b32 v197, v232 +v_mov_b32 v198, v233 +v_mov_b32 v199, v234 +s_mov_b32 s85, 0x5508 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 456, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 457, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 458, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 459, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 460, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 461, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 462, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 463, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v200, v235 +v_mov_b32 v201, v236 +v_mov_b32 v202, v237 +v_mov_b32 v203, v238 +v_mov_b32 v204, v239 +v_mov_b32 v205, v240 +v_mov_b32 v206, v241 +v_mov_b32 v207, v242 +s_mov_b32 s85, 0x5574 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 464, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 465, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 466, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 467, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 468, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 469, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 470, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 471, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v208, v227 +v_mov_b32 v209, v228 +v_mov_b32 v210, v229 +v_mov_b32 v211, v230 +v_mov_b32 v212, v231 +v_mov_b32 v213, v232 +v_mov_b32 v214, v233 +v_mov_b32 v215, v234 +s_mov_b32 s85, 0x55e0 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 472, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 473, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 474, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 475, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 476, 0x0, 0x3, 0x0, 0x0 
+_v_pk_add_f16__vop3p 240, 496, 477, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 478, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 479, 0x0, 0x3, 0x0, 0x0 +v_mov_b32 v216, v235 +v_mov_b32 v217, v236 +v_mov_b32 v218, v237 +v_mov_b32 v219, v238 +v_mov_b32 v220, v239 +v_mov_b32 v221, v240 +v_mov_b32 v222, v241 +v_mov_b32 v223, v242 +s_mov_b32 s85, 0x4ddc +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 483, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 484, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 485, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 486, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 227, 483, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 228, 484, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 229, 485, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 230, 486, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 487, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 488, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 489, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 490, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 231, 487, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 232, 488, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 233, 489, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 234, 490, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v227, v245, s[72:75], 0 idxen +buffer_store_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v227, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v231, v246, 
s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v228, v245, s[72:75], 0 idxen +buffer_store_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v228, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v229, v245, s[72:75], 0 idxen +buffer_store_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v229, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v230, v245, s[72:75], 0 idxen +buffer_store_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v230, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x5818 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 491, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 492, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 493, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 
48, 32, 494, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 235, 491, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 236, 492, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 237, 493, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 238, 494, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 495, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 496, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 497, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 498, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 239, 495, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 240, 496, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 241, 497, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_max_f16__vop3p 242, 498, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v235, v245, s[72:75], 0 idxen +buffer_store_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v235, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v236, v245, s[72:75], 0 idxen +buffer_store_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v236, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v237, v245, s[72:75], 0 idxen +buffer_store_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v237, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v238, v245, s[72:75], 0 idxen +buffer_store_b16 v242, v246, s[72:75], 0 
idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v238, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x564c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 483, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 484, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 485, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 486, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 227, 483, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 228, 484, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 229, 485, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 230, 486, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 487, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 488, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 489, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 490, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 231, 487, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 232, 488, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 233, 489, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 234, 490, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v227, v245, s[72:75], 0 idxen +buffer_store_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v227, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, 
s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v228, v245, s[72:75], 0 idxen +buffer_store_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v228, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v229, v245, s[72:75], 0 idxen +buffer_store_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v229, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v230, v245, s[72:75], 0 idxen +buffer_store_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v230, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x5bb0 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 491, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 492, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 493, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 494, 0x0, 0x3, 0x0, 0x0 
+_v_pk_min_f16__vop3p 235, 491, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 236, 492, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 237, 493, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 238, 494, 304, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 22, 32, 495, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 28, 32, 496, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 38, 32, 497, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 48, 32, 498, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 239, 495, 278, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 240, 496, 284, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 241, 497, 294, 0x0, 0x3, 0x0, 0x0 +_v_pk_min_f16__vop3p 242, 498, 304, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v235, v245, s[72:75], 0 idxen +buffer_store_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v235, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v236, v245, s[72:75], 0 idxen +buffer_store_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v236, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v237, v245, s[72:75], 0 idxen +buffer_store_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v237, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v238, v245, s[72:75], 0 idxen +buffer_store_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 
+s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v238, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x59e4 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 227, 0xbdc5bdc5, 483, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 228, 0xbdc5bdc5, 484, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 229, 0xbdc5bdc5, 485, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 230, 0xbdc5bdc5, 486, 0x0, 0x3 +v_exp_f16 v227, v227 +v_exp_f16 v228, v228 +v_exp_f16 v229, v229 +v_exp_f16 v230, v230 +_v_exp_f16__vop3 227, 227, 0x9 +_v_exp_f16__vop3 228, 228, 0x9 +_v_exp_f16__vop3 229, 229, 0x9 +_v_exp_f16__vop3 230, 230, 0x9 +_v_pk_add_f16__vop3p_lit 227, 0x3c003c00, 483, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 228, 0x3c003c00, 484, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 229, 0x3c003c00, 485, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 230, 0x3c003c00, 486, 0x0, 0x3 +v_rcp_f16 v227, v227 +v_rcp_f16 v228, v228 +v_rcp_f16 v229, v229 +v_rcp_f16 v230, v230 +_v_rcp_f16__vop3 227, 227, 0x9 +_v_rcp_f16__vop3 228, 228, 0x9 +_v_rcp_f16__vop3 229, 229, 0x9 +_v_rcp_f16__vop3 230, 230, 0x9 +_v_pk_mul_f16__vop3p_lit 231, 0xbdc5bdc5, 487, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 232, 0xbdc5bdc5, 488, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 233, 0xbdc5bdc5, 489, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 234, 0xbdc5bdc5, 490, 0x0, 0x3 +v_exp_f16 v231, v231 +v_exp_f16 v232, v232 +v_exp_f16 v233, v233 +v_exp_f16 v234, v234 +_v_exp_f16__vop3 231, 
231, 0x9 +_v_exp_f16__vop3 232, 232, 0x9 +_v_exp_f16__vop3 233, 233, 0x9 +_v_exp_f16__vop3 234, 234, 0x9 +_v_pk_add_f16__vop3p_lit 231, 0x3c003c00, 487, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 232, 0x3c003c00, 488, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 233, 0x3c003c00, 489, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 234, 0x3c003c00, 490, 0x0, 0x3 +v_rcp_f16 v231, v231 +v_rcp_f16 v232, v232 +v_rcp_f16 v233, v233 +v_rcp_f16 v234, v234 +_v_rcp_f16__vop3 231, 231, 0x9 +_v_rcp_f16__vop3 232, 232, 0x9 +_v_rcp_f16__vop3 233, 233, 0x9 +_v_rcp_f16__vop3 234, 234, 0x9 +buffer_store_b16 v227, v245, s[72:75], 0 idxen +buffer_store_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v227, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v228, v245, s[72:75], 0 idxen +buffer_store_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v228, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v229, v245, s[72:75], 0 idxen +buffer_store_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v229, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v230, v245, s[72:75], 0 idxen +buffer_store_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v230, v245, 
s[72:75], 0 idxen +buffer_store_d16_hi_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x6088 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 235, 0xbdc5bdc5, 491, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 236, 0xbdc5bdc5, 492, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 237, 0xbdc5bdc5, 493, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 238, 0xbdc5bdc5, 494, 0x0, 0x3 +v_exp_f16 v235, v235 +v_exp_f16 v236, v236 +v_exp_f16 v237, v237 +v_exp_f16 v238, v238 +_v_exp_f16__vop3 235, 235, 0x9 +_v_exp_f16__vop3 236, 236, 0x9 +_v_exp_f16__vop3 237, 237, 0x9 +_v_exp_f16__vop3 238, 238, 0x9 +_v_pk_add_f16__vop3p_lit 235, 0x3c003c00, 491, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 236, 0x3c003c00, 492, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 237, 0x3c003c00, 493, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 238, 0x3c003c00, 494, 0x0, 0x3 +v_rcp_f16 v235, v235 +v_rcp_f16 v236, v236 +v_rcp_f16 v237, v237 +v_rcp_f16 v238, v238 +_v_rcp_f16__vop3 235, 235, 0x9 +_v_rcp_f16__vop3 236, 236, 0x9 +_v_rcp_f16__vop3 237, 237, 0x9 +_v_rcp_f16__vop3 238, 238, 0x9 +_v_pk_mul_f16__vop3p_lit 239, 0xbdc5bdc5, 495, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 240, 0xbdc5bdc5, 496, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 241, 0xbdc5bdc5, 497, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 242, 0xbdc5bdc5, 498, 0x0, 0x3 +v_exp_f16 v239, v239 +v_exp_f16 v240, v240 +v_exp_f16 v241, v241 +v_exp_f16 v242, v242 +_v_exp_f16__vop3 239, 239, 0x9 +_v_exp_f16__vop3 240, 240, 0x9 +_v_exp_f16__vop3 241, 241, 0x9 +_v_exp_f16__vop3 242, 242, 0x9 
+_v_pk_add_f16__vop3p_lit 239, 0x3c003c00, 495, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 240, 0x3c003c00, 496, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 241, 0x3c003c00, 497, 0x0, 0x3 +_v_pk_add_f16__vop3p_lit 242, 0x3c003c00, 498, 0x0, 0x3 +v_rcp_f16 v239, v239 +v_rcp_f16 v240, v240 +v_rcp_f16 v241, v241 +v_rcp_f16 v242, v242 +_v_rcp_f16__vop3 239, 239, 0x9 +_v_rcp_f16__vop3 240, 240, 0x9 +_v_rcp_f16__vop3 241, 241, 0x9 +_v_rcp_f16__vop3 242, 242, 0x9 +buffer_store_b16 v235, v245, s[72:75], 0 idxen +buffer_store_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v235, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v236, v245, s[72:75], 0 idxen +buffer_store_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v236, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v237, v245, s[72:75], 0 idxen +buffer_store_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v237, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v238, v245, s[72:75], 0 idxen +buffer_store_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v238, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 
s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x5d7c +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 227, 483, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 228, 484, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 229, 485, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 230, 486, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 231, 487, 56, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 232, 488, 57, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 233, 489, 59, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 234, 490, 64, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 227, 483, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 228, 484, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 229, 485, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 230, 486, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v227 +v_and_b32 v28, 0x7fff7fff, v228 +v_and_b32 v38, 0x7fff7fff, v229 +v_and_b32 v48, 0x7fff7fff, v230 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 28, 0x41c541c5, 284, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 294, 0x0, 0x3 
+_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 304, 0x0, 0x3 +v_exp_f16 v22, v22 +v_exp_f16 v28, v28 +v_exp_f16 v38, v38 +v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 227, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 228, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 229, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 230, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 227, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 228, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 229, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 43, 43, 
38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 230, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb +v_bfi_b32 v227, 0x7fff7fff, v23, v227 +v_bfi_b32 v228, 0x7fff7fff, v33, v228 +v_bfi_b32 v229, 0x7fff7fff, v43, v229 +v_bfi_b32 v230, 0x7fff7fff, v53, v230 +_v_pk_mul_f16__vop3p 227, 483, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 228, 484, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 229, 485, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 230, 486, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 231, 487, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 232, 488, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 233, 489, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 234, 490, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v231 +v_and_b32 v28, 0x7fff7fff, v232 +v_and_b32 v38, 0x7fff7fff, v233 +v_and_b32 v48, 0x7fff7fff, v234 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 28, 0x41c541c5, 284, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 294, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 304, 0x0, 0x3 +v_exp_f16 v22, v22 
+v_exp_f16 v28, v28 +v_exp_f16 v38, v38 +v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 231, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 232, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 233, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 234, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 231, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 232, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 233, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 43, 43, 38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 234, 0x38b838b8, 0x3, 0x1 
+_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb +v_bfi_b32 v231, 0x7fff7fff, v23, v231 +v_bfi_b32 v232, 0x7fff7fff, v33, v232 +v_bfi_b32 v233, 0x7fff7fff, v43, v233 +v_bfi_b32 v234, 0x7fff7fff, v53, v234 +_v_pk_mul_f16__vop3p 231, 487, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 232, 488, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 233, 489, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 234, 490, 32, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v227, v245, s[72:75], 0 idxen +buffer_store_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v227, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v231, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v228, v245, s[72:75], 0 idxen +buffer_store_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v228, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v232, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v229, v245, s[72:75], 0 idxen +buffer_store_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v229, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v233, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v230, v245, s[72:75], 0 idxen +buffer_store_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v230, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v234, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 
s72, s72, s80 +s_addc_u32 s73, s73, 0 +s_mov_b32 s84, 0x6a20 +s_setpc_b64 s[86:87] +_v_pk_add_f16__vop3p 235, 491, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 236, 492, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 237, 493, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 238, 494, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 239, 495, 65, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 240, 496, 66, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 241, 497, 67, 0x0, 0x3, 0x0, 0x0 +_v_pk_add_f16__vop3p 242, 498, 68, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 235, 491, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 236, 492, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 237, 493, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 238, 494, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v235 +v_and_b32 v28, 0x7fff7fff, v236 +v_and_b32 v38, 0x7fff7fff, v237 +v_and_b32 v48, 0x7fff7fff, v238 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 28, 0x41c541c5, 284, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 294, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 
304, 0x0, 0x3 +v_exp_f16 v22, v22 +v_exp_f16 v28, v28 +v_exp_f16 v38, v38 +v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 235, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 236, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 237, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 238, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 235, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 236, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 237, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 43, 43, 38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 
106, 238, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb +v_bfi_b32 v235, 0x7fff7fff, v23, v235 +v_bfi_b32 v236, 0x7fff7fff, v33, v236 +v_bfi_b32 v237, 0x7fff7fff, v43, v237 +v_bfi_b32 v238, 0x7fff7fff, v53, v238 +_v_pk_mul_f16__vop3p 235, 491, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 236, 492, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 237, 493, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 238, 494, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 239, 495, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 240, 496, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 241, 497, 33, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 242, 498, 33, 0x0, 0x3, 0x0, 0x0 +v_and_b32 v22, 0x7fff7fff, v239 +v_and_b32 v28, 0x7fff7fff, v240 +v_and_b32 v38, 0x7fff7fff, v241 +v_and_b32 v48, 0x7fff7fff, v242 +v_mov_b32 v23, 0xb5f8b5f8 +v_mov_b32 v33, 0xb5f8b5f8 +v_mov_b32 v43, 0xb5f8b5f8 +v_mov_b32 v53, 0xb5f8b5f8 +v_pk_fma_f16 v23, v22, 0x2ff12ff1, v23 +v_pk_fma_f16 v33, v28, 0x2ff12ff1, v33 +v_pk_fma_f16 v43, v38, 0x2ff12ff1, v43 +v_pk_fma_f16 v53, v48, 0x2ff12ff1, v53 +v_pk_fma_f16 v23, v22, v23, 0x1c571c57 +v_pk_fma_f16 v33, v28, v33, 0x1c571c57 +v_pk_fma_f16 v43, v38, v43, 0x1c571c57 +v_pk_fma_f16 v53, v48, v53, 0x1c571c57 +v_pk_fma_f16 v23, v22, v23, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v33, v28, v33, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v43, v38, v43, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +v_pk_fma_f16 v53, v48, v53, 1.0 op_sel:[0,0,0] op_sel_hi:[1,1,0] +_v_pk_mul_f16__vop3p 23, 278, 279, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 33, 284, 289, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 43, 294, 299, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 53, 304, 309, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p_lit 22, 0x41c541c5, 278, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 28, 0x41c541c5, 284, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 38, 0x41c541c5, 294, 0x0, 0x3 +_v_pk_mul_f16__vop3p_lit 48, 0x41c541c5, 304, 0x0, 0x3 +v_exp_f16 v22, v22 +v_exp_f16 v28, v28 +v_exp_f16 v38, v38 
+v_exp_f16 v48, v48 +_v_exp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_exp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +_v_pk_add_f16__vop3p 22, 242, 278, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 28, 242, 284, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 38, 242, 294, 0x0, 0x2, 0x0, 0x0 +_v_pk_add_f16__vop3p 48, 242, 304, 0x0, 0x2, 0x0, 0x0 +v_rcp_f16 v22, v22 +v_rcp_f16 v28, v28 +v_rcp_f16 v38, v38 +v_rcp_f16 v48, v48 +_v_rcp_f16__vop1 (22 | /*op_sel*/ 0x80), (22 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (28 | /*op_sel*/ 0x80), (28 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (38 | /*op_sel*/ 0x80), (38 | /*op_sel*/ 0x80) +_v_rcp_f16__vop1 (48 | /*op_sel*/ 0x80), (48 | /*op_sel*/ 0x80) +v_pk_fma_f16 v22, v22, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v28, v28, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v38, v38, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +v_pk_fma_f16 v48, v48, 2.0, 1.0 op_sel:[0,0,0] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +_v_cmp_gt_f16__vop3_v_lit 106, 239, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v23, v23, v22, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 240, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v33, v33, v28, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 241, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v43, v43, v38, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 242, 0x38b838b8, 0x0, 0x1 +v_cndmask_b16 v53, v53, v48, vcc +_v_cmp_gt_f16__vop3_v_lit 106, 239, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 23, 23, 22, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 240, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 33, 33, 28, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 241, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 43, 43, 38, 106, 0xb +_v_cmp_gt_f16__vop3_v_lit 106, 242, 0x38b838b8, 0x3, 0x1 +_v_cndmask_b16__vop3 53, 53, 48, 106, 0xb 
+v_bfi_b32 v239, 0x7fff7fff, v23, v239 +v_bfi_b32 v240, 0x7fff7fff, v33, v240 +v_bfi_b32 v241, 0x7fff7fff, v43, v241 +v_bfi_b32 v242, 0x7fff7fff, v53, v242 +_v_pk_mul_f16__vop3p 239, 495, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 240, 496, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 241, 497, 32, 0x0, 0x3, 0x0, 0x0 +_v_pk_mul_f16__vop3p 242, 498, 32, 0x0, 0x3, 0x0, 0x0 +buffer_store_b16 v235, v245, s[72:75], 0 idxen +buffer_store_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v235, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v239, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v236, v245, s[72:75], 0 idxen +buffer_store_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v236, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v240, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v237, v245, s[72:75], 0 idxen +buffer_store_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v237, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v241, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_b16 v238, v245, s[72:75], 0 idxen +buffer_store_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 +buffer_store_d16_hi_b16 v238, v245, s[72:75], 0 idxen +buffer_store_d16_hi_b16 v242, v246, s[72:75], 0 idxen +s_sub_u32 s69, s69, 1 +s_cselect_b32 s75, 0, s75 +s_add_u32 s72, s72, s80 +s_addc_u32 s73, s73, 0 
+s_mov_b32 s84, 0x6394 +s_setpc_b64 s[86:87] +s_endpgm +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end 
+s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end 
+s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end 
+s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end + diff --git a/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_metadata.inc b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_metadata.inc new file mode 100644 index 0000000000..bd29eb5c30 --- /dev/null +++ b/src/kernels/winograd/Conv_Winograd_Fury_v2_4_1_metadata.inc @@ -0,0 +1,192 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +.macro PROLOG_KERNEL_DESCRIPTOR kernel_name:req +.text +.globl \kernel_name +.p2align 8 +.type \kernel_name,@function +\kernel_name: +.endm + +.macro METADATA sc:req, wc:req, wg_x:req, kernel_name:req +.amdgpu_metadata +--- +amdhsa.version: [ 1, 2 ] +amdhsa.kernels: + - .name: \kernel_name + .symbol: \kernel_name\().kd + .sgpr_count: \sc + .vgpr_count: \wc // Unused in the kernel descriptor + .group_segment_fixed_size: 65536 + .private_segment_fixed_size: 0 + .kernarg_segment_size: 232 + .kernarg_segment_align: 8 + .reqd_workgroup_size: [ \wg_x, 1, 1 ] + .max_flat_workgroup_size: \wg_x + .wavefront_size: 64 + .workgroup_processor_mode: 0 + .args: + - { .size: 4, .offset: 0, .value_kind: by_value, .name: N_ } + - { .size: 4, .offset: 4, .value_kind: by_value, .name: C } + - { .size: 4, .offset: 8, .value_kind: by_value, .name: H } + - { .size: 4, .offset: 12, .value_kind: by_value, .name: W } + + - { .size: 4, .offset: 16, .value_kind: by_value, .name: K } + - { .size: 4, .offset: 20, .value_kind: by_value, .name: n_groups } + + - { .size: 8, .offset: 24, .value_kind: by_value, .name: flags64 } + + - { .size: 8, .offset: 32, .value_kind: global_buffer, .name: data_addr, .address_space: global, .is_const: true } + - { .size: 8, .offset: 40, .value_kind: global_buffer, .name: filter_addr, .address_space: global, .is_const: true } + - { .size: 8, .offset: 48, .value_kind: global_buffer, .name: output_addr, .address_space: global, .is_const: false } + - { .size: 8, .offset: 56, .value_kind: by_value } + + - { .size: 4, .offset: 64, .value_kind: by_value, .name: R } + - { .size: 4, .offset: 68, .value_kind: by_value, .name: S } + - { .size: 4, .offset: 72, .value_kind: by_value, .name: pad_h } + - { .size: 4, .offset: 76, .value_kind: by_value, .name: pad_w } + - { .size: 4, .offset: 80, .value_kind: by_value, .name: out_h } + - { .size: 4, .offset: 84, .value_kind: by_value, .name: 
out_w } + + - { .size: 8, .offset: 88, .value_kind: global_buffer, .name: bias_addr, .address_space: global, .is_const: true } + - { .size: 4, .offset: 96, .value_kind: by_value, .name: alpha } + - { .size: 4, .offset: 100, .value_kind: by_value, .name: beta } + + - { .size: 8, .offset: 104, .value_kind: by_value, .name: d_offset } + - { .size: 8, .offset: 112, .value_kind: by_value, .name: f_offset } + - { .size: 8, .offset: 120, .value_kind: by_value, .name: o_offset } + - { .size: 8, .offset: 128, .value_kind: by_value, .name: b_offset } + + - { .size: 4, .offset: 136, .value_kind: by_value, .name: d_N_stride } + - { .size: 4, .offset: 140, .value_kind: by_value, .name: d_C_stride } + - { .size: 4, .offset: 144, .value_kind: by_value, .name: d_H_stride } + - { .size: 4, .offset: 148, .value_kind: by_value } + + - { .size: 4, .offset: 152, .value_kind: by_value, .name: f_K_stride } + - { .size: 4, .offset: 156, .value_kind: by_value, .name: f_C_stride } + - { .size: 4, .offset: 160, .value_kind: by_value, .name: f_R_stride } + - { .size: 4, .offset: 164, .value_kind: by_value } + + - { .size: 4, .offset: 168, .value_kind: by_value, .name: o_N_stride } + - { .size: 4, .offset: 172, .value_kind: by_value, .name: o_K_stride } + - { .size: 4, .offset: 176, .value_kind: by_value, .name: o_H_stride } + - { .size: 4, .offset: 180, .value_kind: by_value } + + - { .size: 4, .offset: 184, .value_kind: by_value, .name: G } + - { .size: 4, .offset: 188, .value_kind: by_value, .name: d_G_stride } + - { .size: 4, .offset: 192, .value_kind: by_value, .name: f_G_stride } + - { .size: 4, .offset: 196, .value_kind: by_value, .name: o_G_stride } + + - { .size: 1, .offset: 200, .value_kind: by_value, .name: activation_mode } + - { .size: 1, .offset: 201, .value_kind: by_value, .name: sync_limit } + - { .size: 1, .offset: 202, .value_kind: by_value, .name: sync_period } + - { .size: 1, .offset: 203, .value_kind: by_value } + + - { .size: 4, .offset: 204, .value_kind: by_value } + - { 
.size: 8, .offset: 208, .value_kind: global_buffer, .name: sync_addr, .address_space: global, .is_const: false } + + - { .size: 8, .offset: 216, .value_kind: global_buffer, .name: acc_addr, .address_space: global, .is_const: false } + - { .size: 8, .offset: 224, .value_kind: by_value, .name: a_offset } +... +.end_amdgpu_metadata +.endm // METADATA + +.altmacro +.macro METADATA_WRAPPER sc:req, wc:req, wg_x:req, kernel_name:req + METADATA %\sc, %\wc, %\wg_x, \kernel_name +.endm + +.macro kernel_end kernel_name:req +.Lfunc_end0: + .size \kernel_name, .Lfunc_end0 - \kernel_name +.endm + +.macro EPILOG_KERNEL_DESCRIPTOR kernel_name:req + +kernel_end \kernel_name + +.if (.amdgcn.gfx_generation_number == 11) + .if ((.amdgcn.gfx_generation_minor == 0 && (.amdgcn.gfx_generation_stepping == 0 || .amdgcn.gfx_generation_stepping == 1)) || (.amdgcn.gfx_generation_minor == 5 && .amdgcn.gfx_generation_stepping == 1)) + // gfx1100, gfx1101, gfx1151 + vgpr_cnt = 252 + .else + // gfx1102, gfx1103, gfx1150 + vgpr_cnt = 168 + .endif + sgpr_cnt = 0 // 128 SGPRs always allocated for gfx10-gfx11 + workgroup_size_x = 384 +.endif + +.amdgcn.next_free_sgpr = sgpr_cnt +.amdgcn.next_free_vgpr = vgpr_cnt + +__group_segment_fixed_size = 65536 +__sgpr_dispatch_ptr = 1 +__sgpr_kernarg_segment_ptr = 1 +__ieee_mode = 0 +__dx10_clamp = 0 + +.rodata +.p2align 6 +.if (.amdgcn.gfx_generation_number == 11) +.amdhsa_kernel \kernel_name + .amdhsa_group_segment_fixed_size __group_segment_fixed_size + .amdhsa_user_sgpr_dispatch_ptr __sgpr_dispatch_ptr // s[0:1] + .amdhsa_user_sgpr_kernarg_segment_ptr __sgpr_kernarg_segment_ptr // s[2:3] + .amdhsa_next_free_vgpr .amdgcn.next_free_vgpr + .amdhsa_next_free_sgpr .amdgcn.next_free_sgpr + .amdhsa_ieee_mode __ieee_mode + .amdhsa_dx10_clamp __dx10_clamp + .amdhsa_wavefront_size32 0 + .amdhsa_workgroup_processor_mode 0 +.end_amdhsa_kernel +.endif + +METADATA_WRAPPER sgpr_cnt, 0, workgroup_size_x, <\kernel_name> + +.endm + +.macro PROLOG_KERNEL_DESCRIPTOR_WRAPPER 
machine_version:req, kernel_name_postfix:req + PROLOG_KERNEL_DESCRIPTOR miopenSp3AsmConvFury_v2_4_1_gfx\machine_version\()\kernel_name_postfix +.endm + +.macro EPILOG_KERNEL_DESCRIPTOR_WRAPPER machine_version:req, kernel_name_postfix:req + EPILOG_KERNEL_DESCRIPTOR miopenSp3AsmConvFury_v2_4_1_gfx\machine_version\()\kernel_name_postfix +.endm + +.macro KERNEL_PROLOG kernel_name_postfix:req + PROLOG_KERNEL_DESCRIPTOR_WRAPPER %.amdgcn.gfx_generation_number, \kernel_name_postfix +.endm + +.macro KERNEL_EPILOG kernel_name_postfix:req + EPILOG_KERNEL_DESCRIPTOR_WRAPPER %.amdgcn.gfx_generation_number, \kernel_name_postfix +.endm + +.if (.amdgcn.gfx_generation_number != 11) + .error "Unsupported gfx generation" + .end +.endif diff --git a/src/layernorm.cpp b/src/layernorm.cpp new file mode 100644 index 0000000000..7d2789973f --- /dev/null +++ b/src/layernorm.cpp @@ -0,0 +1,83 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { +/* LayerNormForward: packs mode, the six tensor descriptors, epsilon and normalized_dim into a layernorm::ProblemDescription, fills a layernorm::InvokeParams with the data pointers and scalar settings, and dispatches through SolverContainer::ExecutePrimitive under the algorithm name "LayerNormForward". Returns miopenStatusSuccess once ExecutePrimitive has returned; no other status value is produced in this function. */ +miopenStatus_t LayerNormForward(Handle& handle, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& biasDesc, + ConstData_t bias, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& meanDesc, + Data_t mean, + const TensorDescriptor& rstdDesc, + Data_t rstd, + miopenNormMode_t mode, + float epsilon, + int32_t normalized_dim) +{ + const auto problem = layernorm::ProblemDescription{ + mode, xDesc, weightDesc, biasDesc, yDesc, meanDesc, rstdDesc, epsilon, normalized_dim}; +/* Only xDesc is handed to the invoker (by pointer); the other descriptors are carried by `problem` alone. */ + const auto invoke_params = [&]() { + auto tmp = layernorm::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.xDesc = &xDesc; + tmp.x = x; + tmp.weight = weight; + tmp.bias = bias; + tmp.y = y; + tmp.mean = mean; + tmp.rstd = rstd; + tmp.epsilon = epsilon; + tmp.normalized_dim = normalized_dim; + tmp.mode = mode; + return tmp; + }(); + + const auto algo = AlgorithmName{"LayerNormForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/layernorm/problem_description.cpp b/src/layernorm/problem_description.cpp index 0d56e98a8b..657b5c26ca 100644 --- a/src/layernorm/problem_description.cpp +++ b/src/layernorm/problem_description.cpp @@ -39,23 +39,39 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const size_t outer_size = 1; size_t inner_size = 1; - for(size_t i = 0ULL; i < dims.size(); ++i) + 
if((mode == MIOPEN_WEIGHT_BIAS_T5) || (mode == MIOPEN_ELEMENTWISE_AFFINE_T5)) { - if(i < normalized_dim) - outer_size *= dims[i]; - else - inner_size *= dims[i]; + inner_size = dims[dims.size() - 1]; + outer_size = std::accumulate(dims.begin(), dims.end() - 1, 1ULL, std::multiplies()); + } + else + { + outer_size = std::accumulate( + dims.begin(), dims.begin() + normalized_dim, 1ULL, std::multiplies()); + inner_size = std::accumulate( + dims.begin() + normalized_dim, dims.end(), 1ULL, std::multiplies()); } - auto dtype = xDesc.GetType(); std::ostringstream ss; ss << "dtype" << dtype; - ss << "normalized_dim" << normalized_dim; + if((mode == MIOPEN_WEIGHT_BIAS_T5) || (mode == MIOPEN_ELEMENTWISE_AFFINE_T5)) + { + ss << "normalized_dim" << dims.size() - 1; + } + else + { + ss << "normalized_dim" << normalized_dim; + } ss << "outer_size" << outer_size; ss << "inner_size" << inner_size; + if((mode == MIOPEN_WEIGHT_BIAS_FUSED_ADD) || (mode == MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD)) + ss << "addlayernorm"; + if((mode == MIOPEN_WEIGHT_BIAS_T5) || (mode == MIOPEN_ELEMENTWISE_AFFINE_T5)) + ss << "t5layernorm"; + return NetworkConfig{ss.str()}; } diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index 55b450e7fc..1b80d0d729 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -181,7 +181,8 @@ static auto GetWindogradWrWSolvers() miopen::solver::conv::ConvWinograd3x3MultipassWrW<1, 1, 7, 2>, miopen::solver::conv::ConvWinograd3x3MultipassWrW<1, 1, 7, 3>, miopen::solver::conv::ConvWinograd3x3MultipassWrW<5, 3>, - miopen::solver::conv::ConvWinograd3x3MultipassWrW<5, 4>>{}; + miopen::solver::conv::ConvWinograd3x3MultipassWrW<5, 4>, + miopen::solver::conv::ConvWinoFuryRxS<2, 3>>{}; } static auto GetBwdWrW2DSolvers() diff --git a/src/nogpu/handle.cpp b/src/nogpu/handle.cpp index 91d16db3bc..fdadf5fae8 100644 --- a/src/nogpu/handle.cpp +++ b/src/nogpu/handle.cpp @@ -161,7 +161,7 @@ const std::vector& Handle::GetKernelsImpl(const std::string& algorithm, return 
this->impl->cache.GetKernels(algorithm, network_config); } -KernelInvoke Handle::Run(Kernel /* k */) const { return {}; } +KernelInvoke Handle::Run(Kernel /*k*/, bool /*coop_launch*/) const { return {}; } Program Handle::LoadProgram(const std::string& program_name, std::string params, @@ -250,6 +250,8 @@ std::size_t Handle::GetMaxMemoryAllocSize() return this->impl->max_mem_alloc_size; } +bool Handle::CooperativeLaunchSupported() const { return false; } + const TargetProperties& Handle::GetTargetProperties() const { return this->impl->target_properties; diff --git a/src/ocl/handleocl.cpp b/src/ocl/handleocl.cpp index d08edc4896..4b1d650ff1 100644 --- a/src/ocl/handleocl.cpp +++ b/src/ocl/handleocl.cpp @@ -366,8 +366,11 @@ const std::vector& Handle::GetKernelsImpl(const std::string& algorithm, return this->impl->cache.GetKernels(algorithm, network_config); } -KernelInvoke Handle::Run(Kernel k) const +KernelInvoke Handle::Run(Kernel k, bool coop_launch) const { + if(coop_launch) + MIOPEN_THROW(miopenStatusInternalError); + auto q = this->GetStream(); if(this->impl->enable_profiling || MIOPEN_GPU_SYNC) { @@ -480,6 +483,8 @@ std::size_t Handle::GetMaxMemoryAllocSize() return m_MaxMemoryAllocSizeCached; } +bool Handle::CooperativeLaunchSupported() const { return false; } + std::size_t Handle::GetMaxComputeUnits() const { return miopen::GetDeviceInfo(miopen::GetDevice(this->GetStream())); diff --git a/src/ocl/utilocl.cpp b/src/ocl/utilocl.cpp index 027e28975b..278906ec04 100644 --- a/src/ocl/utilocl.cpp +++ b/src/ocl/utilocl.cpp @@ -802,9 +802,9 @@ float transpose_NCHW2CNHW(const Handle& handle, auto&& kernels = handle.GetKernels(kernel_name, network_config); if(!kernels.empty()) { - auto kernel = kernels.front(); - kernel.ldims = {{vld[0], vld[1], vld[2]}}; - kernel.gdims = {{vgd[0], vgd[1], vgd[2]}}; + auto kernel = kernels.front(); + kernel.SetLocalDims(vld[0], vld[1], vld[2]); + kernel.SetGlobalDims(vgd[0], vgd[1], vgd[2]); kernel(in, out, in_offset, out_offset, 
RD_BLCK, HW_RD, n, c, h_in, w_in); } else @@ -843,9 +843,9 @@ float transpose_NCHW2CNHW(const Handle& handle, auto&& kernels = handle.GetKernels(kernel_name, network_config); if(!kernels.empty()) { - auto kernel = kernels.front(); - kernel.ldims = {{vld[0], vld[1], vld[2]}}; - kernel.gdims = {{vgd[0], vgd[1], vgd[2]}}; + auto kernel = kernels.front(); + kernel.SetLocalDims(vld[0], vld[1], vld[2]); + kernel.SetGlobalDims(vgd[0], vgd[1], vgd[2]); kernel(in, out, in_offset, @@ -935,9 +935,9 @@ float transpose_CNHW2NCHW(const Handle& handle, auto&& kernels = handle.GetKernels(kernel_name, network_config); if(!kernels.empty()) { - auto kernel = kernels.front(); - kernel.ldims = {{vld[0], vld[1], vld[2]}}; - kernel.gdims = {{vgd[0], vgd[1], vgd[2]}}; + auto kernel = kernels.front(); + kernel.SetLocalDims(vld[0], vld[1], vld[2]); + kernel.SetGlobalDims(vgd[0], vgd[1], vgd[2]); kernel(in, out, in_offset, out_offset, RD_BLCK, HW_RD, n, c, h_out, w_out); } else @@ -980,9 +980,9 @@ float transpose_CNHW2NCHW(const Handle& handle, auto&& kernels = handle.GetKernels(kernel_name, network_config); if(!kernels.empty()) { - auto kernel = kernels.front(); - kernel.ldims = {{vld[0], vld[1], vld[1]}}; - kernel.gdims = {{vgd[0], vgd[1], vgd[2]}}; + auto kernel = kernels.front(); + kernel.SetLocalDims(vld[0], vld[1], vld[2]); + kernel.SetGlobalDims(vgd[0], vgd[1], vgd[2]); kernel(in, out, in_offset, @@ -1177,14 +1177,13 @@ float transpose_packed_MN2NM(const Handle& handle, if(!kernels.empty()) { - auto kernel = kernels.front(); - kernel.ldims = {{vld[0], vld[1], vld[1]}}; - kernel.gdims = {{vgd[0], vgd[1], vgd[2]}}; + auto kernel = kernels.front(); + kernel.SetLocalDims(vld[0], vld[1], vld[2]); + kernel.SetGlobalDims(vgd[0], vgd[1], vgd[2]); kernel(in, out, n, m, in_offset, out_offset); } else { - handle.AddKernel(kernel_name, network_config, program_name, kernel_name, vld, vgd, params)( in, out, n, m, in_offset, out_offset); } diff --git a/src/reduce/problem_description.cpp 
b/src/reduce/problem_description.cpp index 7f6dd1b4fd..ac73d16a02 100644 --- a/src/reduce/problem_description.cpp +++ b/src/reduce/problem_description.cpp @@ -36,19 +36,27 @@ namespace reduce { NetworkConfig ProblemDescription::MakeNetworkConfig() const { auto xlength = xDesc.GetLengths(); - auto ylength = yDesc.GetLengths(); + std::vector outputlength; + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) || + (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX)) + outputlength = yDesc.GetLengths(); + else + outputlength = indiceDesc.GetLengths(); - auto reduce_size = xlength[dim]; - auto output_numel = std::accumulate( - ylength.begin(), ylength.end(), static_cast(1), std::multiplies()); - auto dtype = xDesc.GetType(); + auto size = xlength[dim]; + auto output_numel = std::accumulate(outputlength.begin(), + outputlength.end(), + static_cast(1), + std::multiplies()); + auto dtype = xDesc.GetType(); std::ostringstream ss; ss << "dtype" << dtype; ss << "dim" << dim; - ss << "reduce_size" << reduce_size; + ss << "size" << size; ss << "output_numel" << output_numel; + ss << "reduceExtremeOp" << reduceExtremeOp; return NetworkConfig{ss.str()}; } diff --git a/src/reduceextreme.cpp b/src/reduceextreme.cpp new file mode 100644 index 0000000000..3abb322fd8 --- /dev/null +++ b/src/reduceextreme.cpp @@ -0,0 +1,158 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { +/* ReduceExtremeForward (index-only overload): serves MIOPEN_REDUCE_EXTREME_ARGMIN and MIOPEN_REDUCE_EXTREME_ARGMAX. Each branch builds a reduce::ProblemDescription from xDesc, indiceDesc, dim and the op, fills reduce::InvokeParams with the descriptor pointers, x, indice and dim, and calls ExecutePrimitive under "ArgminForward" or "ArgmaxForward" respectively, then returns miopenStatusSuccess. */ +miopenStatus_t ReduceExtremeForward(Handle& handle, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& indiceDesc, + Data_t indice, + int32_t dim, + miopenReduceExtremeOp_t reduceExtremeOp) +{ + if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_ARGMIN) + { + const auto problem = reduce::ProblemDescription{xDesc, indiceDesc, dim, reduceExtremeOp}; + + const auto invoke_params = [&]() { + auto tmp = reduce::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.xDesc = &xDesc; + tmp.indiceDesc = &indiceDesc; + tmp.x = x; + tmp.indice = indice; + tmp.dim = dim; + return tmp; + }(); + + const auto algo = AlgorithmName{"ArgminForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; + } + else if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_ARGMAX) + { + const auto problem = reduce::ProblemDescription{xDesc, indiceDesc, dim, reduceExtremeOp}; + + const auto invoke_params = [&]() { + auto tmp = reduce::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.xDesc = &xDesc; + tmp.indiceDesc = &indiceDesc; + tmp.x = x; + tmp.indice = indice; + tmp.dim = dim; + return tmp; + }(); + + const auto algo = AlgorithmName{"ArgmaxForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; + } +/* Reached only when reduceExtremeOp is neither ARGMIN nor ARGMAX. */ + return miopenStatusUnsupportedOp; +} +/* ReduceExtremeForward (value-and-index overload): serves MIOPEN_REDUCE_EXTREME_MIN and MIOPEN_REDUCE_EXTREME_MAX. Same flow as the index-only overload, with yDesc and y added to both the problem description and the invoke parameters; the algorithm names are "MinForward" and "MaxForward". */ +miopenStatus_t ReduceExtremeForward(Handle& handle, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& indiceDesc, + Data_t indice, + int32_t dim, + miopenReduceExtremeOp_t reduceExtremeOp) +{ + if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) + { + const auto problem = + 
reduce::ProblemDescription{xDesc, yDesc, indiceDesc, dim, reduceExtremeOp}; + + const auto invoke_params = [&]() { + auto tmp = reduce::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.xDesc = &xDesc; + tmp.yDesc = &yDesc; + tmp.indiceDesc = &indiceDesc; + tmp.x = x; + tmp.y = y; + tmp.indice = indice; + tmp.dim = dim; + return tmp; + }(); + + const auto algo = AlgorithmName{"MinForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; + } + else if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX) + { + const auto problem = + reduce::ProblemDescription{xDesc, yDesc, indiceDesc, dim, reduceExtremeOp}; + + const auto invoke_params = [&]() { + auto tmp = reduce::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.xDesc = &xDesc; + tmp.yDesc = &yDesc; + tmp.indiceDesc = &indiceDesc; + tmp.x = x; + tmp.y = y; + tmp.indice = indice; + tmp.dim = dim; + return tmp; + }(); + + const auto algo = AlgorithmName{"MaxForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; + } +/* Reached only when reduceExtremeOp is neither MIN nor MAX. */ + return miopenStatusUnsupportedOp; +} + +} // namespace miopen diff --git a/src/reduceextreme_api.cpp b/src/reduceextreme_api.cpp new file mode 100644 index 0000000000..923d92c6ed --- /dev/null +++ b/src/reduceextreme_api.cpp @@ -0,0 +1,136 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +/* LogCmdReduceExtreme: when miopen::IsLoggingCmd() is true, assembles a driver command line for this call and emits it through MIOPEN_LOG_DRIVER_CMD. The command starts with a dtype-specific name (fp16 / fp32 / bfp16), followed by the tensor lengths, then -F (1 when is_fwd, otherwise 2), -R dim and -O reduceExtremeOp. Does nothing when command logging is off. */ +static void LogCmdReduceExtreme(const miopenTensorDescriptor_t xDesc, + const int32_t dim, + const miopenReduceExtremeOp_t reduceExtremeOp, + bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "reduceextremefp16"; + } + else if(dtype == miopenFloat) + { + ss << "reduceextremefp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "reduceextremebfp16"; + } +/* No name prefix is written for any other dtype. The shape flags below are chosen by the size reported by miopenGetTensorDescriptorSize; sizes other than 2, 3, 4 or 5 emit only -n. */ + int32_t size = {0}; + miopenGetTensorDescriptorSize(xDesc, &size); + ss << " -n " << miopen::deref(xDesc).GetLengths()[0]; + if(size == 5) + { + ss << " -c " << miopen::deref(xDesc).GetLengths()[1] << " -D " + << miopen::deref(xDesc).GetLengths()[2] << " -H " + << miopen::deref(xDesc).GetLengths()[3] << " -W " + << miopen::deref(xDesc).GetLengths()[4]; + } + else if(size == 4) + { + ss << " -c " << miopen::deref(xDesc).GetLengths()[1] << " -H " + << miopen::deref(xDesc).GetLengths()[2] << " -W " + << miopen::deref(xDesc).GetLengths()[3]; + } + else if(size == 3) + { + ss << " -c " << miopen::deref(xDesc).GetLengths()[1] << " -W " + << miopen::deref(xDesc).GetLengths()[2]; + } + else if(size == 2) + { + ss << " -c " << miopen::deref(xDesc).GetLengths()[1]; + } + + ss << " -F " << ((is_fwd) ? 
"1" : "2"); + + ss << " -R " << dim; + + ss << " -O " << reduceExtremeOp; + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} +/* miopenReduceExtremeForward: C entry point. ARGMIN and ARGMAX are routed to the index-only ReduceExtremeForward overload (yDesc and y are not used on that path); every other op value is routed to the overload that also takes yDesc and y. Both paths log the call and the driver command first, and run the internal call inside miopen::try_. */ +extern "C" miopenStatus_t miopenReduceExtremeForward(miopenHandle_t handle, + const miopenTensorDescriptor_t xDesc, + const void* x, + const int32_t dim, + const miopenReduceExtremeOp_t reduceExtremeOp, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t indiceDesc, + void* indice) +{ +/* NOTE(review): the lambdas handed to miopen::try_ below contain no return statement, so the miopenStatus_t produced by ReduceExtremeForward (including miopenStatusUnsupportedOp) is discarded at this call site. Confirm that try_ reports such failures by another route. */ + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_ARGMIN) || + reduceExtremeOp == MIOPEN_REDUCE_EXTREME_ARGMAX) + { + MIOPEN_LOG_FUNCTION(handle, xDesc, x, dim, reduceExtremeOp, indiceDesc, indice); + + LogCmdReduceExtreme(xDesc, dim, reduceExtremeOp, true); + + return miopen::try_([&] { + miopen::ReduceExtremeForward(miopen::deref(handle), + miopen::deref(xDesc), + DataCast(x), + miopen::deref(indiceDesc), + DataCast(indice), + dim, + reduceExtremeOp); + }); + } + else + { + MIOPEN_LOG_FUNCTION(handle, xDesc, x, dim, reduceExtremeOp, yDesc, y, indiceDesc, indice); + + LogCmdReduceExtreme(xDesc, dim, reduceExtremeOp, true); + return miopen::try_([&] { + miopen::ReduceExtremeForward(miopen::deref(handle), + miopen::deref(xDesc), + DataCast(x), + miopen::deref(yDesc), + DataCast(y), + miopen::deref(indiceDesc), + DataCast(indice), + dim, + reduceExtremeOp); + }); + } +} diff --git a/src/solver.cpp b/src/solver.cpp index f0560adfff..856ac443b0 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -27,7 +27,9 @@ #include #include +#include #include +#include #include #include #include @@ -648,9 +650,16 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId()); Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId()); + Register(registry, ++id, Primitive::Reduce, reduce::ArgminForward{}.SolverDbId()); + Register(registry, ++id, Primitive::Reduce, reduce::MaxForward{}.SolverDbId()); + Register(registry, ++id, Primitive::Reduce, 
reduce::MinForward{}.SolverDbId()); + Register(registry, ++id, Primitive::Mha, mha::MhaForward{}.SolverDbId()); Register(registry, ++id, Primitive::Mha, mha::MhaBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::Cat, cat::CatForward{}.SolverDbId()); + Register(registry, ++id, Primitive::Adam, adam::Adam{}.SolverDbId()); + // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/adam/adam.cpp b/src/solver/adam/adam.cpp new file mode 100644 index 0000000000..968c1a4f00 --- /dev/null +++ b/src/solver/adam/adam.cpp @@ -0,0 +1,220 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace adam { +/* Adam::IsApplicable: rejects problems whose tensors are not all packed (IsAllPacked) and problems flagged as AdamW (IsAdamW); every other problem is accepted. The execution context is not consulted. */ +bool Adam::IsApplicable([[maybe_unused]] const ExecutionContext& context, + const miopen::adam::ProblemDescription& problem) const +{ + if(!problem.IsAllPacked()) + return false; + if(problem.IsAdamW()) + return false; + return true; +} +/* Adam::GetSolution: assembles the kernel description and invoker for the problem. Build parameters are PTYPE (parameter dtype), GTYPE (gradient dtype when IsAmp() or ExistStepTensor() holds, "float" otherwise) and CTYPE ("double" when the parameter element size exceeds 4, "float" otherwise). */ +ConvSolution Adam::GetSolution(const ExecutionContext& context, + const miopen::adam::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + { + auto param_dtype = miopen::GetDataType(problem.GetParamDesc().GetType()); + auto ptype_size = miopen::get_data_size(problem.GetParamDesc().GetType()); + auto grad_dtype = (problem.IsAmp() || problem.ExistStepTensor()) + ? miopen::GetDataType(problem.GetGradDesc().GetType()) + : "float"; + + const auto build_params = KernelBuildParameters{ + {"PTYPE", param_dtype}, + {"GTYPE", grad_dtype}, + {"CTYPE", ptype_size > 4 ? "double" : "float"}, + }; +/* Single-dimension launch sizes: l_wk receives 256 and g_wk receives GetMaxComputeUnits() * 4 * 256. */ + constexpr size_t local_size = 256; + auto& handle = context.GetStream(); + auto numCu = handle.GetMaxComputeUnits(); + auto grid_size = numCu * 4 * local_size; + + auto kernel = KernelInfo{}; + + kernel.l_wk.push_back(local_size); + kernel.g_wk.push_back(grid_size); + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); +/* Kernel source file and entry point; the step-tensor check takes precedence over the IsAmp() check when picking the kernel name. */ + kernel.kernel_file = "MIOpenAdam.cpp"; + if(problem.ExistStepTensor()) + { + kernel.kernel_name = "AmpAdamPackedWithStep"; + } + else + { + kernel.kernel_name = problem.IsAmp() ? 
"AmpAdamPacked" : "AdamPacked"; + } + + result.construction_params.push_back(kernel); + + if(problem.ExistStepTensor()) + { + auto kernel_update_step = kernel; + kernel_update_step.kernel_name = "AdamUpdateStep"; + + result.construction_params.push_back(kernel_update_step); + } + } + + if(problem.ExistStepTensor()) + { + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel_adam = handle_.Run(kernels[0]); + decltype(auto) kernel_step = handle_.Run(kernels[1]); + decltype(auto) params = raw_params.CastTo(); + decltype(auto) numel = params.paramDesc->GetElementSize(); + auto elapsed = 0.f; + + kernel_adam(params.paramIn, + params.paramOut, + params.paramOutFloat16, + params.gradIn, + params.expAvgIn, + params.expAvgOut, + params.expAvgSqIn, + params.expAvgSqOut, + params.maxExpAvgSqIn, + params.maxExpAvgSqOut, + params.gradScale, + params.foundInf, + params.stepIn, + params.lr, + params.beta1, + params.beta2, + params.weight_decay, + params.eps, + params.amsgrad, + params.maximize, + numel); + + if(handle_.IsProfilingEnabled()) + elapsed = handle_.GetKernelTime(); + + kernel_step(params.foundInf, params.stepIn, params.stepOut); + + if(handle_.IsProfilingEnabled()) + { + elapsed += handle_.GetKernelTime(); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + } + }; + }; + } + else + { + if(problem.IsAmp()) + { + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + decltype(auto) numel = params.paramDesc->GetElementSize(); + + kernel(params.paramIn, + params.paramOut, + params.paramOutFloat16, + params.gradIn, + params.expAvgIn, + params.expAvgOut, + params.expAvgSqIn, + params.expAvgSqOut, + params.maxExpAvgSqIn, + params.maxExpAvgSqOut, + params.gradScale, + params.foundInf, 
+ params.step, + params.lr, + params.beta1, + params.beta2, + params.weight_decay, + params.eps, + params.amsgrad, + params.maximize, + numel); + }; + }; + } + else + { + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + decltype(auto) numel = params.paramDesc->GetElementSize(); + + kernel(params.paramIn, + params.paramOut, + params.gradIn, + params.expAvgIn, + params.expAvgOut, + params.expAvgSqIn, + params.expAvgSqOut, + params.maxExpAvgSqIn, + params.maxExpAvgSqOut, + params.lr, + params.beta1, + params.beta2, + params.weight_decay, + params.eps, + params.step, + params.amsgrad, + params.maximize, + numel); + }; + }; + } + } + + return result; +} + +} // Namespace adam + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/conv/conv_wino_fury_RxS.cpp b/src/solver/conv/conv_wino_fury_RxS.cpp new file mode 100644 index 0000000000..9367e2737c --- /dev/null +++ b/src/solver/conv/conv_wino_fury_RxS.cpp @@ -0,0 +1,415 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include + +#include +#include +#include +#if !MIOPEN_USE_COMGR +#include +#endif +#include + +#define WORKAROUND_SWDEV_453577 1 + +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F3X2) + +#define IS2X3 (Winodata == 2 && Winofilter == 3) +#define IS3X2 (Winodata == 3 && Winofilter == 2) + +namespace miopen { +namespace solver { +namespace conv { + +using ProblemDescription = miopen::conv::ProblemDescription; +using WinoShaderArgsV2 = miopen::conv::WinoShaderArgsV2; +using WinoShaderActivationModeV2_t = miopen::conv::WinoShaderActivationModeV2_t; +using WinoShaderFlagsV2 = miopen::conv::WinoShaderFlagsV2; + +namespace { + +constexpr std::size_t sync_buffer_size = 2048; // 2K + +// Template is used to catch -Wshift-count-overflow +template +constexpr uint32_t PowOf2() +{ + return 1U << exp; +} + +// Divide two non-negative integers and return ceil of the quotient +constexpr uint64_t DivCeil(uint64_t numer, uint64_t denom) { return (numer + denom - 1) / denom; } + +constexpr uint64_t RoundUpToMultiple(uint64_t val, uint64_t mul) { return DivCeil(val, mul) * mul; } + +// Number of thread groups +uint32_t GetNGroups(uint64_t cu_count) +{ + // Current limitations: + // n_groups < 2^8 + constexpr uint64_t max_n_groups = PowOf2<8>() - 1; + + return std::min(cu_count, max_n_groups); +} + +bool 
IsShaderConstraintsMetV2(const WinoShaderArgsV2& args, uint32_t n_groups) +{ + // Current limitations: + // clang-format off + return args.N < PowOf2<16>() + && args.C < PowOf2<16>() + && args.H < PowOf2<16>() + && args.W < PowOf2<16>() + && args.pad_h >= std::numeric_limits::min() && args.pad_h <= std::numeric_limits::max() + && args.pad_w >= std::numeric_limits::min() && args.pad_w <= std::numeric_limits::max() + && args.out_h < PowOf2<16>() + && args.out_w < PowOf2<16>() - 3 + && args.R <= 3 + && args.S <= 3 + && (static_cast(args.N - 1) * args.C + 1) * args.H * args.W < PowOf2<31>() + && (static_cast(args.N - 1) * args.K + 1) * args.out_h * args.out_w < PowOf2<31>() + && DivCeil(args.K, 16) <= n_groups + && args.G == 1; + // clang-format on +} + +bool IsShaderConstraintsMet(const WinoShaderArgsV2& args, uint32_t n_groups) +{ + return IsShaderConstraintsMetV2(args, n_groups); +} + +bool GpuHasReducedVGPRMem(const std::string& dev_name) +{ + if(dev_name == "gfx1100" || dev_name == "gfx1101" || dev_name == "gfx1151") + return false; + return true; +} + +class ShaderModel +{ + const uint64_t N, C, K, R, S, oH, oW, G; + const uint64_t n_groups; + const uint32_t cu_count; + const bool reduced_vgpr; + + struct PerfModelInfo + { + uint64_t predicted_clk; + float granularity_loss; + }; + +public: + ShaderModel(const ExecutionContext& ctx, + const WinoShaderArgsV2& args, + uint32_t cu_cnt, + uint32_t n_grp, + bool reduced_vgpr_mem) + : N(args.N), + C(args.C), + K(args.K), + R(args.R), + S(args.S), + oH(args.out_h), + oW(args.out_w), + G(args.G), + n_groups(n_grp), + cu_count(cu_cnt), + reduced_vgpr(reduced_vgpr_mem) + { + std::ignore = ctx; + } + + bool IsC32ModePreferable() const + { + PerfModelInfo perf_model_c16, perf_model_c32; + perf_model_c16 = PerfPrediction(false); + perf_model_c32 = PerfPrediction(true); + return perf_model_c32.predicted_clk <= perf_model_c16.predicted_clk; + } + +private: + PerfModelInfo PerfPrediction(bool c32_mode) const + { + constexpr 
uint64_t t_R = 3; + constexpr uint64_t t_S = 3; + constexpr uint64_t t_oH = 2; + constexpr uint64_t t_oW = 2; + + constexpr uint64_t nhw_factor = 62; + constexpr uint64_t k_factor = 16; + const uint64_t c_factor = c32_mode ? 32 : 16; + constexpr uint64_t nhw_factor_g = RoundUpToMultiple(nhw_factor, 32); + + const uint64_t Rg = RoundUpToMultiple(R, t_R); + const uint64_t Sg = RoundUpToMultiple(S, t_S); + const uint64_t Cg = RoundUpToMultiple(C, c_factor); + const uint64_t Kg = RoundUpToMultiple(K, k_factor); + const uint64_t oHg = RoundUpToMultiple(oH, t_oH); + const uint64_t oWg = RoundUpToMultiple(oW, t_oW) + t_oW; + + const uint64_t c_loops = Cg / c_factor; + const uint64_t k_ways = Kg / k_factor; + + const uint64_t nkhw_per_work = k_factor * nhw_factor_g * t_oH * t_oW; + + const uint64_t nhw_tiles = N * DivCeil(oHg, t_oH) * DivCeil(oWg, t_oW); + const uint64_t n_groups_e = k_ways * (n_groups / k_ways); + const uint64_t n_works = k_ways * DivCeil(nhw_tiles, nhw_factor); + const uint64_t n_works_per_cu = + DivCeil(n_works, n_groups_e) * DivCeil(n_groups_e, cu_count); + + const uint64_t macsg = n_works_per_cu * cu_count * nkhw_per_work * Cg * Rg * Sg; + const uint64_t macs = N * G * K * C * oH * R * oW * S; + + PerfModelInfo out; + out.granularity_loss = static_cast(macsg - macs) / macsg; + + const uint64_t n_works_per_filter = reduced_vgpr ? 5 : 10; + const uint64_t f_relaods = c_loops == 1 ? 1 : DivCeil(n_works_per_cu, n_works_per_filter); + + const uint64_t ph_start = c32_mode ? 4 : 6; + const uint64_t ph_accum = n_works_per_cu * (c_loops - 1); + const uint64_t ph_activ = n_works_per_cu; + const uint64_t ph_filter = f_relaods * c_loops; + + // Constant parameters of the model valid for gfx1100. Values for other ASICs may be + // different, however as an approximate heuristic for choosing between C16 and C32 + // modes it would be enough. + const uint64_t clk_start = c32_mode ? 2600 : 1450; + const uint64_t clk_accum = c32_mode ? 
2938 : 1645; + const uint64_t clk_activ = c32_mode ? 2989 : 1696; + const uint64_t clk_filter = c32_mode ? 2600 : 1450; + + out.predicted_clk = ph_start * clk_start + ph_accum * clk_accum + ph_activ * clk_activ + + ph_filter * clk_filter; + + return out; + } +}; + +} // namespace + +template +bool ConvWinoFuryRxS::IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const +{ + if constexpr(IS2X3) + { + if(miopen::IsDisabled(ENV(MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3))) + return false; + } + if constexpr(IS3X2) + { + if(miopen::IsDisabled(ENV(MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F3X2))) + return false; + } + + if(!ctx.use_asm_kernels) + return false; + if(problem.IsTensorsCasted()) + return false; + if(!problem.IsFp16()) + return false; + if(problem.HasNonPackedTensors()) + return false; + + const auto dev_name = ctx.GetStream().GetDeviceName(); + // All gfx11 ASICs are supported + if(!StartsWith(dev_name, "gfx11")) + return false; + + if(!(problem.GetKernelStrideH() == 1 && problem.GetKernelStrideW() == 1)) + return false; + if(!(problem.GetDilationH() == 1 && problem.GetDilationW() == 1)) + return false; + + WinoShaderArgsV2 args; + if(!args.SetConvParams(problem)) + return false; + + const auto cu_count = ctx.GetStream().GetMaxHardwareComputeUnits(); + const auto n_groups = GetNGroups(cu_count); + + return IsShaderConstraintsMet(args, n_groups); +} + +template +float ConvWinoFuryRxS::GetWti(const ExecutionContext& ctx, + const ProblemDescription& problem) const +{ + std::ignore = ctx; + std::ignore = problem; + + return -2.0; // Unknown WTI +} + +template +size_t +ConvWinoFuryRxS::GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const +{ + std::ignore = problem; + + const bool coop_launch = ctx.GetStream().CooperativeLaunchSupported(); + return coop_launch ? 
sync_buffer_size : 0; // 2KB buffer for global sync +} + +template +ConvSolution +ConvWinoFuryRxS::GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const +{ + const auto dev_name = ctx.GetStream().GetDeviceName(); + const auto cu_count = ctx.GetStream().GetMaxHardwareComputeUnits(); + const auto n_groups = GetNGroups(cu_count); + const bool reduced_vgpr_mem = GpuHasReducedVGPRMem(dev_name); +#if WORKAROUND_SWDEV_453577 + const bool coop_launch = false; +#else + const bool coop_launch = ctx.GetStream().CooperativeLaunchSupported(); +#endif + + constexpr size_t wg_size = 384; + + WinoShaderArgsV2 args; + // Main convolution parameters + if(!args.SetConvParams(problem)) + { + MIOPEN_THROW(miopenStatusInternalError); + } + + const auto shader_model = ShaderModel(ctx, args, cu_count, n_groups, reduced_vgpr_mem); + // For ASICs with reduced VGPR memory we have only c16 kernel + const bool c32_mode = reduced_vgpr_mem ? false : shader_model.IsC32ModePreferable(); + + // Warning + static bool IsWarned = false; + if(!IsWarned) + { + if(cu_count != n_groups) + { + MIOPEN_LOG_WE(SolverDbId() + << ": GPU has " << cu_count << " CUs, but this solver supports max " + << n_groups << " and thus may show sub-optimal performance."); + } + IsWarned = true; + } + + // Kernel name & file + const std::string kernel_version = "_v2_4_1"; + std::string kernel_name = "miopenSp3AsmConvFury" + kernel_version; + std::string kernel_file = "Conv_Winograd_Fury" + kernel_version; + + if(StartsWith(dev_name, "gfx11")) + { + kernel_name += "_gfx11"; + kernel_name += reduced_vgpr_mem ? "_1024vgprs" : "_1536vgprs"; + } + else + { + MIOPEN_THROW(miopenStatusInternalError); + } + + std::string kernel_postfix; + + if(problem.IsFp16()) + { + kernel_postfix += "_fp16_fp16acc"; + } + else + { + MIOPEN_THROW(miopenStatusInternalError); + } + + kernel_postfix += IS2X3 ? "_f2x3" : "_f3x2"; + kernel_postfix += c32_mode ? 
"_c32" : "_c16"; + kernel_postfix += "_stride1"; + + kernel_name += kernel_postfix; + kernel_file += kernel_postfix + ".s"; + + // KernelInfo + KernelInfo kernel; + +#if !MIOPEN_USE_COMGR + KernelBuildParameters options{ + {"ROCM_METADATA_VERSION", 5}, // For AmdgcnAssemble(...) + }; + kernel.comp_options = options.GenerateFor(kbp::GcnAsm{}); +#endif + kernel.comp_options += std::string(" -mcumode -mwavefrontsize64"); + + kernel.l_wk.push_back(wg_size); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + + kernel.g_wk.push_back(wg_size * n_groups); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + + kernel.kernel_file = kernel_file; + kernel.kernel_name = kernel_name; + + // Data layout related parameters + args.SetStrides(problem); + + // Fused activation parameters + args.SetActivParams(WinoShaderActivationModeV2_t::IDENTITY, 0.0f, 0.0f); + + // Other shader parameters + auto flags = WinoShaderFlagsV2::F_NKCHR_STRIDES | WinoShaderFlagsV2::F_TENSOR_OFFSETS | + WinoShaderFlagsV2::F_USE_ACTIVATION_MODE | + WinoShaderFlagsV2::F_USE_EXTENDED_FLAGS_64; + if(problem.IsDirectionBackwardData()) + flags |= WinoShaderFlagsV2::F_REVERSE_R | WinoShaderFlagsV2::F_REVERSE_S; + + uint8_t sync_limit = 0; + uint8_t sync_period = 0; + if(coop_launch) + { + sync_limit = 255; + sync_period = c32_mode ? 3 : 4; + } + args.SetShaderParams(n_groups, flags, sync_limit, sync_period); + + // Solution + ConvSolution result; + result.construction_params.push_back(kernel); + result.invoker_factory = miopen::conv::MakeGcnAsmWinoV2InvokerFactory( + args, problem.GetDirection(), coop_launch ? 
sync_buffer_size : 0); + result.workspace_sz = GetWorkspaceSize(ctx, problem); + + return result; +} + +template struct ConvWinoFuryRxS<2, 3>; +// template struct ConvWinoFuryRxS<3, 2>; + +} // namespace conv +} // namespace solver +} // namespace miopen diff --git a/src/solver/conv_winoRxS.cpp b/src/solver/conv_winoRxS.cpp index ed11fa9aaf..3d2b8f3439 100644 --- a/src/solver/conv_winoRxS.cpp +++ b/src/solver/conv_winoRxS.cpp @@ -434,6 +434,8 @@ ConvBinWinoRxS::Search(const ExecutionContext& ctx, return GenericSearch(*this, ctx, problem, invoke_ctx); } +namespace { + class ShaderModel : public UnifiedDescriptionConv2d { static constexpr size_t NHW_tiles_factor = 32; @@ -575,7 +577,7 @@ class ShaderModel : public UnifiedDescriptionConv2d n_works_per_CU = Ceil(n_works, n_groups) * Ceil(G * n_groups, n_CU); } - size_t GetNGroups() const noexcept { return n_groups; } + [[maybe_unused]] size_t GetNGroups() const noexcept { return n_groups; } double ComputeWti() const noexcept { @@ -636,9 +638,11 @@ class ShaderModel : public UnifiedDescriptionConv2d return WTI_predicted; } - double GetGranularityLoss() const { return granularity_loss; } + [[maybe_unused]] double GetGranularityLoss() const { return granularity_loss; } }; +} // namespace + template static float GetWtiBase(const ExecutionContext& ctx, const ProblemDescription& problem) { diff --git a/src/solver/layernorm/backward_t5layernorm.cpp b/src/solver/layernorm/backward_t5layernorm.cpp new file mode 100644 index 0000000000..c62a756b77 --- /dev/null +++ b/src/solver/layernorm/backward_t5layernorm.cpp @@ -0,0 +1,382 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace layernorm { + +bool T5LayernormBackward::IsApplicable(const ExecutionContext&, + const miopen::layernorm::ProblemDescription& problem) const +{ + if(!problem.IsSameType()) + return false; + if(!problem.IsSameLength()) + return false; + if(!problem.IsAllPacked()) + return false; + if(!(sizeof_local_memory_t5(problem) <= TargetProperties::GetMaxLocalMemorySize())) + return false; + return true; +} + +ConvSolution +T5LayernormBackward::GetSolution(const ExecutionContext& context, + const miopen::layernorm::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetDYDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetDYDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetDXDesc().GetType()); + auto dims = problem.GetDYDesc().GetLengths(); + + auto outer_size = + std::accumulate(dims.begin(), dims.end() - 1, 1ULL, std::multiplies()); + auto inner_size = dims[dims.size() - 1]; + + auto reqd_work_item_cnt = get_reqd_work_item_cnt(context); + + { + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = outer_size * xlocalsize; + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenLayerNorm.cpp"; + kernel.kernel_name = "T5LayernormBwdContiguous"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + {"LOCAL_SIZE", LOCAL_SIZE}, + {"MIOPEN_ELEMENTWISE_AFFINE", 0}, + {"MIOPEN_WEIGHT_BIAS", 1}, + {"MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD", 2}, + {"MIOPEN_WEIGHT_BIAS_FUSED_ADD", 3}, + {"MIOPEN_ELEMENTWISE_AFFINE_T5", 4}, + {"MIOPEN_WEIGHT_BIAS_T5", 5}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + if(is_parallelism(reqd_work_item_cnt, inner_size, outer_size)) + { + { + auto parallelism_size = + get_parallelism_size(reqd_work_item_cnt, inner_size, outer_size); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(parallelism_size * inner_size, xlocalsize); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenLayerNorm.cpp"; + kernel.kernel_name = "T5LayernormBwdWeightContiguousParallel"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + {"LOCAL_SIZE", LOCAL_SIZE}, + {"MIOPEN_ELEMENTWISE_AFFINE", 0}, + {"MIOPEN_WEIGHT_BIAS", 1}, + {"MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD", 2}, + {"MIOPEN_WEIGHT_BIAS_FUSED_ADD", 3}, + {"MIOPEN_ELEMENTWISE_AFFINE_T5", 4}, + {"MIOPEN_WEIGHT_BIAS_T5", 5}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + { + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(inner_size, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenLayerNorm.cpp"; + kernel.kernel_name = "T5LayernormBwdContiguousReduceSum"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + {"LOCAL_SIZE", LOCAL_SIZE}, + {"MIOPEN_ELEMENTWISE_AFFINE", 0}, + {"MIOPEN_WEIGHT_BIAS", 1}, + {"MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD", 2}, + {"MIOPEN_WEIGHT_BIAS_FUSED_ADD", 3}, + {"MIOPEN_ELEMENTWISE_AFFINE_T5", 4}, + {"MIOPEN_WEIGHT_BIAS_T5", 5}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + } + else + { + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = inner_size; + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenLayerNorm.cpp"; + kernel.kernel_name = "T5LayernormBwdWeightContiguous"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + {"LOCAL_SIZE", LOCAL_SIZE}, + {"MIOPEN_ELEMENTWISE_AFFINE", 0}, + {"MIOPEN_WEIGHT_BIAS", 1}, + {"MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD", 2}, + {"MIOPEN_WEIGHT_BIAS_FUSED_ADD", 3}, + {"MIOPEN_ELEMENTWISE_AFFINE_T5", 4}, + {"MIOPEN_WEIGHT_BIAS_T5", 5}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + if(is_parallelism(reqd_work_item_cnt, inner_size, outer_size)) + { + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels[0]); + decltype(auto) weight_parallel_kernel = handle_.Run(kernels[1]); + decltype(auto) weight_kernel = handle_.Run(kernels[2]); + decltype(auto) params = raw_params.CastTo(); + + auto dims = params.dyDesc->GetLengths(); + + auto outer_size = + std::accumulate(dims.begin(), dims.end() - 1, 1ULL, std::multiplies()); + + auto inner_size = dims[dims.size() - 1]; + + auto reqd_work_item_cnt = get_reqd_work_item_cnt(handle_); + auto parallelism_size = + get_parallelism_size(reqd_work_item_cnt, inner_size, outer_size); + + auto elapsed = 0.f; + HipEventPtr start; + HipEventPtr stop; + + if(handle_.IsProfilingEnabled()) + { + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + kernel(params.dy, + params.x, + params.weight, + params.rstd, + params.dx, + inner_size, + static_cast(params.mode % 2)); + + weight_parallel_kernel(params.dy, + params.x, + params.rstd, + params.workspace, + outer_size, + inner_size, + parallelism_size); + + weight_kernel(params.workspace, params.dw, inner_size, parallelism_size); + + 
if(handle_.IsProfilingEnabled()) + { + hipEventRecord(stop.get(), handle_.GetStream()); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + }; + }; + }; + } + else + { + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels[0]); + decltype(auto) weight_kernel = handle_.Run(kernels[1]); + decltype(auto) params = raw_params.CastTo(); + + auto dims = params.dyDesc->GetLengths(); + + auto outer_size = + std::accumulate(dims.begin(), dims.end() - 1, 1ULL, std::multiplies()); + + auto inner_size = dims[dims.size() - 1]; + + auto elapsed = 0.f; + HipEventPtr start; + HipEventPtr stop; + + if(handle_.IsProfilingEnabled()) + { + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + kernel(params.dy, + params.x, + params.weight, + params.rstd, + params.dx, + inner_size, + static_cast(params.mode % 2)); + + weight_kernel(params.dy, params.x, params.rstd, params.dw, outer_size, inner_size); + + if(handle_.IsProfilingEnabled()) + { + hipEventRecord(stop.get(), handle_.GetStream()); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + }; + }; + }; + } + + return result; +} + +std::size_t +T5LayernormBackward::GetWorkspaceSize(const ExecutionContext& context, + const miopen::layernorm::ProblemDescription& problem) const +{ + auto dims = problem.GetDYDesc().GetLengths(); + + auto outer_size = + std::accumulate(dims.begin(), dims.end() - 1, 1ULL, std::multiplies()); + + auto inner_size = dims[dims.size() - 1]; + + auto reqd_work_item_cnt = get_reqd_work_item_cnt(context); + + if(is_parallelism(reqd_work_item_cnt, inner_size, outer_size)) + { + auto parallelism_size = 
get_parallelism_size(reqd_work_item_cnt, inner_size, outer_size); + + return parallelism_size * inner_size * get_data_size(problem.GetXDesc().GetType()); + } + + return 0; +} + +} // namespace layernorm + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/layernorm/forward_addlayernorm.cpp b/src/solver/layernorm/forward_addlayernorm.cpp new file mode 100644 index 0000000000..ba366b318d --- /dev/null +++ b/src/solver/layernorm/forward_addlayernorm.cpp @@ -0,0 +1,152 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace layernorm { + +bool AddLayernormForward::IsApplicable(const ExecutionContext&, + const miopen::layernorm::ProblemDescription& problem) const +{ + if(!problem.IsSameType()) + return false; + if(!problem.IsSameLength()) + return false; + if(!problem.IsAllPacked()) + return false; + if(!problem.IsRightNormDim()) + return false; + if(!(sizeof_local_memory(problem) <= TargetProperties::GetMaxLocalMemorySize())) + return false; + return true; +} + +ConvSolution +AddLayernormForward::GetSolution(const ExecutionContext& context, + const miopen::layernorm::ProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + + { + auto dtype = problem.GetXDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); + auto dims = problem.GetXDesc().GetLengths(); + + size_t outer_size = 1; + for(size_t i = 0; i < problem.GetNormalizedDim(); i++) + { + outer_size *= dims[i]; + } + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = outer_size * xlocalsize; + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenLayerNorm.cpp"; + kernel.kernel_name = "AddLayernormFwdContiguous"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + {"LOCAL_SIZE", LOCAL_SIZE}, + {"MIOPEN_ELEMENTWISE_AFFINE", 0}, + {"MIOPEN_WEIGHT_BIAS", 1}, + {"MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD", 2}, + {"MIOPEN_WEIGHT_BIAS_FUSED_ADD", 3}, + {"MIOPEN_ELEMENTWISE_AFFINE_T5", 4}, + {"MIOPEN_WEIGHT_BIAS_T5", 5}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto dims = params.xDesc->GetLengths(); + size_t inner_size = 1; + + for(size_t i = params.normalized_dim; i < dims.size(); i++) + { + inner_size *= dims[i]; + } + + kernel(params.x, + params.x2, + params.weight, + params.bias, + params.y, + params.mean, + params.rstd, + params.epsilon, + inner_size, + static_cast(params.mode % 2)); + }; + }; + + return result; +} + +} // namespace layernorm + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/layernorm/forward_layernorm.cpp b/src/solver/layernorm/forward_layernorm.cpp index 0b625eb2fc..ffbe479f1f 100644 --- a/src/solver/layernorm/forward_layernorm.cpp +++ b/src/solver/layernorm/forward_layernorm.cpp @@ -24,12 +24,12 @@ * *******************************************************************************/ -#include - -#include #include -#include #include +#include +#include +#include +#include #include #define LOCAL_SIZE 256 @@ -40,19 +40,6 @@ namespace solver { namespace layernorm { -std::size_t sizeof_kernel_FLOAT(const miopen::layernorm::ProblemDescription& problem) -{ - const auto datatype = problem.GetXDesc().GetType(); - return 
get_data_size(datatype); -} - -std::size_t sizeof_local_memory(const miopen::layernorm::ProblemDescription& problem) -{ - std::size_t rv = 0; - rv += LOCAL_SIZE * sizeof_kernel_FLOAT(problem) * 2; - return rv; -} - bool LayernormForward::IsApplicable(const ExecutionContext&, const miopen::layernorm::ProblemDescription& problem) const { @@ -78,8 +65,10 @@ LayernormForward::GetSolution(const ExecutionContext& context, auto result = ConvSolution{miopenStatusSuccess}; { - auto dtype = problem.GetXDesc().GetType(); - auto dims = problem.GetXDesc().GetLengths(); + auto dtype = problem.GetXDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); + auto dims = problem.GetXDesc().GetLengths(); size_t outer_size = 1; for(size_t i = 0; i < problem.GetNormalizedDim(); i++) @@ -102,9 +91,16 @@ LayernormForward::GetSolution(const ExecutionContext& context, const auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, - {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, {"LOCAL_SIZE", LOCAL_SIZE}, + {"MIOPEN_ELEMENTWISE_AFFINE", 0}, + {"MIOPEN_WEIGHT_BIAS", 1}, + {"MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD", 2}, + {"MIOPEN_WEIGHT_BIAS_FUSED_ADD", 3}, + {"MIOPEN_ELEMENTWISE_AFFINE_T5", 4}, + {"MIOPEN_WEIGHT_BIAS_T5", 5}, }; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); @@ -134,9 +130,9 @@ LayernormForward::GetSolution(const ExecutionContext& context, } kernel(params.x, - params.y, params.weight, params.bias, + params.y, params.mean, params.rstd, params.epsilon, diff --git a/src/solver/layernorm/forward_t5layernorm.cpp b/src/solver/layernorm/forward_t5layernorm.cpp new file mode 100644 index 0000000000..9729b426a8 --- /dev/null +++ b/src/solver/layernorm/forward_t5layernorm.cpp @@ -0,0 +1,139 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace layernorm { + +bool T5LayernormForward::IsApplicable(const ExecutionContext&, + const miopen::layernorm::ProblemDescription& problem) const +{ + if(!problem.IsSameType()) + return false; + if(!problem.IsSameLength()) + return false; + if(!problem.IsAllPacked()) + return false; + if(!(sizeof_local_memory_t5(problem) <= TargetProperties::GetMaxLocalMemorySize())) + return false; + return true; +} + +ConvSolution +T5LayernormForward::GetSolution(const ExecutionContext& context, + const miopen::layernorm::ProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + + { + auto dtype = problem.GetXDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); + auto dims = problem.GetXDesc().GetLengths(); + + auto outer_size = + std::accumulate(dims.begin(), dims.end() - 1, 1ULL, std::multiplies()); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = outer_size * xlocalsize; + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenLayerNorm.cpp"; + kernel.kernel_name = "T5LayernormFwdContiguous"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype 
== miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}, + {"LOCAL_SIZE", LOCAL_SIZE}, + {"MIOPEN_ELEMENTWISE_AFFINE", 0}, + {"MIOPEN_WEIGHT_BIAS", 1}, + {"MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD", 2}, + {"MIOPEN_WEIGHT_BIAS_FUSED_ADD", 3}, + {"MIOPEN_ELEMENTWISE_AFFINE_T5", 4}, + {"MIOPEN_WEIGHT_BIAS_T5", 5}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto dims = params.xDesc->GetLengths(); + size_t inner_size = dims[dims.size() - 1]; + + kernel(params.x, + params.weight, + params.y, + params.rstd, + params.epsilon, + inner_size, + static_cast(params.mode % 2)); + }; + }; + + return result; +} + +} // namespace layernorm + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/reduce/forward_argmax.cpp b/src/solver/reduce/forward_argmax.cpp index ecdffa1ea5..4a44887ea4 100644 --- a/src/solver/reduce/forward_argmax.cpp +++ b/src/solver/reduce/forward_argmax.cpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include #include #include #include @@ -39,19 +38,19 @@ namespace solver { namespace reduce { -size_t XGridSize(std::vector ydims) +size_t ArgmaxForward::XGridSize(std::vector indicedims) const { - auto output_numel = - std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); - return AlignUp(output_numel, LOCAL_SIZE); + auto indice_numel = + 
std::accumulate(indicedims.begin(), indicedims.end(), 1ULL, std::multiplies()); + return AlignUp(indice_numel, LOCAL_SIZE); } /// \todo https://github.com/ROCm/MIOpen/pull/2583#discussion_r1437054128 -bool OverMaxGridSize(const ExecutionContext& context, - const miopen::reduce::ProblemDescription& problem) +bool ArgmaxForward::OverMaxGridSize(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const { - auto ydims = problem.GetYDesc().GetLengths(); - if(XGridSize(ydims) > context.GetStream().GetImage3dMaxWidth()) + auto indicedims = problem.GetIndiceDesc().GetLengths(); + if(XGridSize(indicedims) > context.GetStream().GetImage3dMaxWidth()) return false; return true; } @@ -59,11 +58,13 @@ bool OverMaxGridSize(const ExecutionContext& context, bool ArgmaxForward::IsApplicable(const ExecutionContext& context, const miopen::reduce::ProblemDescription& problem) const { - if(!problem.IsRightDim()) + if(!problem.IsValidDim()) return false; - if(!problem.IsRightLength()) + if(!problem.IsValidLengthIndice()) return false; - if(!problem.IsAllPacked()) + if(!problem.IsValidInputNumel()) + return false; + if(!problem.IsAllPackedIndice()) return false; if(!problem.IsNotLastDim()) return false; @@ -77,10 +78,11 @@ ConvSolution ArgmaxForward::GetSolution(const ExecutionContext&, { auto result = ConvSolution{miopenStatusSuccess}; + auto dtype = problem.GetXDesc().GetType(); auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); - auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); + auto indice_dtype = miopen::GetDataType(problem.GetIndiceDesc().GetType()); auto xdims = problem.GetXDesc().GetLengths(); - auto ydims = problem.GetYDesc().GetLengths(); + auto indicedims = problem.GetIndiceDesc().GetLengths(); { size_t xlocalsize; @@ -92,15 +94,23 @@ ConvSolution ArgmaxForward::GetSolution(const ExecutionContext&, auto kernel = KernelInfo{}; - kernel.kernel_file = "MIOpenArgmax.cpp"; - kernel.kernel_name = 
"ArgmaxFwdContiguous"; + kernel.kernel_file = "MIOpenReduceExtreme.cpp"; + kernel.kernel_name = "ExtremeFwdContiguous"; xlocalsize = LOCAL_SIZE; - xgridsize = XGridSize(ydims); + xgridsize = XGridSize(indicedims); const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, - {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}, - }; + {"OUTPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"INDICE_TYPE", indice_dtype}, + {"OP_TYPE", "ReduceExtremeOp_t::Argmax"}, + {"MIOPEN_REDUCE_EXTREME_ARGMIN", MIOPEN_REDUCE_EXTREME_ARGMIN}, + {"MIOPEN_REDUCE_EXTREME_ARGMAX", MIOPEN_REDUCE_EXTREME_ARGMAX}, + {"MIOPEN_REDUCE_EXTREME_MIN", MIOPEN_REDUCE_EXTREME_MIN}, + {"MIOPEN_REDUCE_EXTREME_MAX", MIOPEN_REDUCE_EXTREME_MAX}}; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); @@ -120,18 +130,18 @@ ConvSolution ArgmaxForward::GetSolution(const ExecutionContext&, decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - auto xdims = params.xDesc->GetLengths(); - auto ydims = params.yDesc->GetLengths(); - auto dim = params.dim; + auto xdims = params.xDesc->GetLengths(); + auto indicedims = params.indiceDesc->GetLengths(); + auto dim = params.dim; int32_t reduce_size = static_cast(xdims[dim]); - auto output_numel = - std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); + auto indice_numel = std::accumulate( + indicedims.begin(), indicedims.end(), 1ULL, std::multiplies()); auto inner_size = std::accumulate( xdims.begin() + dim + 1, xdims.end(), 1ULL, std::multiplies()); - kernel(params.x, params.y, output_numel, reduce_size, inner_size); + kernel(params.x, nullptr, params.indice, indice_numel, reduce_size, inner_size); }; }; diff --git 
a/src/solver/reduce/forward_argmin.cpp b/src/solver/reduce/forward_argmin.cpp new file mode 100644 index 0000000000..c0b3d15aa0 --- /dev/null +++ b/src/solver/reduce/forward_argmin.cpp @@ -0,0 +1,155 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#include
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256 // used below as xlocalsize and as the AlignUp unit for the X grid size
+
+namespace miopen {
+
+namespace solver {
+
+namespace reduce {
+/// Multiplies all lengths in `indicedims` and returns AlignUp(product, LOCAL_SIZE).
+size_t ArgminForward::XGridSize(std::vector indicedims) const
+{
+    auto indice_numel =
+        std::accumulate(indicedims.begin(), indicedims.end(), 1ULL, std::multiplies());
+    return AlignUp(indice_numel, LOCAL_SIZE); // presumably rounds up to a multiple of LOCAL_SIZE
+}
+/// Returns false when the X grid size exceeds GetImage3dMaxWidth(); true means "within the limit".
+/// \todo https://github.com/ROCm/MIOpen/pull/2583#discussion_r1437054128
+bool ArgminForward::OverMaxGridSize(const ExecutionContext& context,
+                                    const miopen::reduce::ProblemDescription& problem) const
+{
+    auto indicedims = problem.GetIndiceDesc().GetLengths();
+    if(XGridSize(indicedims) > context.GetStream().GetImage3dMaxWidth())
+        return false;
+    return true;
+}
+/// True only when every problem predicate below holds and the grid-size check passes.
+bool ArgminForward::IsApplicable(const ExecutionContext& context,
+                                 const miopen::reduce::ProblemDescription& problem) const
+{
+    if(!problem.IsValidDim())
+        return false;
+    if(!problem.IsValidLengthIndice())
+        return false;
+    if(!problem.IsValidInputNumel())
+        return false;
+    if(!problem.IsAllPackedIndice())
+        return false;
+    if(!problem.IsNotLastDim())
+        return false;
+    if(!OverMaxGridSize(context, problem)) // false here means the grid would be too large
+        return false;
+    return true;
+}
+/// Builds one KernelInfo for "ExtremeFwdContiguous" (OP_TYPE Argmin) and the invoker that runs it.
+ConvSolution ArgminForward::GetSolution(const ExecutionContext&,
+                                        const miopen::reduce::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto dtype        = problem.GetXDesc().GetType();
+    auto input_dtype  = miopen::GetDataType(problem.GetXDesc().GetType());
+    auto indice_dtype = miopen::GetDataType(problem.GetIndiceDesc().GetType());
+    auto xdims        = problem.GetXDesc().GetLengths(); // NOTE(review): not referenced in this scope
+    auto indicedims   = problem.GetIndiceDesc().GetLengths();
+
+    {
+        size_t xlocalsize;
+        size_t xgridsize;
+        size_t ylocalsize = 1; // Y and Z extents stay 1: only the X dimension is sized
+        size_t ygridsize  = 1;
+        size_t zlocalsize = 1;
+        size_t zgridsize  = 1;
+
+        auto kernel = KernelInfo{};
+
+        kernel.kernel_file = "MIOpenReduceExtreme.cpp";
+        kernel.kernel_name = "ExtremeFwdContiguous";
+        xlocalsize         = LOCAL_SIZE;
+        xgridsize          = XGridSize(indicedims);
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)},
+            {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, // same as INPUT_TYPE
+            {"INDICE_TYPE", indice_dtype},
+            {"OP_TYPE", "ReduceExtremeOp_t::Argmin"},
+            {"MIOPEN_REDUCE_EXTREME_ARGMIN", MIOPEN_REDUCE_EXTREME_ARGMIN},
+            {"MIOPEN_REDUCE_EXTREME_ARGMAX", MIOPEN_REDUCE_EXTREME_ARGMAX},
+            {"MIOPEN_REDUCE_EXTREME_MIN", MIOPEN_REDUCE_EXTREME_MIN},
+            {"MIOPEN_REDUCE_EXTREME_MAX", MIOPEN_REDUCE_EXTREME_MAX}};
+
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        kernel.l_wk.push_back(xlocalsize);
+        kernel.l_wk.push_back(ylocalsize);
+        kernel.l_wk.push_back(zlocalsize);
+
+        kernel.g_wk.push_back(xgridsize);
+        kernel.g_wk.push_back(ygridsize);
+        kernel.g_wk.push_back(zgridsize);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [](const std::vector& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo();
+
+            auto xdims      = params.xDesc->GetLengths();
+            auto indicedims = params.indiceDesc->GetLengths();
+            auto dim        = params.dim;
+
+            int32_t reduce_size = static_cast(xdims[dim]); // length of the reduced dimension
+            auto indice_numel   = std::accumulate(
+                indicedims.begin(), indicedims.end(), 1ULL, std::multiplies());
+
+            auto inner_size = std::accumulate( // product of the x lengths that follow `dim`
+                xdims.begin() + dim + 1, xdims.end(), 1ULL, std::multiplies());
+            // NOTE(review): nullptr is presumably the unused value-output slot -- confirm against the kernel
+            kernel(params.x, nullptr, params.indice, indice_numel, reduce_size, inner_size);
+        };
+    };
+
+    return result;
+}
+
+} // namespace reduce
+
+} // namespace solver
+
+} // namespace miopen
diff --git
a/src/solver/reduce/forward_max.cpp b/src/solver/reduce/forward_max.cpp new file mode 100644 index 0000000000..9537c300cf --- /dev/null +++ b/src/solver/reduce/forward_max.cpp @@ -0,0 +1,156 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace reduce { + +size_t MaxForward::XGridSize(std::vector ydims) const +{ + auto output_numel = + std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); + return AlignUp(output_numel, LOCAL_SIZE); +} + +/// \todo https://github.com/ROCm/MIOpen/pull/2583#discussion_r1437054128 +bool MaxForward::OverMaxGridSize(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const +{ + auto ydims = problem.GetYDesc().GetLengths(); + if(XGridSize(ydims) > context.GetStream().GetImage3dMaxWidth()) + return false; + return true; +} + +bool MaxForward::IsApplicable(const ExecutionContext& context, + const miopen::reduce::ProblemDescription& problem) const +{ + if(!problem.IsValidDim()) + return false; + if(!problem.IsValidLength()) + return false; + if(!problem.IsAllPackedWithIndice()) + return false; + if(!problem.IsNotLastDim()) + return false; + if(!problem.IsLargeReduceSize()) + return false; + if(!OverMaxGridSize(context, problem)) + return false; + return true; +} + +ConvSolution MaxForward::GetSolution(const ExecutionContext&, + const miopen::reduce::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetXDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); + auto indice_dtype = miopen::GetDataType(problem.GetIndiceDesc().GetType()); + auto xdims = problem.GetXDesc().GetLengths(); + auto ydims = problem.GetYDesc().GetLengths(); + + { + size_t xlocalsize; + size_t xgridsize; + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = 
"MIOpenReduceExtreme.cpp"; + kernel.kernel_name = "ExtremeFwdContiguous"; + xlocalsize = LOCAL_SIZE; + xgridsize = XGridSize(ydims); + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"INDICE_TYPE", indice_dtype}, + {"OP_TYPE", "ReduceExtremeOp_t::Max"}, + {"MIOPEN_REDUCE_EXTREME_ARGMIN", MIOPEN_REDUCE_EXTREME_ARGMIN}, + {"MIOPEN_REDUCE_EXTREME_ARGMAX", MIOPEN_REDUCE_EXTREME_ARGMAX}, + {"MIOPEN_REDUCE_EXTREME_MIN", MIOPEN_REDUCE_EXTREME_MIN}, + {"MIOPEN_REDUCE_EXTREME_MAX", MIOPEN_REDUCE_EXTREME_MAX}}; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto xdims = params.xDesc->GetLengths(); + auto ydims = params.yDesc->GetLengths(); + auto dim = params.dim; + + int32_t reduce_size = static_cast(xdims[dim]); + auto output_numel = + std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); + + auto inner_size = std::accumulate( + xdims.begin() + dim + 1, xdims.end(), 1ULL, std::multiplies()); + + kernel(params.x, params.y, params.indice, output_numel, reduce_size, inner_size); + }; + }; + + return result; +} + +} // namespace reduce + +} // namespace solver + +} // namespace miopen diff --git 
a/src/solver/reduce/forward_min.cpp b/src/solver/reduce/forward_min.cpp new file mode 100644 index 0000000000..f7aae43779 --- /dev/null +++ b/src/solver/reduce/forward_min.cpp @@ -0,0 +1,156 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#include
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256 // used below as xlocalsize and as the AlignUp unit for the X grid size
+
+namespace miopen {
+
+namespace solver {
+
+namespace reduce {
+/// Multiplies all lengths in `ydims` and returns AlignUp(product, LOCAL_SIZE).
+size_t MinForward::XGridSize(std::vector ydims) const
+{
+    auto output_numel =
+        std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies());
+    return AlignUp(output_numel, LOCAL_SIZE); // presumably rounds up to a multiple of LOCAL_SIZE
+}
+/// Returns false when the X grid size exceeds GetImage3dMaxWidth(); true means "within the limit".
+/// \todo https://github.com/ROCm/MIOpen/pull/2583#discussion_r1437054128
+bool MinForward::OverMaxGridSize(const ExecutionContext& context,
+                                 const miopen::reduce::ProblemDescription& problem) const
+{
+    auto ydims = problem.GetYDesc().GetLengths();
+    if(XGridSize(ydims) > context.GetStream().GetImage3dMaxWidth())
+        return false;
+    return true;
+}
+/// True only when every problem predicate below holds and the grid-size check passes.
+bool MinForward::IsApplicable(const ExecutionContext& context,
+                              const miopen::reduce::ProblemDescription& problem) const
+{
+    if(!problem.IsValidDim())
+        return false;
+    if(!problem.IsValidLength())
+        return false;
+    if(!problem.IsAllPackedWithIndice())
+        return false;
+    if(!problem.IsNotLastDim())
+        return false;
+    if(!problem.IsLargeReduceSize())
+        return false;
+    if(!OverMaxGridSize(context, problem)) // false here means the grid would be too large
+        return false;
+    return true;
+}
+/// Builds one KernelInfo for "ExtremeFwdContiguous" (OP_TYPE Min) and the invoker that runs it.
+ConvSolution MinForward::GetSolution(const ExecutionContext&,
+                                     const miopen::reduce::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto dtype        = problem.GetXDesc().GetType();
+    auto input_dtype  = miopen::GetDataType(problem.GetXDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType());
+    auto indice_dtype = miopen::GetDataType(problem.GetIndiceDesc().GetType());
+    auto xdims        = problem.GetXDesc().GetLengths(); // NOTE(review): not referenced in this scope
+    auto ydims        = problem.GetYDesc().GetLengths();
+
+    {
+        size_t xlocalsize;
+        size_t xgridsize;
+        size_t ylocalsize = 1; // Y and Z extents stay 1: only the X dimension is sized
+        size_t ygridsize  = 1;
+        size_t zlocalsize = 1;
+        size_t zgridsize  = 1;
+
+        auto kernel = KernelInfo{};
+
+        kernel.kernel_file = "MIOpenReduceExtreme.cpp";
+        kernel.kernel_name = "ExtremeFwdContiguous";
+        xlocalsize         = LOCAL_SIZE;
+        xgridsize          = XGridSize(ydims);
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)},
+            {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"INDICE_TYPE", indice_dtype},
+            {"OP_TYPE", "ReduceExtremeOp_t::Min"},
+            {"MIOPEN_REDUCE_EXTREME_ARGMIN", MIOPEN_REDUCE_EXTREME_ARGMIN},
+            {"MIOPEN_REDUCE_EXTREME_ARGMAX", MIOPEN_REDUCE_EXTREME_ARGMAX},
+            {"MIOPEN_REDUCE_EXTREME_MIN", MIOPEN_REDUCE_EXTREME_MIN},
+            {"MIOPEN_REDUCE_EXTREME_MAX", MIOPEN_REDUCE_EXTREME_MAX}};
+
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        kernel.l_wk.push_back(xlocalsize);
+        kernel.l_wk.push_back(ylocalsize);
+        kernel.l_wk.push_back(zlocalsize);
+
+        kernel.g_wk.push_back(xgridsize);
+        kernel.g_wk.push_back(ygridsize);
+        kernel.g_wk.push_back(zgridsize);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [](const std::vector& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo();
+
+            auto xdims = params.xDesc->GetLengths();
+            auto ydims = params.yDesc->GetLengths();
+            auto dim   = params.dim;
+
+            int32_t reduce_size = static_cast(xdims[dim]); // length of the reduced dimension
+            auto output_numel =
+                std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies());
+
+            auto inner_size = std::accumulate( // product of the x lengths that follow `dim`
+                xdims.begin() + dim + 1, xdims.end(), 1ULL, std::multiplies());
+
+            kernel(params.x, params.y, params.indice, output_numel, reduce_size, inner_size);
+        };
+    };
+
+    return result;
+}
+
+} // namespace reduce
+
+} // namespace solver
+
+} // namespace miopen
diff --git
a/src/solver/reduce/forward_sum.cpp b/src/solver/reduce/forward_sum.cpp index be1b7cb9fe..987f30e6b5 100644 --- a/src/solver/reduce/forward_sum.cpp +++ b/src/solver/reduce/forward_sum.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -39,35 +40,6 @@ namespace solver { namespace reduce { -size_t get_reqd_work_item_cnt(const ExecutionContext& context) -{ - // At least 4 WGs per one CU - return static_cast(LOCAL_SIZE * context.GetStream().GetMaxComputeUnits() * 4); -} - -size_t get_reqd_work_item_cnt(const Handle& handle) -{ - // At least 4 WGs per one CU - return static_cast(LOCAL_SIZE * handle.GetMaxComputeUnits() * 4); -} - -size_t get_parallelism_size(size_t reqd_work_item_cnt, size_t output_numel, size_t reduce_size) -{ - size_t parallelism_size = 1ULL; - while(parallelism_size * output_numel < reqd_work_item_cnt && - parallelism_size < std::sqrt(reduce_size)) - { - parallelism_size *= 2ULL; - } - return parallelism_size; -} - -bool is_parallelism(size_t reqd_work_item_cnt, size_t output_numel, size_t reduce_size) -{ - return !(output_numel > reqd_work_item_cnt) && - (output_numel * reduce_size > reqd_work_item_cnt); -} - bool IsImprovementOverROCm(const ExecutionContext& context, const miopen::reduce::ProblemDescription& problem) { @@ -101,9 +73,9 @@ bool SumForward::IsApplicable(const ExecutionContext& context, { if(!problem.IsSameType()) return false; - if(!problem.IsRightDim()) + if(!problem.IsValidDim()) return false; - if(!problem.IsRightLength()) + if(!problem.IsValidLength()) return false; if(!problem.IsAllPacked()) return false; diff --git a/src/t5layernorm.cpp b/src/t5layernorm.cpp new file mode 100644 index 0000000000..680270c4b0 --- /dev/null +++ b/src/t5layernorm.cpp @@ -0,0 +1,138 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace miopen {
+/// Packs the forward arguments into T5InvokeParams and runs them through the solver container.
+miopenStatus_t T5LayerNormForward(Handle& handle,
+                                  const TensorDescriptor& xDesc,
+                                  ConstData_t x,
+                                  const TensorDescriptor& weightDesc,
+                                  ConstData_t weight,
+                                  const TensorDescriptor& yDesc,
+                                  Data_t y,
+                                  const TensorDescriptor& rstdDesc,
+                                  Data_t rstd,
+                                  miopenNormMode_t mode,
+                                  float epsilon)
+{
+    const auto problem =
+        layernorm::ProblemDescription{mode, xDesc, weightDesc, yDesc, rstdDesc, epsilon};
+
+    const auto invoke_params = [&]() {
+        auto tmp    = layernorm::T5InvokeParams{};
+        tmp.type    = InvokeType::Run;
+        tmp.xDesc   = &xDesc; // only the x descriptor is forwarded to the invoker
+        tmp.x       = x;
+        tmp.weight  = weight;
+        tmp.y       = y;
+        tmp.rstd    = rstd;
+        tmp.epsilon = epsilon;
+        tmp.mode    = mode;
+        return tmp;
+    }();
+
+    const auto algo    = AlgorithmName{"T5LayerNormForward"};
+    const auto solvers = solver::SolverContainer{};
+
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess; // reached only if ExecutePrimitive returns normally
+}
+/// Returns the first workspace size reported by the solver container, or static_cast(-1) if none.
+std::size_t GetT5LayerNormBackwardWorkspaceSize(Handle& handle,
+                                                const TensorDescriptor& dyDesc,
+                                                const TensorDescriptor& xDesc,
+                                                const TensorDescriptor& weightDesc,
+                                                const TensorDescriptor& rstdDesc,
+                                                const TensorDescriptor& dxDesc,
+                                                const TensorDescriptor& dwDesc,
+                                                miopenNormMode_t mode)
+{
+    auto ctx = ExecutionContext{&handle};
+    const auto problem =
+        layernorm::ProblemDescription{mode, dyDesc, xDesc, weightDesc, rstdDesc, dxDesc, dwDesc};
+
+    const auto algo    = AlgorithmName{"T5LayerNormBackward"}; // NOTE(review): unused in this function
+    const auto solvers = solver::SolverContainer{};
+
+    auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem);
+
+    return pair_size_vector.empty() ? static_cast(-1) : pair_size_vector.front().second;
+}
+/// Packs the backward arguments into T5BwdInvokeParams and runs them through the solver container.
+miopenStatus_t T5LayerNormBackward(Handle& handle,
+                                   Data_t workspace,
+                                   size_t workspaceSizeInBytes,
+                                   const TensorDescriptor& dyDesc,
+                                   ConstData_t dy,
+                                   const TensorDescriptor& xDesc,
+                                   ConstData_t x,
+                                   const TensorDescriptor& weightDesc,
+                                   ConstData_t weight,
+                                   const TensorDescriptor& rstdDesc,
+                                   ConstData_t rstd,
+                                   const TensorDescriptor& dxDesc,
+                                   Data_t dx,
+                                   const TensorDescriptor& dwDesc,
+                                   Data_t dw,
+                                   miopenNormMode_t mode)
+{
+    const auto problem =
+        layernorm::ProblemDescription{mode, dyDesc, xDesc, weightDesc, rstdDesc, dxDesc, dwDesc};
+
+    const auto invoke_params = [&]() {
+        auto tmp           = layernorm::T5BwdInvokeParams{};
+        tmp.type           = InvokeType::Run;
+        tmp.dyDesc         = &dyDesc; // only the dy descriptor is forwarded to the invoker
+        tmp.workspace      = workspace;
+        tmp.workspace_size = workspaceSizeInBytes;
+        tmp.dy             = dy;
+        tmp.x              = x;
+        tmp.weight         = weight;
+        tmp.rstd           = rstd;
+        tmp.dx             = dx;
+        tmp.dw             = dw;
+        tmp.mode           = mode;
+        return tmp;
+    }();
+
+    const auto algo    = AlgorithmName{"T5LayerNormBackward"};
+    const auto solvers = solver::SolverContainer{};
+
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess; // reached only if ExecutePrimitive returns normally
+}
+
+} // namespace miopen
diff --git a/src/t5layernorm_api.cpp b/src/t5layernorm_api.cpp
new file mode 100644
index 0000000000..906d39b974
--- /dev/null
+++ b/src/t5layernorm_api.cpp
@@ -0,0 +1,188 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +static void +LogCmdT5LayerNorm(const miopenTensorDescriptor_t Desc, const miopenNormMode_t mode, bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(Desc).GetType(); + if(dtype == miopenHalf) + { + ss << "t5layernormfp16"; + } + else if(dtype == miopenFloat) + { + ss << "t5layernormfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "t5layernormbfp16"; + } + + int32_t size = {0}; + miopenGetTensorDescriptorSize(Desc, &size); + ss << " -n " << miopen::deref(Desc).GetLengths()[0] << " -c " + << miopen::deref(Desc).GetLengths()[1]; + if(size == 5) + { + ss << " -D " << miopen::deref(Desc).GetLengths()[2] << " -H " + << miopen::deref(Desc).GetLengths()[3] << " -W " + << miopen::deref(Desc).GetLengths()[4]; + } + else if(size == 4) + { + ss << " -H " << miopen::deref(Desc).GetLengths()[2] << " -W " + << miopen::deref(Desc).GetLengths()[3]; + } + else if(size == 3) + { + ss << " -W " << miopen::deref(Desc).GetLengths()[2]; + } + + ss << " -F " << ((is_fwd) ? 
"1" : "2") << " -m " << mode; + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenT5LayerNormForward(miopenHandle_t handle, + miopenNormMode_t mode, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const float epsilon, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t rstdDesc, + void* rstd) +{ + MIOPEN_LOG_FUNCTION( + handle, mode, xDesc, x, weightDesc, weight, epsilon, yDesc, y, rstdDesc, rstd); + + LogCmdT5LayerNorm(xDesc, mode, true); + return miopen::try_([&] { + miopen::T5LayerNormForward(miopen::deref(handle), + miopen::deref(xDesc), + DataCast(x), + miopen::deref(weightDesc), + DataCast(weight), + miopen::deref(yDesc), + DataCast(y), + miopen::deref(rstdDesc), + DataCast(rstd), + mode, + epsilon); + }); +} + +extern "C" miopenStatus_t +miopenGetT5LayerNormBackwardWorkspaceSize(miopenHandle_t handle, + miopenNormMode_t mode, + const miopenTensorDescriptor_t dyDesc, + const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t weightDesc, + const miopenTensorDescriptor_t rstdDesc, + const miopenTensorDescriptor_t dxDesc, + const miopenTensorDescriptor_t dwDesc, + size_t* sizeInBytes) +{ + MIOPEN_LOG_FUNCTION(handle, mode, dyDesc, xDesc, weightDesc, rstdDesc, dxDesc, dwDesc); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = + miopen::GetT5LayerNormBackwardWorkspaceSize(miopen::deref(handle), + miopen::deref(dyDesc), + miopen::deref(xDesc), + miopen::deref(weightDesc), + miopen::deref(rstdDesc), + miopen::deref(dxDesc), + miopen::deref(dwDesc), + mode); + }); +}; + +extern "C" miopenStatus_t miopenT5LayerNormBackward(miopenHandle_t handle, + miopenNormMode_t mode, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const 
miopenTensorDescriptor_t rstdDesc, + const void* rstd, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t dwDesc, + void* dw) +{ + MIOPEN_LOG_FUNCTION(handle, + mode, + workspace, + workspaceSizeInBytes, + dyDesc, + dy, + xDesc, + x, + weightDesc, + weight, + rstdDesc, + rstd, + dxDesc, + dx, + dw); + + LogCmdT5LayerNorm(dyDesc, mode, true); + return miopen::try_([&] { + miopen::T5LayerNormBackward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(dyDesc), + DataCast(dy), + miopen::deref(xDesc), + DataCast(x), + miopen::deref(weightDesc), + DataCast(weight), + miopen::deref(rstdDesc), + DataCast(rstd), + miopen::deref(dxDesc), + DataCast(dx), + miopen::deref(dwDesc), + DataCast(dw), + mode); + }); +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ff3e244b5f..4034a61619 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -306,7 +306,7 @@ function(add_test_command NAME EXE) add_test(NAME ${NAME} COMMAND echo skipped) set_tests_properties(${NAME} PROPERTIES DISABLED On) elseif(WIN32) - add_test(NAME ${NAME} COMMAND $ ${ARGN}) + add_test(NAME ${NAME} COMMAND $ ${ARGN} WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/${DATABASE_INSTALL_DIR}") else() if(MIOPEN_TEST_GDB) file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/test_${NAME}.cmake" diff --git a/test/cpu_adam.hpp b/test/cpu_adam.hpp new file mode 100644 index 0000000000..33a3b77fd3 --- /dev/null +++ b/test/cpu_adam.hpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_CPU_ADAM_HPP +#define GUARD_CPU_ADAM_HPP + +#include "tensor_holder.hpp" + +/** + * CPU reference for the Adam update: for every element of params it replays + * step_count update steps and writes the final value back into params. + * + * Returns without touching params when is_amp and found_inf are both set. + * Per step the gradient is negated when maximize is set, divided by grad_scale + * when is_amp is set, and has param * weight_decay added when weight_decay is + * non-zero. The first and second moment estimates are then updated with + * beta1/beta2 and the parameter moves by (lr / bias_correction1) * exp_avg / denom. + * With amsgrad the denominator uses the running maximum of the second moment. + * + * Only params is written; exp_avgs, exp_avg_sqs and max_exp_avg_sqs supply the + * initial state and the locally updated moments are not stored back. + * max_exp_avg_sqs is only indexed when amsgrad is set. + * + * NOTE(review): grad_scale is an int32_t, so a fractional scale passed by a + * caller would be truncated - confirm intent. + * NOTE(review): the template parameter list is not visible in this text; + * T1 is used for the per-element working values. + */ +template +void cpu_adam(tensor& params, + tensor& grads, + tensor& exp_avgs, + tensor& exp_avg_sqs, + tensor& max_exp_avg_sqs, + float lr, + float beta1, + float beta2, + float weight_decay, + float eps, + bool amsgrad, + bool maximize, + bool is_amp, + int32_t grad_scale, + bool found_inf, + int32_t step_count) +{ + if(is_amp && found_inf) + return; + + par_ford(params.GetSize())([&](int32_t i) { + T1 param = params[i]; + T1 exp_avg = exp_avgs[i]; + T1 exp_avg_sq = exp_avg_sqs[i]; + T1 max_exp_avg_sq = amsgrad ?
max_exp_avg_sqs[i] : 0; + + /* Replay every optimizer step for this element; step is 1-based because it is the exponent in the bias corrections. */ + for(int step = 1; step <= step_count; step++) + { + T1 grad = grads[i]; + if(maximize) + grad *= -1; + if(is_amp) + grad /= grad_scale; + + float bias_correction1 = 1 - pow(beta1, step); + float bias_correction2 = 1 - pow(beta2, step); + + if(weight_decay != 0) + grad += param * weight_decay; + + exp_avg = exp_avg * beta1 + grad * (1 - beta1); + exp_avg_sq = exp_avg_sq * beta2 + grad * grad * (1 - beta2); + + float denom = 0; + if(amsgrad) + { + if(exp_avg_sq > max_exp_avg_sq) + max_exp_avg_sq = exp_avg_sq; + + denom = sqrt(max_exp_avg_sq) / sqrt(bias_correction2) + eps; + } + else + { + denom = sqrt(exp_avg_sq) / sqrt(bias_correction2) + eps; + } + + param = param - (lr / bias_correction1) * exp_avg / denom; + } + + params[i] = param; + }); +} + +#endif diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index e944f29868..1125047331 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -28,11 +28,11 @@ function(add_gtest TEST_NAME TEST_CPP) if(NOT MIOPEN_EMBED_DB STREQUAL "") target_link_libraries(${TEST_NAME} $) endif() - if(NOT WIN32 AND MIOPEN_TEST_DISCRETE) # TODO: cannot run on Windows due to missing DLL dependencies + if(MIOPEN_TEST_DISCRETE) # Enable CMake to discover the test binary - gtest_discover_tests(${TEST_NAME} DISCOVERY_TIMEOUT 300 DISCOVERY_MODE PRE_TEST PROPERTIES ENVIRONMENT "MIOPEN_USER_DB_PATH=${CMAKE_CURRENT_BINARY_DIR};MIOPEN_TEST_FLOAT_ARG=${MIOPEN_TEST_FLOAT_ARG};MIOPEN_TEST_ALL=${MIOPEN_TEST_ALL};MIOPEN_TEST_MLIR=${MIOPEN_TEST_MLIR};MIOPEN_TEST_COMPOSABLEKERNEL=${MIOPEN_TEST_COMPOSABLEKERNEL};CODECOV_TEST=${CODECOV_TEST};MIOPEN_TEST_DBSYNC=${MIOPEN_TEST_DBSYNC};MIOPEN_TEST_CONV=${MIOPEN_TEST_CONV};MIOPEN_TEST_DEEPBENCH=${MIOPEN_TEST_DEEPBENCH};MIOPEN_DEBUG_TUNING_ITERATIONS_MAX=${MIOPEN_DEBUG_TUNING_ITERATIONS_MAX}") + gtest_discover_tests(${TEST_NAME} DISCOVERY_TIMEOUT 300 DISCOVERY_MODE PRE_TEST WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/${DATABASE_INSTALL_DIR} PROPERTIES
ENVIRONMENT "MIOPEN_USER_DB_PATH=${CMAKE_CURRENT_BINARY_DIR};MIOPEN_TEST_FLOAT_ARG=${MIOPEN_TEST_FLOAT_ARG};MIOPEN_TEST_ALL=${MIOPEN_TEST_ALL};MIOPEN_TEST_MLIR=${MIOPEN_TEST_MLIR};MIOPEN_TEST_COMPOSABLEKERNEL=${MIOPEN_TEST_COMPOSABLEKERNEL};CODECOV_TEST=${CODECOV_TEST};MIOPEN_TEST_DBSYNC=${MIOPEN_TEST_DBSYNC};MIOPEN_TEST_CONV=${MIOPEN_TEST_CONV};MIOPEN_TEST_DEEPBENCH=${MIOPEN_TEST_DEEPBENCH};MIOPEN_DEBUG_TUNING_ITERATIONS_MAX=${MIOPEN_DEBUG_TUNING_ITERATIONS_MAX}") endif() - target_link_libraries(${TEST_NAME} BZip2::BZip2) + target_link_libraries(${TEST_NAME} BZip2::BZip2) if(WIN32) # Refer to https://en.cppreference.com/w/cpp/language/types for details. target_compile_options(${TEST_NAME} PRIVATE $:-U__LP64__>>) diff --git a/test/gtest/adam.cpp b/test/gtest/adam.cpp new file mode 100644 index 0000000000..f2c26aee6a --- /dev/null +++ b/test/gtest/adam.cpp @@ -0,0 +1,82 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include "adam.hpp" + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace adam { + +/** Returns the MIOPEN_TEST_FLOAT_ARG value read through miopen::GetStringEnv; an empty value yields an empty string. */ +std::string GetFloatArg() +{ + const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +/** Fixture type deriving from AdamTest. NOTE(review): the template argument list is not visible in this text - confirm against the original source. */ +struct AdamTestFloat : AdamTest +{ +}; + +/** Second fixture type deriving from AdamTest; the name suggests the AMP variant. NOTE(review): template arguments are not visible here - confirm. */ +struct AmpAdamTestFloat : AdamTest +{ +}; + +} // namespace adam +using namespace adam; + +/** Runs RunTest() and Verify() only when MIOPEN_TEST_ALL is enabled and GetFloatArg() equals "--float"; otherwise the test is skipped. */ +TEST_P(AdamTestFloat, AdamTestFw) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +/** Same gating as AdamTestFw, applied to the AmpAdamTestFloat fixture. */ +TEST_P(AmpAdamTestFloat, AmpAdamTestFw) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(AdamTestSet, AdamTestFloat, testing::ValuesIn(AdamTestConfigs())); +INSTANTIATE_TEST_SUITE_P(AdamTestSet, AmpAdamTestFloat, testing::ValuesIn(AdamTestConfigs())); diff --git a/test/gtest/adam.hpp b/test/gtest/adam.hpp new file mode 100644 index 0000000000..152bc7a789 --- /dev/null +++ b/test/gtest/adam.hpp @@ -0,0 +1,289 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#define MIOPEN_BETA_API 1 +#include "../driver/tensor_driver.hpp" +#include "cpu_adam.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include + +/** One Adam test configuration: tensor dimensions plus the optimizer hyper-parameters and mode flags. */ +struct AdamTestCase +{ + std::vector input; + float lr; + float beta1; + float beta2; + float weight_decay; + float eps; + bool amsgrad; + bool maximize; + bool use_step_tensor; + + /* Prints the dimensions joined by "x" followed by the hyper-parameters; use_step_tensor is not printed. input[0] is read unconditionally, so input must be non-empty. */ + friend std::ostream& operator<<(std::ostream& os, const AdamTestCase& tc) + { + os << " input:" << tc.input[0]; + for(int i = 1; i < tc.input.size(); i++) + { + os << "x" << tc.input[i]; + } + return os << " lr:" << tc.lr << " beta1:" << tc.beta1 << " beta2:" << tc.beta2 + << " weight_decay:" << tc.weight_decay << " eps:" << tc.eps + << " amsgrad:" << tc.amsgrad << " maximize:" << tc.maximize; + } + + const std::vector& GetInput() { return input; } +}; + +/** Builds the parameter list: each base shape is emitted eight times, once per combination of use_step_tensor, amsgrad and maximize. */ +std::vector AdamTestConfigs() +{ // dim, dims + // clang-format off + std::vector base_shape{ + {{1}, 0.001, 0.9, 0.999, 0, 0.000001, false, false, false}, + {{2}, 0.001, 0.9, 0.999, 0, 0.000001, false, false, false}, + {{255}, 0.001, 0.9, 0.999, 0.0005, 0.000001, false, false, false}, + {{1024}, 0.001, 0.9, 0.999, 1e-08, 0.000001, false, false, false}, + {{32317}, 0.001, 0.9, 0.999, 0, 0.000001, false, false, false}, + {{50000}, 0.001, 0.9, 0.999, 0, 0.000001, false, false, false}, + {{29,1024}, 0.001, 0.9, 0.999, 0, 0.000001, false, false, false}, + {{80,1536}, 0.001, 0.9, 0.999, 0, 0.000001, false, false, false}, + {{128,1024}, 0.001, 0.9, 0.999, 0, 0.000001, false, false, false}, + {{3706,32}, 0.001, 0.9, 0.999, 0, 0.000001, false, false, false}, + {{32,1,41,11}, 0.001, 0.9, 0.999, 0, 0.000001, false, false, false}, + {{32,64,3,3}, 0.001, 0.9, 0.999, 0.005, 0.000001, false, false, false}, + {{64,256,3,3}, 0.001, 0.9, 0.999, 0.005, 0.000001, false, false, false}, + {{128,192,1,1}, 0.001, 0.9, 0.999, 0.0005, 0.000001, false,
false, false}, + {{128,1024,1,1}, 0.001, 0.9, 0.999, 0.005, 0.000001, false, false, false}, + {{192,192,3,3}, 0.001, 0.9, 0.999, 0.0005, 0.000001, false, false, false}, + {{255,640,1,1}, 0.001, 0.9, 0.999, 0.0005, 0.000001, false, false, false}, + {{256,512,3,3}, 0.001, 0.9, 0.999, 0.005, 0.000001, false, false, false}}; + // clang-format on + std::vector result; + result.reserve(base_shape.size() * 8); + + for(auto& item : base_shape) + { + for(int i = 0; i <= 1; ++i) + { + for(int j = 0; j <= 1; ++j) + { + for(int k = 0; k <= 1; ++k) + { + item.use_step_tensor = static_cast(i); + item.amsgrad = static_cast(j); + item.maximize = static_cast(k); + result.push_back(item); + } + } + } + } + return result; +} + +/** Parameterized fixture that runs miopen::Adam on the device and compares the resulting parameters with the cpu_adam reference. NOTE(review): the template parameter list is not visible in this text; is_amp is used below without a visible declaration and is presumably one of those parameters - confirm. */ +template +struct AdamTest : public ::testing::TestWithParam +{ +protected: + /* Copies the hyper-parameters from the test parameter, generates the param/grad host tensors, zero-fills the moment tensors and uploads them. max_exp_avg_sq, step, the fp16 parameter copy, grad_scale and found_inf are only uploaded when the corresponding mode is active. */ + void SetUp() override + { + auto&& handle = get_handle(); + adam_config = GetParam(); + auto gen_value = [](auto...) { return prng::gen_descreet_unsigned(1e-2, 100); }; + auto gen_zero = [](auto...) { return 0; }; + auto dims = adam_config.GetInput(); + + lr = adam_config.lr; + beta1 = adam_config.beta1; + beta2 = adam_config.beta2; + weight_decay = adam_config.weight_decay; + eps = adam_config.eps; + amsgrad = adam_config.amsgrad; + maximize = adam_config.maximize; + use_step_tensor = adam_config.use_step_tensor; + + param = tensor{dims}.generate(gen_value); + grad = tensor{dims}.generate(gen_value); + exp_avg = tensor{dims}.generate(gen_zero); + exp_avg_sq = tensor{dims}.generate(gen_zero); + ref_param = tensor{param}; + + param_dev = handle.Write(param.data); + grad_dev = handle.Write(grad.data); + exp_avg_dev = handle.Write(exp_avg.data); + exp_avg_sq_dev = handle.Write(exp_avg_sq.data); + + if(amsgrad) + { + max_exp_avg_sq = tensor{dims}.generate(gen_zero); + max_exp_avg_sq_dev = handle.Write(max_exp_avg_sq.data); + } + + if(use_step_tensor) + { + step[0] = 0; + step_dev = handle.Write(step.data); + } + + if(is_amp) + { + param_fp16 = tensor{dims}; + std::fill(param_fp16.begin(),
+ param_fp16.end(), + std::numeric_limits::quiet_NaN()); + param_fp16_dev = handle.Write(param_fp16.data); + + grad_scale[0] = 1024; + found_inf[0] = 0; + + grad_scale_dev = handle.Write(grad_scale.data); + found_inf_dev = handle.Write(found_inf.data); + } + else + { + grad_scale[0] = 1.0f; + found_inf[0] = 0; + } + } + + /* Computes the reference with one cpu_adam call covering step_count steps, then calls miopen::Adam once per step with the 1-based step index, and finally reads param (and param_fp16 when is_amp) back from the device. The same device buffer is passed for each input/output pair, so the update happens in place. */ + void RunTest() + { + const miopen::TensorDescriptor emptyDesc; + auto&& handle = get_handle(); + + cpu_adam(ref_param, + grad, + exp_avg, + exp_avg_sq, + max_exp_avg_sq, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + is_amp, + grad_scale[0], + found_inf[0], + step_count); + + for(uint32_t i = 1; i <= step_count; i++) + { + auto status = miopen::Adam(handle, + param.desc, + param_dev.get(), + param.desc, + param_dev.get(), + param_fp16.desc, + param_fp16_dev.get(), + grad.desc, + grad_dev.get(), + exp_avg.desc, + exp_avg_dev.get(), + exp_avg.desc, + exp_avg_dev.get(), + exp_avg_sq.desc, + exp_avg_sq_dev.get(), + exp_avg_sq.desc, + exp_avg_sq_dev.get(), + max_exp_avg_sq.desc, + max_exp_avg_sq_dev.get(), + max_exp_avg_sq.desc, + max_exp_avg_sq_dev.get(), + grad_scale.desc, + grad_scale_dev.get(), + found_inf.desc, + found_inf_dev.get(), + use_step_tensor ?
step.desc : emptyDesc, + step_dev.get(), + use_step_tensor ? step.desc : emptyDesc, + step_dev.get(), + i, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + false, // adamw + is_amp); + + EXPECT_EQ(status, miopenStatusSuccess); + } + + param.data = handle.Read(param_dev, param.data.size()); + + if(is_amp) + param_fp16.data = handle.Read(param_fp16_dev, param_fp16.data.size()); + } + + /* Expects the RMS error between ref_param and param to be below ten times the epsilon-derived threshold, and that both ranges have the same length. param_fp16 is read back in RunTest but not checked here. */ + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + auto error = miopen::rms_range(ref_param, param); + + EXPECT_TRUE(miopen::range_distance(ref_param) == miopen::range_distance(param)); + EXPECT_TRUE(error < threshold * 10) << "Error output beyond tolerance Error:" << error + << ", Thresholdx10: " << threshold * 10; + } + + AdamTestCase adam_config; + + tensor param; + tensor param_fp16; + tensor ref_param; + tensor grad; + tensor exp_avg; + tensor exp_avg_sq; + tensor max_exp_avg_sq; + tensor step{1}; + tensor found_inf{1}; + tensor grad_scale{1}; + + miopen::Allocator::ManageDataPtr param_dev; + miopen::Allocator::ManageDataPtr param_fp16_dev; + miopen::Allocator::ManageDataPtr grad_dev; + miopen::Allocator::ManageDataPtr exp_avg_dev; + miopen::Allocator::ManageDataPtr exp_avg_sq_dev; + miopen::Allocator::ManageDataPtr max_exp_avg_sq_dev; + miopen::Allocator::ManageDataPtr step_dev; + miopen::Allocator::ManageDataPtr found_inf_dev; + miopen::Allocator::ManageDataPtr grad_scale_dev; + + float lr = 0.0f; + float beta1 = 0.0f; + float beta2 = 0.0f; + float weight_decay = 0.0f; + float eps = 0.0f; + bool amsgrad = false; + bool maximize = false; + bool use_step_tensor = false; + int32_t step_count = 5; +}; diff --git a/test/gtest/addlayernorm.cpp b/test/gtest/addlayernorm.cpp new file mode 100644 index 0000000000..05b6fbdf66 --- /dev/null +++ b/test/gtest/addlayernorm.cpp @@ -0,0 +1,110 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "addlayernorm.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace addlayernorm { + +/** Returns the MIOPEN_TEST_FLOAT_ARG value read through miopen::GetStringEnv; an empty value yields an empty string. */ +std::string GetFloatArg() +{ + const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +/** Fixture types deriving from AddLayerNormTest, one per data type named in the struct name. NOTE(review): the template arguments are not visible in this text - confirm against the original source. */ +struct AddLayerNormTestFloat : AddLayerNormTest +{ +}; + +struct AddLayerNormTestHalf : AddLayerNormTest +{ +}; + +struct AddLayerNormTestBFloat16 : AddLayerNormTest +{ +}; + +} // namespace addlayernorm +using namespace addlayernorm; + +/** Runs RunTest() and Verify() only when MIOPEN_TEST_ALL is enabled and GetFloatArg() equals "--float"; otherwise the test is skipped. NOTE(review): TypeArg is assigned but never read in this and the two tests below. */ +TEST_P(AddLayerNormTestFloat, AddLayerNormTestFw) +{ + auto TypeArg = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +/** Same gating as above, keyed on "--half". */ +TEST_P(AddLayerNormTestHalf, AddLayerNormTestFw) +{ + auto TypeArg = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +/** Same gating as above, keyed on "--bfloat16". */ +TEST_P(AddLayerNormTestBFloat16, AddLayerNormTestFw) +{ + auto TypeArg = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(AddLayerNormTestSet, + AddLayerNormTestFloat, + testing::ValuesIn(AddLayerNormTestConfigs())); +INSTANTIATE_TEST_SUITE_P(AddLayerNormTestSet, + AddLayerNormTestHalf, + testing::ValuesIn(AddLayerNormTestConfigs())); +INSTANTIATE_TEST_SUITE_P(AddLayerNormTestSet, + AddLayerNormTestBFloat16, + testing::ValuesIn(AddLayerNormTestConfigs())); diff --git a/test/gtest/addlayernorm.hpp b/test/gtest/addlayernorm.hpp new file mode 100644 index 0000000000..0be011e683 --- /dev/null +++ b/test/gtest/addlayernorm.hpp
@@ -0,0 +1,359 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "../driver/tensor_driver.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include + +/** + * CPU reference for fused add + layer normalization. + * + * The lengths before index dim are multiplied into outer_size and the remaining + * lengths into inner_size. For each outer index the function accumulates the sum + * and the sum of squares of (input + input2), derives mean and variance + * (E[x^2] - mean^2), computes rstd = 1 / sqrt(var + eps), stores mean and rstd in + * ref_mean / ref_rstd, and writes (input + input2 - mean) * rstd * weight + bias + * to ref_output. weight and bias are read only when mode is non-zero; otherwise + * 1 and 0 are used. + * + * NOTE(review): input, input2, weight and bias are taken by value, so each call + * copies them - confirm whether that is intended. + */ +template +void cpu_addlayernorm_forward(tensor input, + tensor input2, + tensor weight, + tensor bias, + tensor& ref_output, + tensor& ref_mean, + tensor& ref_rstd, + float eps, + int32_t dim, + miopenNormMode_t mode) +{ + auto dims = input.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + size_t i = 0; + for(; i < dim; i++) + { + outer_size *= dims[i]; + } + + for(; i < dims.size(); i++) + { + inner_size *= dims[i]; + } + + par_ford(outer_size)([&](int32_t o) { + float mean_v = 0; + float var_v = 0; + + ford(inner_size)([&](int32_t i) { + float tmp = static_cast(input[o * inner_size + i]) + + static_cast(input2[o * inner_size + i]); + mean_v += tmp; + var_v += tmp * tmp; + }); + + mean_v = mean_v / inner_size; + var_v = var_v / inner_size - mean_v * mean_v; + float rstd_v = 1 / sqrt(var_v + eps); + + ref_mean[o] = static_cast(mean_v); + ref_rstd[o] = static_cast(rstd_v); + + ford(inner_size)([&](int32_t i) { + float weight_v = mode ? static_cast(weight[i]) : 1; + float bias_v = mode ?
static_cast(bias[i]) : 0; + ref_output[o * inner_size + i] = + static_cast((static_cast(input[o * inner_size + i]) + + static_cast(input2[o * inner_size + i]) - mean_v) * + rstd_v * weight_v + + bias_v); + }); + }); +} + +/** One add-layernorm test configuration: up to five tensor lengths, the first normalized dimension, epsilon and the norm mode. */ +struct AddLayerNormTestCase +{ + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + size_t nomalized_dim; + float eps; + miopenNormMode_t ln_mode; + friend std::ostream& operator<<(std::ostream& os, const AddLayerNormTestCase& tc) + { + return os << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H + << " W:" << tc.W << " dim:" << tc.nomalized_dim << " eps:" << tc.eps + << " LayerNorm_mode:" << tc.ln_mode; + } + + /* Builds the tensor lengths from the non-zero fields: {N,C,D,H,W}, {N,C,H,W}, {N,C,W} or {N,W}. Any other combination prints an error and returns {0}. */ + std::vector GetInput() + { + if((N != 0) && (C != 0) && (D != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, D, H, W}); + } + else if((N != 0) && (C != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, H, W}); + } + else if((N != 0) && (C != 0) && (W != 0)) + { + return std::vector({N, C, W}); + } + else if((N != 0) && (W != 0)) + { + return std::vector({N, W}); + } + else + { + std::cout << "Error Input Tensor Lengths\n" << std::endl; + return std::vector({0}); + } + } +}; + +/** Returns the fixed list of test configurations: the same 5-D shapes for both fused-add norm modes, followed by 4-D and 2-D shapes (zero-valued lengths are dropped by GetInput). */ +std::vector AddLayerNormTestConfigs() +{ // n c d h w nomalized_dim eps ln_mode + // clang-format off + return { + { 32, 1, 32, 32, 32 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // 32x32x32 based on VoxNet arch + { 32, 1, 14, 14, 14 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 32, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 32, 32, 12, 12, 12 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 32, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 256, 1, 32, 32, 32 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // 32x32x32 based on VoxNet arch + { 256, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 256, 32, 12, 12, 12 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 256, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 512,
1, 32, 32, 32 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // 32x32x32 based on VoxNet arch + { 512, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 512, 32, 12, 12, 12 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 512, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 32, 2, 32, 57, 125 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { 32, 32, 14, 25, 59 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 32, 32, 6, 10, 27 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 32, 32, 4, 6, 11 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 32, 32, 2, 2, 3 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 32, 32, 32, 28, 62 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { 32, 32, 14, 12, 29 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 32, 32, 6, 4, 12 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 32, 32, 4, 2, 2 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + { 16, 32, 6, 50, 50 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // Multi-view 3D convnet + { 1, 3, 8, 240, 320 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // 3D convet on video + { 1, 3, 16, 240, 320 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // 3D convet on video + { 1, 3, 8, 128, 171 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // 3D convet on video + { 1, 3, 16, 128, 171 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // 3D convet on video + { 1, 3, 8, 112, 112 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // 3D convet on video + { 1, 3, 16, 112, 112 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, // 3D convet on video + { 32, 1, 32, 32, 32 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // 32x32x32 based on VoxNet arch + { 32, 1, 14, 14, 14 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 32, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 32, 32, 12, 12, 12 , 4, 1e-5,
MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 32, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 256, 1, 32, 32, 32 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // 32x32x32 based on VoxNet arch + { 256, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 256, 32, 12, 12, 12 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 256, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 512, 1, 32, 32, 32 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // 32x32x32 based on VoxNet arch + { 512, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 512, 32, 12, 12, 12 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 512, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 32, 2, 32, 57, 125 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { 32, 32, 14, 25, 59 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 32, 32, 6, 10, 27 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 32, 32, 4, 6, 11 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 32, 32, 2, 2, 3 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 32, 32, 32, 28, 62 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { 32, 32, 14, 12, 29 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 32, 32, 6, 4, 12 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 32, 32, 4, 2, 2 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + { 16, 32, 6, 50, 50 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // Multi-view 3D convnet + { 1, 3, 8, 240, 320 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // 3D convet on video + { 1, 3, 16, 240, 320 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // 3D convet on video + { 1, 3, 8, 128, 171 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // 3D convet on video + { 1, 3, 16, 128, 171 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // 3D convet on video + { 1, 3, 8, 112, 112 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // 3D convet on video + { 1, 3, 16, 112, 112 , 4, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, // 3D convet on video + {32, 4, 0, 4,
256 , 1, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + {64, 4, 0, 4, 256 , 1, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + {32, 4, 0, 4, 256 , 1, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + {64, 4, 0, 4, 256 , 1, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + {32, 0, 0, 0, 256 , 1, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + {64, 0, 0, 0, 256 , 1, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD}, + {32, 0, 0, 0, 256 , 1, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD}, + {64, 0, 0, 0, 256 , 1, 1e-5, MIOPEN_WEIGHT_BIAS_FUSED_ADD} + }; + // clang-format on +} + +/** Parameterized fixture that runs miopen::AddLayerNormForward on the device and compares y, mean and rstd with the cpu_addlayernorm_forward reference. NOTE(review): the template parameter list is not visible in this text. */ +template +struct AddLayerNormTest : public ::testing::TestWithParam +{ +protected: + /* Generates x and x2, builds weight/bias over the normalized (trailing) lengths - constant 1 and 0 for MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD, generated values otherwise - sizes mean/rstd over the leading lengths, pre-fills every output and reference tensor with NaN, and uploads the buffers. */ + void SetUp() override + { + auto&& handle = get_handle(); + addlayernorm_config = GetParam(); + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + + nomalized_dim = addlayernorm_config.nomalized_dim; + eps = addlayernorm_config.eps; + ln_mode = addlayernorm_config.ln_mode; + + auto in_dim = addlayernorm_config.GetInput(); + + x = tensor{in_dim}.generate(gen_value); + x2 = tensor{in_dim}.generate(gen_value); + + std::vector inner_dim; + if(nomalized_dim == in_dim.size()) + inner_dim = {1}; + else + inner_dim = {in_dim.begin() + nomalized_dim, in_dim.end()}; + + if(ln_mode == MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD) + { + auto gen_one = [&](auto...) { return 1; }; + auto gen_zero = [&](auto...)
{ return 0; }; + weight = tensor{inner_dim}.generate(gen_one); + bias = tensor{inner_dim}.generate(gen_zero); + } + else + { + weight = tensor{inner_dim}.generate(gen_value); + bias = tensor{inner_dim}.generate(gen_value); + } + + std::vector outer_dim; + if(nomalized_dim == 0) + outer_dim = {1}; + else + outer_dim = {in_dim.begin(), in_dim.end() - (in_dim.size() - nomalized_dim)}; + + y = tensor{in_dim}; + mean = tensor{outer_dim}; + rstd = tensor{outer_dim}; + std::fill(y.begin(), y.end(), std::numeric_limits::quiet_NaN()); + std::fill(mean.begin(), mean.end(), std::numeric_limits::quiet_NaN()); + std::fill(rstd.begin(), rstd.end(), std::numeric_limits::quiet_NaN()); + + ref_y = tensor{in_dim}; + ref_mean = tensor{outer_dim}; + ref_rstd = tensor{outer_dim}; + std::fill(ref_y.begin(), ref_y.end(), std::numeric_limits::quiet_NaN()); + std::fill(ref_mean.begin(), ref_mean.end(), std::numeric_limits::quiet_NaN()); + std::fill(ref_rstd.begin(), ref_rstd.end(), std::numeric_limits::quiet_NaN()); + + x_dev = handle.Write(x.data); + x2_dev = handle.Write(x2.data); + weight_dev = handle.Write(weight.data); + bias_dev = handle.Write(bias.data); + y_dev = handle.Write(y.data); + mean_dev = handle.Write(mean.data); + rstd_dev = handle.Write(rstd.data); + } + /* Computes the CPU reference, runs miopen::AddLayerNormForward once, expects miopenStatusSuccess, and reads y, mean and rstd back from the device. */ + void RunTest() + { + auto&& handle = get_handle(); + + cpu_addlayernorm_forward( + x, x2, weight, bias, ref_y, ref_mean, ref_rstd, eps, nomalized_dim, ln_mode); + miopenStatus_t status; + + status = miopen::AddLayerNormForward(handle, + x.desc, + x_dev.get(), + x2.desc, + x2_dev.get(), + weight.desc, + weight_dev.get(), + bias.desc, + bias_dev.get(), + y.desc, + y_dev.get(), + mean.desc, + mean_dev.get(), + rstd.desc, + rstd_dev.get(), + ln_mode, + eps, + nomalized_dim); + EXPECT_EQ(status, miopenStatusSuccess); + + y.data = handle.Read(y_dev, y.data.size()); + mean.data = handle.Read(mean_dev, mean.data.size()); + rstd.data = handle.Read(rstd_dev, rstd.data.size()); + } + + /* Compares the RMS error of y, mean and rstd against a type-dependent threshold, scaled by 4 for y, 1 for mean and 16 for rstd. */ + void Verify() + { + // Computation error of
fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto threshold = std::is_same::value ? 1.5e-5 : 8.2e-2; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + threshold *= 8.0; + + auto error = miopen::rms_range(ref_y, y); + + EXPECT_TRUE(miopen::range_distance(ref_y) == miopen::range_distance(y)); + EXPECT_TRUE(error < threshold * 4) + << "Error y beyond tolerance Error:" << error << ", Threshold x 4: " << threshold * 4; + + error = miopen::rms_range(ref_mean, mean); + EXPECT_TRUE(miopen::range_distance(ref_mean) == miopen::range_distance(mean)); + EXPECT_TRUE(error < threshold) + << "Error mean beyond tolerance Error:" << error << ", Threshold: " << threshold; + + error = miopen::rms_range(ref_rstd, rstd); + EXPECT_TRUE(miopen::range_distance(ref_rstd) == miopen::range_distance(rstd)); + EXPECT_TRUE(error < threshold * 16) << "Error rstd beyond tolerance Error:" << error + << ", Threshold x 16: " << threshold * 16; + } + AddLayerNormTestCase addlayernorm_config; + + tensor x; + tensor x2; + tensor weight; + tensor bias; + tensor y; + tensor mean; + tensor rstd; + + tensor ref_y; + tensor ref_mean; + tensor ref_rstd; + + miopen::Allocator::ManageDataPtr x_dev; + miopen::Allocator::ManageDataPtr x2_dev; + miopen::Allocator::ManageDataPtr weight_dev; + miopen::Allocator::ManageDataPtr bias_dev; + miopen::Allocator::ManageDataPtr y_dev; + miopen::Allocator::ManageDataPtr mean_dev; + miopen::Allocator::ManageDataPtr rstd_dev; + + size_t nomalized_dim; + float eps; + miopenNormMode_t ln_mode; +}; diff --git a/test/gtest/gpu_mha_backward.cpp b/test/gtest/gpu_mha_backward.cpp new file mode 100644 index 0000000000..20398ea761 --- /dev/null +++ b/test/gtest/gpu_mha_backward.cpp @@ -0,0 +1,443 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "get_handle.hpp" +#include "mha_helper.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include "../workspace.hpp" +#include "../tensor_util.hpp" + +#include +#include + +#include + +#include +#include +#include +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +using namespace miopen; + +namespace { +inline bool CheckFloatArg(std::string_view arg) +{ + const std::string& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + return tmp.empty() || tmp == arg; +} + +struct TensorStruct +{ + template + TensorStruct(tensor&& val) : m_cpu_tensor(std::move(val)) + { + } + + TensorStruct(const TensorStruct&) = delete; + TensorStruct& operator=(const TensorStruct&) = delete; + + ~TensorStruct() = default; + + std::variant, tensor> m_cpu_tensor; + Allocator::ManageDataPtr m_gpu_buffer; +}; + +struct TestCase +{ + size_t n; + size_t h; + size_t s; + size_t d; + float dropout; +}; + +inline std::vector GetSmokeTestCases() +{ + if(CheckFloatArg("--float")) + { + return { + {9, 8, 8, 8, 0.0f}, + {1, 2, 4, 5, 0.0f}, + {2, 1, 5, 4, 0.0f}, + {4, 2, 1, 3, 0.0f}, + {5, 3, 4, 1, 0.0f}, + {1, 2, 65, 5, 0.0f}, + {2, 1, 67, 4, 0.0f}, + {8, 7, 68, 1, 0.0f}, + {1, 2, 257, 5, 0.0f}, + {2, 1, 259, 4, 0.0f}, + {8, 7, 270, 1, 0.0f}, + {1, 1, 1, 1, 0.0f}, + {3, 5, 32, 7, 0.8f}, + {2, 2, 64, 128, 0.8f}, + {2, 1, 128, 4, 0.8f}, + {2, 7, 256, 31, 0.8f}, + }; + } + else + { + return {}; + } +} + +inline std::vector GetFullTestCases() +{ + if(miopen::IsDisabled(ENV(MIOPEN_TEST_ALL))) + return {}; + + if(CheckFloatArg("--float")) + { + return { + {3, 15, 2047, 15, 0.0f}, + {2049, 17, 7, 7, 0.0f}, + {3, 3, 257, 91, 0.0f}, + {11, 150, 255, 31, 0.0f}, + {9, 3, 129, 1023, 0.0f}, + {3, 15, 31, 2047, 0.0f}, + {2049, 17, 32, 7, 0.2f}, + {11, 150, 256, 31, 0.4f}, + }; + } + else + { + return {}; + } +} +} // namespace + +class 
Test_Bwd_Mha : public testing::TestWithParam +{ +protected: + void SetUp() override + { + prng::reset_seed(); + auto [n, h, s, d, drop] = GetParam(); + Handle& handle = get_handle(); + + if((drop > 0.0f)) + { + GTEST_SKIP() << "CPU Dropout for backward pass currently is not supprorted"; + } + + if((drop > 0.0f) && (s % handle.GetWavefrontWidth() != 0)) + { + GTEST_SKIP() << "CPU Dropout currently supprorts only fully occupied warps"; + } + + mha_descriptor.SetParams(1); + ASSERT_EQ(miopenCreateMhaProblem(&problem, &mha_descriptor, miopenProblemDirectionBackward), + miopenStatusSuccess); + + auto InitTensor = [this, &handle](miopenTensorArgumentId_t id, auto&& tensor) { + auto tmp = std::make_unique(std::move(tensor)); + std::visit( + [this, id, &handle, &gpu_buff = tmp->m_gpu_buffer](auto&& cpu_tensor) { + ASSERT_EQ(miopenSetProblemTensorDescriptor(problem, id, &cpu_tensor.desc), + miopenStatusSuccess); + + gpu_buff = handle.Write(cpu_tensor.data); + descVector.push_back(&(cpu_tensor.desc)); + }, + tmp->m_cpu_tensor); + + args.emplace_back(); + args.back().id = id; + // args.back().descriptor will be filled later + args.back().buffer = tmp->m_gpu_buffer.get(); + + // check that we don't try to create duplicates + ASSERT_EQ(tensors.count(id), 0); + + tensors[id] = std::move(tmp); + }; + + auto GenScaledTensor = [](auto... nhsd) { + auto val_full = tensor{nhsd...}; + val_full.for_each([&](auto... id) { + // backward pass is very sensible to input data due to possible subtraction of + // similar values and later significant error amplification + val_full(id...) 
= prng::gen_descreet_uniform_sign(4, 60); + }); + float scale = 0.5f; + float descale = 1.0f / scale; + return std::tuple{val_full, scale, descale}; + }; + + float q_scale; + float q_descale; + tensor q_val; + std::tie(q_val, q_scale, q_descale) = GenScaledTensor(n, h, s, d); + InitTensor(miopenTensorMhaQ, std::move(q_val)); + + float k_scale; + float k_descale; + tensor k_val; + std::tie(k_val, k_scale, k_descale) = GenScaledTensor(n, h, s, d); + InitTensor(miopenTensorMhaK, std::move(k_val)); + + float v_scale; + float v_descale; + tensor v_val; + std::tie(v_val, v_scale, v_descale) = GenScaledTensor(n, h, s, d); + InitTensor(miopenTensorMhaV, std::move(v_val)); + + float s_scale = 1.f; + // clang-tidy complains about the same expression on both sides of "/": 1.f / 1.f + float s_descale = 1.f; // / s_scale; + + float o_scale = 1.f; + // clang-tidy complains about the same expression on both sides of "/": 1.f / 1.f + float o_descale = 1.f; // / o_scale; + + InitTensor(miopenTensorMhaDescaleQ, + tensor{1, 1, 1, 1}.generate([=](auto...) { return q_descale; })); + InitTensor(miopenTensorMhaDescaleK, + tensor{1, 1, 1, 1}.generate([=](auto...) { return k_descale; })); + InitTensor(miopenTensorMhaDescaleV, + tensor{1, 1, 1, 1}.generate([=](auto...) { return v_descale; })); + InitTensor(miopenTensorMhaDescaleS, + tensor{1, 1, 1, 1}.generate([=](auto...) { return s_descale; })); + InitTensor(miopenTensorMhaScaleS, + tensor{1, 1, 1, 1}.generate([=](auto...) { return s_scale; })); + + InitTensor(miopenTensorMhaDropoutProbability, + tensor{1, 1, 1, 1}.generate([rate = drop](auto...) { return rate; })); + InitTensor(miopenTensorMhaDropoutSeed, + tensor{1, 1, 1, 2}.generate([](auto...) { return 0; })); + InitTensor(miopenTensorMhaDropoutOffset, + tensor{1, 1, 1, 2}.generate([](auto...) 
{ return 0; })); + + tensor softmax = tensor{n, h, s, s}; + tensor oDesc = tensor{n, h, s, d}; + tensor mDesc = tensor{n, h, s, 1}; + tensor zInvDesc = tensor{n, h, s, 1}; + float amaxS; + float amaxO; + + // proper O, M and zInv tensors are required for backward pass. + // randomly generated M and zInv may lead to nan\inf values + test::cpu::MultiHeadAttentionfp8( + std::get>(tensors[miopenTensorMhaQ]->m_cpu_tensor), + std::get>(tensors[miopenTensorMhaK]->m_cpu_tensor), + std::get>(tensors[miopenTensorMhaV]->m_cpu_tensor), + softmax, + mDesc, + zInvDesc, + q_descale, + k_descale, + v_descale, + s_descale, + s_scale, + o_scale, + drop, + 0, + 0, + amaxS, + amaxO, + oDesc); + + float dO_scale; + float dO_descale; + tensor dO_val; + std::tie(dO_val, dO_scale, dO_descale) = GenScaledTensor(n, h, s, d); + InitTensor(miopenTensorMhaDO, std::move(dO_val)); + + InitTensor(miopenTensorMhaO, std::move(oDesc)); + InitTensor(miopenTensorMhaM, std::move(mDesc)); + InitTensor(miopenTensorMhaZInv, std::move(zInvDesc)); + + float dS_scale = 1.f; + // clang-tidy complains about the same expression on both sides of "/": 1.f / 1.f + float dS_descale = 1.f; // / dS_scale; + + float dQ_scale = 1.f; + float dK_scale = 1.f; + float dV_scale = 1.f; + + InitTensor(miopenTensorMhaDescaleO, + tensor{1, 1, 1, 1}.generate([=](auto...) { return o_descale; })); + InitTensor(miopenTensorMhaDescaleDO, + tensor{1, 1, 1, 1}.generate([=](auto...) { return dO_descale; })); + InitTensor(miopenTensorMhaDescaleDS, + tensor{1, 1, 1, 1}.generate([=](auto...) { return dS_descale; })); + InitTensor(miopenTensorMhaScaleDS, + tensor{1, 1, 1, 1}.generate([=](auto...) { return dS_scale; })); + InitTensor(miopenTensorMhaScaleDQ, + tensor{1, 1, 1, 1}.generate([=](auto...) { return dQ_scale; })); + InitTensor(miopenTensorMhaScaleDK, + tensor{1, 1, 1, 1}.generate([=](auto...) { return dK_scale; })); + InitTensor(miopenTensorMhaScaleDV, + tensor{1, 1, 1, 1}.generate([=](auto...) 
{ return dV_scale; })); + + InitTensor(miopenTensorMhaDQ, tensor{n, h, s, d}); + InitTensor(miopenTensorMhaDK, tensor{n, h, s, d}); + InitTensor(miopenTensorMhaDV, tensor{n, h, s, d}); + InitTensor(miopenTensorMhaAmaxDQ, tensor{1, 1, 1, 1}); + InitTensor(miopenTensorMhaAmaxDK, tensor{1, 1, 1, 1}); + InitTensor(miopenTensorMhaAmaxDV, tensor{1, 1, 1, 1}); + InitTensor(miopenTensorMhaAmaxDS, tensor{1, 1, 1, 1}); + + for(size_t i = 0; i < descVector.size(); ++i) + { + args[i].descriptor = &descVector[i]; + } + + dQDesc_ref = tensor{n, h, s, d}; + dKDesc_ref = tensor{n, h, s, d}; + dVDesc_ref = tensor{n, h, s, d}; + + test::cpu::MultiHeadAttentionBackwardDataf8( + std::get>(tensors[miopenTensorMhaQ]->m_cpu_tensor), + std::get>(tensors[miopenTensorMhaK]->m_cpu_tensor), + std::get>(tensors[miopenTensorMhaV]->m_cpu_tensor), + std::get>(tensors[miopenTensorMhaO]->m_cpu_tensor), + std::get>(tensors[miopenTensorMhaDO]->m_cpu_tensor), + softmax, + q_descale, + k_descale, + v_descale, + dQ_scale, + dK_scale, + dV_scale, + s_scale, + s_descale, + dS_scale, + dS_descale, + o_descale, + dO_descale, + amax_dS_ref, + amax_dQ_ref, + amax_dK_ref, + amax_dV_ref, + dQDesc_ref, + dKDesc_ref, + dVDesc_ref); + } + + void TearDown() override + { + if(problem) + { + ASSERT_EQ(miopenDestroyProblem(problem), miopenStatusSuccess); + } + } + + std::map> tensors; + std::vector descVector; + std::vector args; + + tensor dQDesc_ref; + tensor dKDesc_ref; + tensor dVDesc_ref; + float amax_dQ_ref; + float amax_dK_ref; + float amax_dV_ref; + float amax_dS_ref; + + MhaDescriptor mha_descriptor; + miopenProblem_t problem = nullptr; +}; + +TEST_P(Test_Bwd_Mha, Test_float) +{ + Handle& handle = get_handle(); + + auto FindSolutions = [&handle](miopenProblem_t problem) { + std::size_t found; + std::vector solutions(16); + if(miopenFindSolutions( + &handle, problem, nullptr, solutions.data(), &found, solutions.size()) != + miopenStatusSuccess) + { + found = 0; + } + + solutions.resize(found); + return 
solutions; + }; + + std::vector solutions = FindSolutions(problem); + ASSERT_GT(solutions.size(), 0); + + size_t workspace_size = 0; + Workspace workspace; + + auto GetResult = [this, &handle](miopenTensorArgumentId_t id) { + auto& tensorStructPtr = tensors[id]; + auto& cpu_tensor = std::get>(tensorStructPtr->m_cpu_tensor); + + cpu_tensor.data = handle.Read(tensorStructPtr->m_gpu_buffer, cpu_tensor.data.size()); + + return cpu_tensor; + }; + + const double error_threshold = 5e-6; + + auto checkAmax = [GetResult, error_threshold]( + miopenTensorArgumentId_t id, std::string_view name, float refAmax) { + const auto& resAmax = GetResult(id); + float amax_rel_diff = std::abs(refAmax - resAmax[0]); + float divisor = std::min(refAmax, resAmax[0]); + amax_rel_diff /= divisor > std::numeric_limits::min() ? divisor : 1.0f; + EXPECT_LT(amax_rel_diff, error_threshold) + << name << " ref: " << refAmax << " result: " << resAmax[0]; + }; + + auto checkOutput = [GetResult, error_threshold]( + miopenTensorArgumentId_t id, std::string_view name, const auto& ref) { + EXPECT_LT(miopen::rms_range(ref, GetResult(id)), error_threshold) << name; + }; + + for(const auto& solution : solutions) + { + miopenGetSolutionWorkspaceSize(solution, &workspace_size); + workspace.resize(workspace_size); + + ASSERT_EQ( + miopenRunSolution( + &handle, solution, args.size(), args.data(), workspace.ptr(), workspace.size()), + miopenStatusSuccess); + + checkAmax(miopenTensorMhaAmaxDQ, "amax dQ", amax_dQ_ref); + checkAmax(miopenTensorMhaAmaxDK, "amax dK", amax_dK_ref); + checkAmax(miopenTensorMhaAmaxDV, "amax dV", amax_dV_ref); + checkAmax(miopenTensorMhaAmaxDS, "amax dS", amax_dS_ref); + + checkOutput(miopenTensorMhaDQ, "tensor dQ", dQDesc_ref); + checkOutput(miopenTensorMhaDK, "tensor dK", dKDesc_ref); + checkOutput(miopenTensorMhaDV, "tensor dV", dVDesc_ref); + } +}; + +INSTANTIATE_TEST_SUITE_P(Bwd_Mha_Smoke, Test_Bwd_Mha, testing::ValuesIn(GetSmokeTestCases())); + +INSTANTIATE_TEST_SUITE_P(Bwd_Mha_Full, 
Test_Bwd_Mha, testing::ValuesIn(GetFullTestCases())); + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Test_Bwd_Mha); diff --git a/test/gtest/gpu_mha_forward.cpp b/test/gtest/gpu_mha_forward.cpp new file mode 100644 index 0000000000..aea0a84e4a --- /dev/null +++ b/test/gtest/gpu_mha_forward.cpp @@ -0,0 +1,350 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "get_handle.hpp" +#include "mha_helper.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include "../workspace.hpp" + +#include +#include + +#include + +#include +#include +#include +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +using namespace miopen; +namespace { +inline bool CheckFloatArg(std::string_view arg) +{ + const std::string& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + return tmp.empty() || tmp == arg; +} + +struct TensorStruct +{ + template + TensorStruct(tensor&& val) : m_cpu_tensor(std::move(val)) + { + } + + TensorStruct(const TensorStruct&) = delete; + TensorStruct& operator=(const TensorStruct&) = delete; + + ~TensorStruct() = default; + + std::variant, tensor> m_cpu_tensor; + Allocator::ManageDataPtr m_gpu_buffer; +}; + +struct TestCase +{ + size_t n; + size_t h; + size_t s; + size_t d; + float dropout; +}; + +inline std::vector GetSmokeTestCases() +{ + if(CheckFloatArg("--float")) + { + return { + {9, 8, 8, 8, 0.0f}, + {1, 2, 4, 5, 0.0f}, + {2, 1, 5, 4, 0.0f}, + {4, 2, 1, 3, 0.0f}, + {5, 3, 4, 1, 0.0f}, + {1, 2, 65, 5, 0.0f}, + {2, 1, 67, 4, 0.0f}, + {8, 7, 68, 1, 0.0f}, + {1, 2, 257, 5, 0.0f}, + {2, 1, 259, 4, 0.0f}, + {8, 7, 270, 1, 0.0f}, + {1, 1, 1, 1, 0.0f}, + {3, 5, 32, 7, 0.8f}, + {2, 2, 64, 128, 0.8f}, + {2, 1, 128, 4, 0.8f}, + {2, 7, 256, 31, 0.8f}, + }; + } + else + { + return {}; + } +} + +inline std::vector GetFullTestCases() +{ + if((miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) && + CheckFloatArg("--float")) + { + return { + {3, 15, 2047, 15, 0.0f}, + {2049, 17, 7, 7, 0.0f}, + {3, 3, 257, 91, 0.0f}, + {11, 150, 255, 31, 0.0f}, + {9, 3, 129, 1023, 0.0f}, + {3, 15, 31, 2047, 0.0f}, + {2049, 17, 32, 7, 0.2f}, + {11, 150, 256, 31, 0.4f}, + }; + } + else + { + return {}; + } +} +} // namespace +class Test_Fwd_Mha : 
public testing::TestWithParam +{ +protected: + void SetUp() override + { + prng::reset_seed(); + auto [n, h, s, d, drop] = GetParam(); + Handle& handle = get_handle(); + + if((drop > 0.0f) && (s % handle.GetWavefrontWidth() != 0)) + { + GTEST_SKIP() << "CPU Dropout currently supprorts only fully occupied warps"; + } + + mha_descriptor.SetParams(1); + ASSERT_EQ(miopenCreateMhaProblem(&problem, &mha_descriptor, miopenProblemDirectionForward), + miopenStatusSuccess); + + auto InitTensor = [this, &handle](miopenTensorArgumentId_t id, auto&& tensor) { + auto tmp = std::make_unique(std::move(tensor)); + std::visit( + [this, id, &handle, &gpu_buff = tmp->m_gpu_buffer](auto&& cpu_tensor) { + EXPECT_EQ(miopenSetProblemTensorDescriptor(problem, id, &cpu_tensor.desc), + miopenStatusSuccess); + + gpu_buff = handle.Write(cpu_tensor.data); + descVector.push_back(&(cpu_tensor.desc)); + }, + tmp->m_cpu_tensor); + + args.emplace_back(); + args.back().id = id; + // args.back().descriptor will be filled later + args.back().buffer = tmp->m_gpu_buffer.get(); + + tensors[id] = std::move(tmp); + }; + + auto GenScaledTensor = [](auto... nhsd) { + float bias = prng::gen_A_to_B(-3.0f, 3.0f); + auto val_full = tensor{nhsd...}.generate( + [bias](auto...) 
{ return prng::gen_A_to_B(-2.5f + bias, 2.5f + bias); }); + auto val_scaled = tensor{nhsd...}; + float scale = test::cpu::GetF8Scaling(test::cpu::AbsoluteMax(val_full)); + float descale = 1.f / scale; + test::cpu::ScaleMult(val_full, scale, val_scaled); + return std::tuple{val_scaled, scale, descale}; + }; + + float q_scale; + float q_descale; + tensor q_val; + std::tie(q_val, q_scale, q_descale) = GenScaledTensor(n, h, s, d); + InitTensor(miopenTensorMhaQ, std::move(q_val)); + + float k_scale; + float k_descale; + tensor k_val; + std::tie(k_val, k_scale, k_descale) = GenScaledTensor(n, h, s, d); + InitTensor(miopenTensorMhaK, std::move(k_val)); + + float v_scale; + float v_descale; + tensor v_val; + std::tie(v_val, v_scale, v_descale) = GenScaledTensor(n, h, s, d); + InitTensor(miopenTensorMhaV, std::move(v_val)); + + float s_scale = 1.f; + // clang-tidy complains about the same expression on both sides of "/": 1.f / 1.f + float s_descale = 1.f; // / s_scale; + + float o_scale = 1.f; + // clang-tidy complains about the same expression on both sides of "/": 1.f / 1.f + + InitTensor(miopenTensorMhaDescaleQ, + tensor{1, 1, 1, 1}.generate([=](auto...) { return q_descale; })); + InitTensor(miopenTensorMhaDescaleK, + tensor{1, 1, 1, 1}.generate([=](auto...) { return k_descale; })); + InitTensor(miopenTensorMhaDescaleV, + tensor{1, 1, 1, 1}.generate([=](auto...) { return v_descale; })); + InitTensor(miopenTensorMhaDescaleS, + tensor{1, 1, 1, 1}.generate([=](auto...) { return s_descale; })); + InitTensor(miopenTensorMhaScaleS, + tensor{1, 1, 1, 1}.generate([=](auto...) { return s_scale; })); + InitTensor(miopenTensorMhaScaleO, + tensor{1, 1, 1, 1}.generate([=](auto...) { return o_scale; })); + + InitTensor(miopenTensorMhaDropoutProbability, + tensor{1, 1, 1, 1}.generate([rate = drop](auto...) { return rate; })); + InitTensor(miopenTensorMhaDropoutSeed, + tensor{1, 1, 1, 2}.generate([](auto...) 
{ return 0; })); + InitTensor(miopenTensorMhaDropoutOffset, + tensor{1, 1, 1, 2}.generate([](auto...) { return 0; })); + + InitTensor(miopenTensorMhaO, tensor{n, h, s, d}); + InitTensor(miopenTensorMhaAmaxO, tensor{1, 1, 1, 1}); + InitTensor(miopenTensorMhaAmaxS, tensor{1, 1, 1, 1}); + InitTensor(miopenTensorMhaM, tensor{n, h, s, 1}); + InitTensor(miopenTensorMhaZInv, tensor{n, h, s, 1}); + + for(size_t i = 0; i < descVector.size(); ++i) + { + args[i].descriptor = &descVector[i]; + } + + tensor q_dot_k_transpose{n, h, s, s}; + + softmax_ref = tensor{n, h, s, s}; + oDesc_ref = tensor{n, h, s, d}; + mDesc_ref = tensor{n, h, s, 1}; + zInvDesc_ref = tensor{n, h, s, 1}; + + test::cpu::MultiHeadAttentionfp8( + std::get>(tensors[miopenTensorMhaQ]->m_cpu_tensor), + std::get>(tensors[miopenTensorMhaK]->m_cpu_tensor), + std::get>(tensors[miopenTensorMhaV]->m_cpu_tensor), + softmax_ref, + mDesc_ref, + zInvDesc_ref, + q_descale, + k_descale, + v_descale, + s_descale, + s_scale, + o_scale, + drop, + 0, + 0, + amaxS_ref, + amaxO_ref, + oDesc_ref); + } + + void TearDown() override + { + if(problem) + { + ASSERT_EQ(miopenDestroyProblem(problem), miopenStatusSuccess); + } + } + + std::map> tensors; + std::vector descVector; + std::vector args; + + // ref data + tensor softmax_ref; + tensor oDesc_ref; + tensor mDesc_ref; + tensor zInvDesc_ref; + float amaxS_ref; + float amaxO_ref; + + MhaDescriptor mha_descriptor; + miopenProblem_t problem = nullptr; +}; + +TEST_P(Test_Fwd_Mha, Test_float) +{ + Handle& handle = get_handle(); + + std::vector solutions(16); + std::size_t found; + + ASSERT_EQ( + miopenFindSolutions(&handle, problem, nullptr, solutions.data(), &found, solutions.size()), + miopenStatusSuccess); + ASSERT_GT(found, 0); + solutions.resize(found); + + size_t workspace_size = 0; + Workspace workspace; + + for(const auto& solution : solutions) + { + miopenGetSolutionWorkspaceSize(solution, &workspace_size); + workspace.resize(workspace_size); + + ASSERT_EQ( + 
miopenRunSolution( + &handle, solution, args.size(), args.data(), workspace.ptr(), workspace.size()), + miopenStatusSuccess); + + auto GetResult = [this, &handle](miopenTensorArgumentId_t id) { + auto& tensorStructPtr = tensors[id]; + auto& cpu_tensor = std::get>(tensorStructPtr->m_cpu_tensor); + + cpu_tensor.data = + handle.Read(tensorStructPtr->m_gpu_buffer, cpu_tensor.data.size()); + + return cpu_tensor; + }; + + const double error_threshold = 5e-6; + + const auto& resAmaxS = GetResult(miopenTensorMhaAmaxS); + auto amaxS_abs_diff = std::abs(amaxS_ref - resAmaxS[0]); + EXPECT_LT(amaxS_abs_diff, error_threshold) + << " ref: " << amaxS_ref << " result: " << resAmaxS[0]; + + const auto& resAmaxO = GetResult(miopenTensorMhaAmaxO); + auto amaxO_abs_diff = std::abs(amaxO_ref - resAmaxO[0]); + EXPECT_LT(amaxO_abs_diff, error_threshold) + << " ref: " << amaxO_ref << " result: " << resAmaxO[0]; + + double M_error = miopen::rms_range(mDesc_ref, GetResult(miopenTensorMhaM)); + EXPECT_LT(M_error, error_threshold); + + double ZInv_error = miopen::rms_range(zInvDesc_ref, GetResult(miopenTensorMhaZInv)); + EXPECT_LT(ZInv_error, error_threshold); + + double O_error = miopen::rms_range(oDesc_ref, GetResult(miopenTensorMhaO)); + EXPECT_LT(O_error, error_threshold); + } +}; + +INSTANTIATE_TEST_SUITE_P(Fwd_Mha_Smoke, Test_Fwd_Mha, testing::ValuesIn(GetSmokeTestCases())); + +INSTANTIATE_TEST_SUITE_P(Fwd_Mha_Full, Test_Fwd_Mha, testing::ValuesIn(GetFullTestCases())); + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Test_Fwd_Mha); diff --git a/test/gtest/gtest_common.hpp b/test/gtest/gtest_common.hpp index 3a52e4cfe3..b49d0f32ef 100644 --- a/test/gtest/gtest_common.hpp +++ b/test/gtest/gtest_common.hpp @@ -103,7 +103,7 @@ bool IsTestSupportedForDevMask() if constexpr(test(Gpu::gfx908)) res = res || (dev == "gfx908"); if constexpr(test(Gpu::gfx90A)) - res = res || (dev == "gfx90A"); + res = res || (dev == "gfx90a"); if constexpr(test(Gpu::gfx94X)) res = res || (miopen::StartsWith(dev, 
"gfx94")); if constexpr(test(Gpu::gfx103X)) diff --git a/test/gtest/layernorm.cpp b/test/gtest/layernorm.cpp index 91f918e2c6..13332d0f80 100644 --- a/test/gtest/layernorm.cpp +++ b/test/gtest/layernorm.cpp @@ -46,6 +46,14 @@ struct LayerNormTestFloat : LayerNormTest { }; +struct LayerNormTestHalf : LayerNormTest +{ +}; + +struct LayerNormTestBFloat16 : LayerNormTest +{ +}; + } // namespace layernorm using namespace layernorm; @@ -67,6 +75,48 @@ TEST_P(LayerNormTestFloat, LayerNormTestFw) } }; +TEST_P(LayerNormTestHalf, LayerNormTestFw) +{ + auto TypeArg = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + const auto& handle = get_handle(); + if((miopen::StartsWith(handle.GetDeviceName(), "gfx908") || + miopen::StartsWith(handle.GetDeviceName(), "gfx90a") || + miopen::StartsWith(handle.GetDeviceName(), "gfx94")) && + miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(LayerNormTestBFloat16, LayerNormTestFw) +{ + auto TypeArg = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + const auto& handle = get_handle(); + if((miopen::StartsWith(handle.GetDeviceName(), "gfx908") || + miopen::StartsWith(handle.GetDeviceName(), "gfx90a") || + miopen::StartsWith(handle.GetDeviceName(), "gfx94")) && + miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + INSTANTIATE_TEST_SUITE_P(LayerNormTestSet, LayerNormTestFloat, testing::ValuesIn(LayerNormTestConfigs())); +INSTANTIATE_TEST_SUITE_P(LayerNormTestSet, + LayerNormTestHalf, + testing::ValuesIn(LayerNormTestConfigs())); +INSTANTIATE_TEST_SUITE_P(LayerNormTestSet, + LayerNormTestBFloat16, + testing::ValuesIn(LayerNormTestConfigs())); diff --git a/test/gtest/layernorm.hpp b/test/gtest/layernorm.hpp index 454b826adb..a50fe031de 100644 --- a/test/gtest/layernorm.hpp +++ b/test/gtest/layernorm.hpp @@ -25,7 +25,6 @@ 
*******************************************************************************/ #include "../driver/tensor_driver.hpp" -#include "cpu_layernorm.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" @@ -34,6 +33,58 @@ #include #include +template +void cpu_layernorm_forward(tensor input, + tensor weight, + tensor bias, + tensor& ref_output, + tensor& ref_mean, + tensor& ref_rstd, + float eps, + int32_t dim, + miopenNormMode_t mode) +{ + auto dims = input.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + size_t i = 0; + for(; i < dim; i++) + { + outer_size *= dims[i]; + } + + for(; i < dims.size(); i++) + { + inner_size *= dims[i]; + } + + par_ford(outer_size)([&](int32_t o) { + float mean_v = 0; + float var_v = 0; + + ford(inner_size)([&](int32_t i) { + float tmp = static_cast(input[o * inner_size + i]); + mean_v += tmp; + var_v += tmp * tmp; + }); + + mean_v = mean_v / inner_size; + var_v = var_v / inner_size - mean_v * mean_v; + float rstd_v = 1 / sqrt(var_v + eps); + + ref_mean[o] = static_cast(mean_v); + ref_rstd[o] = static_cast(rstd_v); + + ford(inner_size)([&](int32_t i) { + float weight_v = mode ? static_cast(weight[i]) : 1; + float bias_v = mode ? 
static_cast(bias[i]) : 0; + ref_output[o * inner_size + i] = static_cast( + (static_cast(input[o * inner_size + i]) - mean_v) * rstd_v * weight_v + + bias_v); + }); + }); +} + struct LayerNormTestCase { size_t N; @@ -81,72 +132,72 @@ std::vector LayerNormTestConfigs() { // n c d h w nomalized_dim eps ln_mode // clang-format off return { - { 32, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch - { 32, 1, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 32, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 32, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 32, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 256, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch - { 256, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 256, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 256, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 512, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch - { 512, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 512, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 512, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 32, 2, 32, 57, 125 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // Hand-gesture recognition CVPR 2015 paper High Res Net Path - { 32, 32, 14, 25, 59 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 32, 32, 6, 10, 27 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 32, 32, 4, 6, 11 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 32, 32, 2, 2, 3 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 32, 32, 32, 28, 62 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path - { 32, 32, 14, 12, 29 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 32, 32, 6, 4, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 32, 32, 4, 2, 2 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - { 16, 32, 6, 50, 50 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // Multi-view 3D convnet - { 
1, 3, 8, 240, 320 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video - { 1, 3, 16, 240, 320 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video - { 1, 3, 8, 128, 171 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video - { 1, 3, 16, 128, 171 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video - { 1, 3, 8, 112, 112 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video - { 1, 3, 16, 112, 112 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video - { 32, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // 32x32x32 based on VoxNet arch - { 32, 1, 14, 14, 14 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 32, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 32, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 32, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 256, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // 32x32x32 based on VoxNet arch - { 256, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 256, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 256, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 512, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // 32x32x32 based on VoxNet arch - { 512, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 512, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 512, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 32, 2, 32, 57, 125 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // Hand-gesture recognition CVPR 2015 paper High Res Net Path - { 32, 32, 14, 25, 59 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 32, 32, 6, 10, 27 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 32, 32, 4, 6, 11 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 32, 32, 2, 2, 3 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 32, 32, 32, 28, 62 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path - { 32, 32, 14, 12, 29 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 32, 32, 6, 4, 12 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 32, 32, 4, 2, 2 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, - { 16, 32, 6, 50, 50 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // Multi-view 3D convnet - { 1, 
3, 8, 240, 320 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video - { 1, 3, 16, 240, 320 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video - { 1, 3, 8, 128, 171 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video - { 1, 3, 16, 128, 171 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video - { 1, 3, 8, 112, 112 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video - { 1, 3, 16, 112, 112 ,4 , 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video - {32, 4, 0, 4, 256 ,1 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - {64, 4, 0, 4, 256 ,1 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - {32, 4, 0, 4, 256 ,1 , 1e-5, MIOPEN_WEIGHT_BIAS}, - {64, 4, 0, 4, 256 ,1 , 1e-5, MIOPEN_WEIGHT_BIAS}, - {32, 0, 0, 0, 256 ,1 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - {64, 0, 0, 0, 256 ,1 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, - {32, 0, 0, 0, 256 ,1 , 1e-5, MIOPEN_WEIGHT_BIAS}, - {64, 0, 0, 0, 256 ,1 , 1e-5, MIOPEN_WEIGHT_BIAS} + { 32, 1, 32, 32, 32 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch + { 32, 1, 14, 14, 14 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 12, 12, 12 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 256, 1, 32, 32, 32 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch + { 256, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 256, 32, 12, 12, 12 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 256, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 512, 1, 32, 32, 32 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch + { 512, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 512, 32, 12, 12, 12 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 512, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 2, 32, 57, 125 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { 32, 32, 14, 25, 59 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 6, 10, 
27 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 4, 6, 11 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 2, 2, 3 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 32, 28, 62 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { 32, 32, 14, 12, 29 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 6, 4, 12 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 4, 2, 2 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 16, 32, 6, 50, 50 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // Multi-view 3D convnet + { 1, 3, 8, 240, 320 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video + { 1, 3, 16, 240, 320 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video + { 1, 3, 8, 128, 171 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video + { 1, 3, 16, 128, 171 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video + { 1, 3, 8, 112, 112 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video + { 1, 3, 16, 112, 112 , 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 3D convet on video + { 32, 1, 32, 32, 32 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // 32x32x32 based on VoxNet arch + { 32, 1, 14, 14, 14 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 32, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 32, 32, 12, 12, 12 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 32, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 256, 1, 32, 32, 32 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // 32x32x32 based on VoxNet arch + { 256, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 256, 32, 12, 12, 12 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 256, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 512, 1, 32, 32, 32 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // 32x32x32 based on VoxNet arch + { 512, 32, 14, 14, 14 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 512, 32, 12, 12, 12 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 512, 32, 6, 6, 6 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 32, 2, 32, 57, 125 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { 32, 32, 14, 25, 
59 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 32, 32, 6, 10, 27 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 32, 32, 4, 6, 11 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 32, 32, 2, 2, 3 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 32, 32, 32, 28, 62 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { 32, 32, 14, 12, 29 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 32, 32, 6, 4, 12 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 32, 32, 4, 2, 2 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, + { 16, 32, 6, 50, 50 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // Multi-view 3D convnet + { 1, 3, 8, 240, 320 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video + { 1, 3, 16, 240, 320 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video + { 1, 3, 8, 128, 171 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video + { 1, 3, 16, 128, 171 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video + { 1, 3, 8, 112, 112 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video + { 1, 3, 16, 112, 112 , 4, 1e-5, MIOPEN_WEIGHT_BIAS}, // 3D convet on video + {32, 4, 0, 4, 256 , 1, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + {64, 4, 0, 4, 256 , 1, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + {32, 4, 0, 4, 256 , 1, 1e-5, MIOPEN_WEIGHT_BIAS}, + {64, 4, 0, 4, 256 , 1, 1e-5, MIOPEN_WEIGHT_BIAS}, + {32, 0, 0, 0, 256 , 1, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + {64, 0, 0, 0, 256 , 1, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + {32, 0, 0, 0, 256 , 1, 1e-5, MIOPEN_WEIGHT_BIAS}, + {64, 0, 0, 0, 256 , 1, 1e-5, MIOPEN_WEIGHT_BIAS} }; // clang-format on } @@ -249,22 +300,28 @@ struct LayerNormTest : public ::testing::TestWithParam void Verify() { - double threshold = std::numeric_limits::epsilon(); - auto error = miopen::rms_range(ref_output, output); + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto threshold = std::is_same::value ? 1.5e-5 : 8.2e-2; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + threshold *= 8.0; + auto error = miopen::rms_range(ref_output, output); EXPECT_TRUE(miopen::range_distance(ref_output) == miopen::range_distance(output)); - EXPECT_TRUE(error < threshold * 1000) << "Error output beyond tolerance Error:" << error - << ", Thresholdx1000: " << threshold * 1000; + EXPECT_TRUE(error < threshold) + << "Error output beyond tolerance Error:" << error << ", Threshold: " << threshold; error = miopen::rms_range(ref_mean, mean); EXPECT_TRUE(miopen::range_distance(ref_mean) == miopen::range_distance(mean)); - EXPECT_TRUE(error < threshold * 20) << "Error mean beyond tolerance Error:" << error - << ", Thresholdx20: " << threshold * 20; + EXPECT_TRUE(error < threshold) + << "Error mean beyond tolerance Error:" << error << ", Threshold: " << threshold; error = miopen::rms_range(ref_rstd, rstd); EXPECT_TRUE(miopen::range_distance(ref_rstd) == miopen::range_distance(rstd)); - EXPECT_TRUE(error < threshold * 2000) << "Error rstd beyond tolerance Error:" << error - << ", Thresholdx2000: " << threshold * 2000; + EXPECT_TRUE(error < threshold * 4) << "Error rstd beyond tolerance Error:" << error + << ", Threshold x 4: " << threshold * 4; } LayerNormTestCase layernorm_config; diff --git a/test/gtest/reduce_custom_fp32.cpp b/test/gtest/reduce_custom_fp32.cpp index 17a1ff2beb..848cd62109 100644 --- a/test/gtest/reduce_custom_fp32.cpp +++ b/test/gtest/reduce_custom_fp32.cpp @@ -30,6 +30,7 @@ #include #include "get_handle.hpp" +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) namespace reduce_custom_fp32 { @@ -48,7 +49,7 @@ std::vector GetTestCases(void) // clang-format off return std::vector{ - {cmd + float_arg + " --scales 1 0 --CompType 1 --D 1024 30528 1 --I 0 --N 1 ---ReduceOp 0 --R 0 1 2"} + {cmd + float_arg + " --scales 1 0 --CompType 1 --D 1024 30528 1 --I 0 --N 1 --ReduceOp 0 --R 0 1 2"} }; // clang-format on } @@ -69,7 +70,9 @@ bool IsTestSupportedForDevice() void 
Run2dDriver(void) { if(!(IsTestSupportedForDevice() && - miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)) == "--float")) + (miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) // standalone run + || (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) // or --float full tests enabled + && miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)) == "--float")))) { GTEST_SKIP(); } diff --git a/test/gtest/reduce_custom_fp32_fp16.cpp b/test/gtest/reduce_custom_fp32_fp16.cpp index 819bf3f80a..c6220f0e3d 100644 --- a/test/gtest/reduce_custom_fp32_fp16.cpp +++ b/test/gtest/reduce_custom_fp32_fp16.cpp @@ -48,26 +48,26 @@ std::vector GetTestCases(const std::string& precision) // clang-format off return std::vector{ - {cmd + " --scales 1 0 --CompType 1 --D 8 2 1 --I 0 --N 1 ---ReduceOp 0 --R 0 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 160 10 1 --I 0 --N 1 ---ReduceOp 0 --R 0 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 7 1024 1 --I 0 --N 1 ---ReduceOp 0 --R 0 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 3 1 1 --I 0 --N 1 ---ReduceOp 0 --R 0 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 3 1 1 --I 0 --N 1 ---ReduceOp 1 --R 0 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 3 1 1 --I 1 --N 1 ---ReduceOp 3 --R 0 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 3 2 1 --I 1 --N 1 ---ReduceOp 3 --R 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 6 2 1 --I 0 --N 1 ---ReduceOp 3 --R 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 6 2 1 --I 0 --N 1 ---ReduceOp 2 --R 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 2 2 1 --I 0 --N 1 ---ReduceOp 0 --R 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 4 3 1 --I 0 --N 1 ---ReduceOp 3 --R 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 3 4 1 --I 0 --N 1 ---ReduceOp 3 --R 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 3 4 1 --I 0 --N 1 ---ReduceOp 3 --R 0 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 2048 32 1 --I 0 --N 1 ---ReduceOp 3 --R 0 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 4 3 1 --I 0 --N 1 ---ReduceOp 2 --R 1 2"}, - {cmd + " --scales 1 0 --CompType 
1 --D 3 4 1 --I 0 --N 1 ---ReduceOp 2 --R 0 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 2048 32 1 --I 0 --N 1 ---ReduceOp 2 --R 0 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 3 4 1 --I 0 --N 1 ---ReduceOp 2 --R 0 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 12 11 1 --I 0 --N 1 ---ReduceOp 0 --R 0 1 2"}, - {cmd + " --scales 1 0 --CompType 1 --D 13 4 7 7 --I 0 --N 1 ---ReduceOp 0 --R 0 1 2 3"}, + {cmd + " --scales 1 0 --CompType 1 --D 8 2 1 --I 0 --N 1 --ReduceOp 0 --R 0 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 160 10 1 --I 0 --N 1 --ReduceOp 0 --R 0 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 7 1024 1 --I 0 --N 1 --ReduceOp 0 --R 0 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 3 1 1 --I 0 --N 1 --ReduceOp 0 --R 0 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 3 1 1 --I 0 --N 1 --ReduceOp 1 --R 0 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 3 1 1 --I 1 --N 1 --ReduceOp 3 --R 0 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 3 2 1 --I 1 --N 1 --ReduceOp 3 --R 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 6 2 1 --I 0 --N 1 --ReduceOp 3 --R 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 6 2 1 --I 0 --N 1 --ReduceOp 2 --R 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 2 2 1 --I 0 --N 1 --ReduceOp 0 --R 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 4 3 1 --I 0 --N 1 --ReduceOp 3 --R 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 3 4 1 --I 0 --N 1 --ReduceOp 3 --R 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 3 4 1 --I 0 --N 1 --ReduceOp 3 --R 0 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 2048 32 1 --I 0 --N 1 --ReduceOp 3 --R 0 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 4 3 1 --I 0 --N 1 --ReduceOp 2 --R 1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 3 4 1 --I 0 --N 1 --ReduceOp 2 --R 0 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 2048 32 1 --I 0 --N 1 --ReduceOp 2 --R 0 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 3 4 1 --I 0 --N 1 --ReduceOp 2 --R 0 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 12 11 1 --I 0 --N 1 --ReduceOp 0 --R 0 
1 2"}, + {cmd + " --scales 1 0 --CompType 1 --D 13 4 7 7 --I 0 --N 1 --ReduceOp 0 --R 0 1 2 3"}, {cmd + " --scales 1 0 --CompType 1 --D 64 3 280 81 --I 0 --N 0 --ReduceOp 0 --R 0"} }; // clang-format on diff --git a/test/gtest/reduceextreme.cpp b/test/gtest/reduceextreme.cpp new file mode 100644 index 0000000000..e9e4fb3334 --- /dev/null +++ b/test/gtest/reduceextreme.cpp @@ -0,0 +1,134 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "reduceextreme.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace reduceextreme { + +std::string GetFloatArg() +{ + const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct ReduceExtremeTestFloat : ReduceExtremeTest +{ +}; + +struct ReduceExtremeTestHalf : ReduceExtremeTest +{ +}; + +struct ReduceExtremeTestBFloat16 : ReduceExtremeTest +{ +}; + +} // namespace reduceextreme +using namespace reduceextreme; + +TEST_P(ReduceExtremeTestFloat, ReduceExtremeTestFw) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(ReduceExtremeTestHalf, ReduceExtremeTestFw) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(ReduceExtremeTestBFloat16, ReduceExtremeTestFw) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetMIN, + ReduceExtremeTestFloat, + testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_MIN))); +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetMAX, + ReduceExtremeTestFloat, + testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_MAX))); +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetARGMIN, + ReduceExtremeTestFloat, + testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_ARGMIN))); +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetARGMAX, + ReduceExtremeTestFloat, + testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_ARGMAX))); +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetMIN, + ReduceExtremeTestHalf, + 
testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_MIN))); +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetMAX, + ReduceExtremeTestHalf, + testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_MAX))); +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetARGMIN, + ReduceExtremeTestHalf, + testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_ARGMIN))); +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetARGMAX, + ReduceExtremeTestHalf, + testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_ARGMAX))); +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetMIN, + ReduceExtremeTestBFloat16, + testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_MIN))); +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetMAX, + ReduceExtremeTestBFloat16, + testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_MAX))); +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetARGMIN, + ReduceExtremeTestBFloat16, + testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_ARGMIN))); +INSTANTIATE_TEST_SUITE_P(ReduceExtremeTestSetARGMAX, + ReduceExtremeTestBFloat16, + testing::ValuesIn(ReduceExtremeTestConfigs(MIOPEN_REDUCE_EXTREME_ARGMAX))); diff --git a/test/gtest/reduceextreme.hpp b/test/gtest/reduceextreme.hpp new file mode 100644 index 0000000000..deed82116b --- /dev/null +++ b/test/gtest/reduceextreme.hpp @@ -0,0 +1,326 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "../driver/tensor_driver.hpp" +#include "../src/kernels/MIOpenReduceExtreme.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include + +template +bool compare_equal(T r1, T r2) +{ + return r1 == r2; +} + +template +void cpu_extreme_forward(tensor input, + tensor& ref_output, + tensor& ref_indice, + int32_t dim, + miopenReduceExtremeOp_t reduceExtremeOp) +{ + auto input_dims = input.desc.GetLengths(); + std::vector output_dims; + + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX) || + reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) + output_dims = ref_output.desc.GetLengths(); + else + output_dims = ref_indice.desc.GetLengths(); + + auto reduce_size = input_dims[dim]; + auto output_numel = + std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies()); + + auto inner_size = std::accumulate( + input_dims.begin() + dim + 1, input_dims.end(), 1ULL, std::multiplies()); + + par_ford(output_numel)([&](size_t o) { + size_t input_idx = (o / inner_size) * inner_size * reduce_size + o % inner_size; + + int32_t extreme_idx = 0; + T extreme = input[input_idx]; + + ford(reduce_size)([&](size_t i) { + T val = input[input_idx]; + reduce_func{}.calculate(extreme, val, extreme_idx, i); + input_idx += inner_size; + }); + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX) || + reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) + ref_output[o] = extreme; + ref_indice[o] = extreme_idx; + }); +} + +struct ReduceExtremeTestCase +{ + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + int32_t dim; + miopenReduceExtremeOp_t reduceExtremeOp; + friend std::ostream& operator<<(std::ostream& os, const ReduceExtremeTestCase& tc) + { + return os << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H + << " W:" << tc.W << " dim:" << tc.dim + << " reduceExtremeOp:" << tc.reduceExtremeOp; + 
} + + std::vector GetInput() + { + if((N != 0) && (C != 0) && (D != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, D, H, W}); + } + else if((N != 0) && (C != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, H, W}); + } + else if((N != 0) && (C != 0) && (W != 0)) + { + return std::vector({N, C, W}); + } + else if((N != 0) && (W != 0)) + { + return std::vector({N, W}); + } + else if((N != 0)) + { + return std::vector({N}); + } + else + { + std::cout << "Error Input Tensor Lengths\n" << std::endl; + return std::vector({0}); + } + } +}; + +std::vector ReduceExtremeTestConfigs(miopenReduceExtremeOp_t reduceExtremeOp) +{ // n c d h w dim + // clang-format off + if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) + { + return { + { 2, 0, 0, 0, 242991, 0 , MIOPEN_REDUCE_EXTREME_MIN}, //maskrcnn + { 4, 0, 0, 0, 2004, 0 , MIOPEN_REDUCE_EXTREME_MIN}, + { 34, 0, 0, 0, 3234, 0 , MIOPEN_REDUCE_EXTREME_MIN}, //ssdlite + { 57, 0, 0, 0, 3234, 0 , MIOPEN_REDUCE_EXTREME_MIN} + }; + } + else if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX) + { + return { + { 2, 0, 0, 0, 242991, 0 , MIOPEN_REDUCE_EXTREME_MAX}, //maskrcnn + { 4, 0, 0, 0, 2004, 0 , MIOPEN_REDUCE_EXTREME_MAX}, + { 34, 0, 0, 0, 3234, 0 , MIOPEN_REDUCE_EXTREME_MAX}, //ssdlite + { 57, 0, 0, 0, 3234, 0 , MIOPEN_REDUCE_EXTREME_MAX} + }; + } + else if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_ARGMIN) + { + return { + { 16, 21, 0, 513, 513, 1 , MIOPEN_REDUCE_EXTREME_ARGMIN}, //deeplabv3m + { 24, 21, 0, 513, 513, 1 , MIOPEN_REDUCE_EXTREME_ARGMIN}, //deeplabv3r + { 64, 21, 0, 230, 333, 1 , MIOPEN_REDUCE_EXTREME_ARGMIN}, //fcn_resnet_lraspp + { 64, 21, 0, 215, 288, 1 , MIOPEN_REDUCE_EXTREME_ARGMIN}, + { 1, 21, 0, 333, 500, 1 , MIOPEN_REDUCE_EXTREME_ARGMIN}, //stdc + { 1, 21, 0, 375, 500, 1 , MIOPEN_REDUCE_EXTREME_ARGMIN}, + { 15, 21, 0, 256, 256, 1 , MIOPEN_REDUCE_EXTREME_ARGMIN}, //unet + { 22, 21, 0, 256, 256, 1 , MIOPEN_REDUCE_EXTREME_ARGMIN}, + { 21, 412, 0, 0, 500, 0 , MIOPEN_REDUCE_EXTREME_ARGMIN}, + { 21, 
333, 0, 0, 500, 0 , MIOPEN_REDUCE_EXTREME_ARGMIN} + }; + } + else if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_ARGMAX) + { + return { + { 16, 21, 0, 513, 513, 1 , MIOPEN_REDUCE_EXTREME_ARGMAX}, //deeplabv3m + { 24, 21, 0, 513, 513, 1 , MIOPEN_REDUCE_EXTREME_ARGMAX}, //deeplabv3r + { 64, 21, 0, 230, 333, 1 , MIOPEN_REDUCE_EXTREME_ARGMAX}, //fcn_resnet_lraspp + { 64, 21, 0, 215, 288, 1 , MIOPEN_REDUCE_EXTREME_ARGMAX}, + { 1, 21, 0, 333, 500, 1 , MIOPEN_REDUCE_EXTREME_ARGMAX}, //stdc + { 1, 21, 0, 375, 500, 1 , MIOPEN_REDUCE_EXTREME_ARGMAX}, + { 15, 21, 0, 256, 256, 1 , MIOPEN_REDUCE_EXTREME_ARGMAX}, //unet + { 22, 21, 0, 256, 256, 1 , MIOPEN_REDUCE_EXTREME_ARGMAX}, + { 21, 412, 0, 0, 500, 0 , MIOPEN_REDUCE_EXTREME_ARGMAX}, + { 21, 333, 0, 0, 500, 0 , MIOPEN_REDUCE_EXTREME_ARGMAX} + }; + } + return {}; + // clang-format on +} + +template +struct ReduceExtremeTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + reduceextreme_config = GetParam(); + auto gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign(1e-2, 100); }; + + dim = reduceextreme_config.dim; + reduceExtremeOp = reduceextreme_config.reduceExtremeOp; + + auto in_dims = reduceextreme_config.GetInput(); + + input = tensor{in_dims}.generate(gen_value); + + std::vector out_dims; + + for(int32_t i = 0; i < in_dims.size(); ++i) + { + if(i != dim) + { + out_dims.push_back(in_dims[i]); + } + } + + indice = tensor{out_dims}; + std::fill(indice.begin(), indice.end(), std::numeric_limits::quiet_NaN()); + + ref_indice = tensor{out_dims}; + std::fill(ref_indice.begin(), ref_indice.end(), std::numeric_limits::quiet_NaN()); + + input_dev = handle.Write(input.data); + indice_dev = handle.Write(indice.data); + + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) || + (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX)) + { + output = tensor{out_dims}; + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + ref_output = tensor{out_dims}; + std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); + + output_dev = handle.Write(output.data); + } + } + void RunTest() + { + auto&& handle = get_handle(); + + if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_ARGMIN) + { + cpu_extreme_forward( + input, ref_output, ref_indice, dim, reduceExtremeOp); + } + else if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_ARGMAX) + { + cpu_extreme_forward( + input, ref_output, ref_indice, dim, reduceExtremeOp); + } + else if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) + { + cpu_extreme_forward( + input, ref_output, ref_indice, dim, reduceExtremeOp); + } + else if(reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX) + { + cpu_extreme_forward( + input, ref_output, ref_indice, dim, reduceExtremeOp); + } + + miopenStatus_t status; + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) || + (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX)) + { + status = miopen::ReduceExtremeForward(handle, + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + indice.desc, + 
indice_dev.get(), + dim, + reduceExtremeOp); + } + else + { + status = miopen::ReduceExtremeForward(handle, + input.desc, + input_dev.get(), + indice.desc, + indice_dev.get(), + dim, + reduceExtremeOp); + } + + EXPECT_EQ(status, miopenStatusSuccess); + + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) || + (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX)) + { + output.data = handle.Read(output_dev, output.data.size()); + } + indice.data = handle.Read(indice_dev, indice.data.size()); + } + + void Verify() + { + if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) || + (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX)) + { + auto error = miopen::rms_range(ref_output, output); + + EXPECT_TRUE(miopen::range_distance(ref_output) == miopen::range_distance(output)); + EXPECT_TRUE(std::abs(static_cast(error)) == 0.0f) + << "Error output beyond tolerance Error:" << error; + } + + auto error_idx = miopen::mismatch_idx(ref_indice, indice, compare_equal); + + EXPECT_TRUE(miopen::range_distance(ref_indice) == miopen::range_distance(indice)); + EXPECT_TRUE(error_idx >= miopen::range_distance(ref_indice)) + << "Error Indice does not equal at " << error_idx << std::endl; + } + ReduceExtremeTestCase reduceextreme_config; + + tensor input; + tensor output; + tensor indice; + + tensor ref_output; + tensor ref_indice; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; + miopen::Allocator::ManageDataPtr indice_dev; + + int32_t dim; + miopenReduceExtremeOp_t reduceExtremeOp; +}; diff --git a/test/gtest/t5layernorm.cpp b/test/gtest/t5layernorm.cpp new file mode 100644 index 0000000000..38df104bea --- /dev/null +++ b/test/gtest/t5layernorm.cpp @@ -0,0 +1,173 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "t5layernorm.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace t5layernorm { + +std::string GetFloatArg() +{ + const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct T5LayerNormTestFloat : T5LayerNormTest +{ +}; + +struct T5LayerNormTestHalf : T5LayerNormTest +{ +}; + +struct T5LayerNormTestBFloat16 : T5LayerNormTest +{ +}; + +struct T5LayerNormBwdTestFloat : T5LayerNormBwdTest +{ +}; + +struct T5LayerNormBwdTestHalf : T5LayerNormBwdTest +{ +}; + +struct T5LayerNormBwdTestBFloat16 : T5LayerNormBwdTest +{ +}; + +} // namespace t5layernorm +using namespace t5layernorm; + +TEST_P(T5LayerNormTestFloat, T5LayerNormTestFw) +{ + auto TypeArg = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(T5LayerNormTestHalf, T5LayerNormTestFw) +{ + auto TypeArg = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(T5LayerNormTestBFloat16, T5LayerNormTestFw) +{ + auto TypeArg = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(T5LayerNormBwdTestFloat, T5LayerNormBwdTestFw) +{ + auto TypeArg = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(T5LayerNormBwdTestHalf, T5LayerNormBwdTestFw) +{ + auto TypeArg = 
miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(T5LayerNormBwdTestBFloat16, T5LayerNormBwdTestFw) +{ + auto TypeArg = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(T5LayerNormTestSet, + T5LayerNormTestFloat, + testing::ValuesIn(T5LayerNormTestConfigs())); +INSTANTIATE_TEST_SUITE_P(T5LayerNormTestSet, + T5LayerNormTestHalf, + testing::ValuesIn(T5LayerNormTestConfigs())); +INSTANTIATE_TEST_SUITE_P(T5LayerNormTestSet, + T5LayerNormTestBFloat16, + testing::ValuesIn(T5LayerNormTestConfigs())); +INSTANTIATE_TEST_SUITE_P(T5LayerNormTestSet, + T5LayerNormBwdTestFloat, + testing::ValuesIn(T5LayerNormTestConfigs())); +INSTANTIATE_TEST_SUITE_P(T5LayerNormTestSet, + T5LayerNormBwdTestHalf, + testing::ValuesIn(T5LayerNormTestConfigs())); +INSTANTIATE_TEST_SUITE_P(T5LayerNormTestSet, + T5LayerNormBwdTestBFloat16, + testing::ValuesIn(T5LayerNormTestConfigs())); diff --git a/test/gtest/t5layernorm.hpp b/test/gtest/t5layernorm.hpp new file mode 100644 index 0000000000..505336a130 --- /dev/null +++ b/test/gtest/t5layernorm.hpp @@ -0,0 +1,510 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "../driver/tensor_driver.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include + +template +void cpu_t5layernorm_forward(tensor x, + tensor weight, + tensor& ref_y, + tensor& ref_rstd, + float eps, + miopenNormMode_t mode) +{ + auto dims = x.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = dims[dims.size() - 1]; + + for(size_t i = 0ULL; i < dims.size() - 1; ++i) + { + outer_size *= dims[i]; + } + + par_ford(outer_size)([&](int32_t o) { + float pvar = 0; + + ford(inner_size)([&](int32_t i) { + float tmp = static_cast(x[o * inner_size + i]); + pvar += tmp * tmp; + }); + + pvar = pvar / inner_size; + float prstd = 1 / sqrt(pvar + eps); + + ref_rstd[o] = static_cast(prstd); + + ford(inner_size)([&](int32_t i) { + float pweight = mode ? static_cast(weight[i]) : 1; + ref_y[o * inner_size + i] = + static_cast(static_cast(x[o * inner_size + i]) * prstd * pweight); + }); + }); +} + +template +void cpu_t5layernorm_backward(tensor dy, + tensor x, + tensor weight, + tensor rstd, + tensor& ref_dx, + miopenNormMode_t mode) +{ + auto dims = dy.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = dims[dims.size() - 1]; + + for(size_t i = 0ULL; i < dims.size() - 1; ++i) + { + outer_size *= dims[i]; + } + + par_ford(outer_size)([&](int32_t o) { + float sum = 0; + + ford(inner_size)([&](int32_t i) { + float pweight = mode ? static_cast(weight[i]) : 1; + float pdy = (dy.GetSize() != 0) ? static_cast(dy[o * inner_size + i]) : 0; + float px = static_cast(x[o * inner_size + i]); + sum += pdy * px * pweight; + }); + + float s = 1 / static_cast(inner_size); + float prstd = static_cast(rstd[o]); + float a = sum * prstd * prstd * prstd * s; + + ford(inner_size)([&](int32_t i) { + float pweight = mode ? static_cast(weight[i]) : 1; + float pdy = (dy.GetSize() != 0) ? 
static_cast(dy[o * inner_size + i]) : 0; + + float val = prstd * pdy * pweight - a * static_cast(x[o * inner_size + i]); + ref_dx[o * inner_size + i] = static_cast(val); + }); + }); +} + +template +void cpu_t5layernorm_backward_weight( + tensor dy, tensor x, tensor rstd, tensor& ref_dw, miopenNormMode_t mode) +{ + auto dims = dy.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = dims[dims.size() - 1]; + + for(size_t i = 0ULL; i < dims.size() - 1; ++i) + { + outer_size *= dims[i]; + } + + par_ford(inner_size)([&](int32_t o) { + float sum = 0; + + ford(outer_size)([&](int32_t i) { + float prstd = static_cast(rstd[i]); + float pdy = (dy.GetSize() != 0) ? static_cast(dy[i * inner_size + o]) : 0; + float px = static_cast(x[i * inner_size + o]); + + sum += pdy * px * prstd; + }); + + ref_dw[o] = sum; + }); +} + +struct T5LayerNormTestCase +{ + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + float eps; + miopenNormMode_t ln_mode; + friend std::ostream& operator<<(std::ostream& os, const T5LayerNormTestCase& tc) + { + return os << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H + << " W:" << tc.W << " eps:" << tc.eps << " LayerNorm_mode:" << tc.ln_mode; + } + + std::vector GetInput() + { + if((N != 0) && (C != 0) && (D != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, D, H, W}); + } + else if((N != 0) && (C != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, H, W}); + } + else if((N != 0) && (C != 0) && (W != 0)) + { + return std::vector({N, C, W}); + } + else if((N != 0) && (W != 0)) + { + return std::vector({N, W}); + } + else + { + std::cout << "Error Input Tensor Lengths\n" << std::endl; + return std::vector({0}); + } + } +}; + +std::vector T5LayerNormTestConfigs() +{ // n c d h w eps ln_mode + // clang-format off + return { + { 32, 1, 32, 32, 32, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // 32x32x32 based on VoxNet arch + { 32, 1, 14, 14, 14, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 32, 32, 14, 14, 14, 
1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 32, 32, 12, 12, 12, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 32, 32, 6, 6, 6, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 256, 1, 32, 32, 32, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // 32x32x32 based on VoxNet arch + { 256, 32, 14, 14, 14, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 256, 32, 12, 12, 12, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 256, 32, 6, 6, 6, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 512, 1, 32, 32, 32, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // 32x32x32 based on VoxNet arch + { 512, 32, 14, 14, 14, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 512, 32, 12, 12, 12, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 512, 32, 6, 6, 6, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 32, 2, 32, 57, 125, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { 32, 32, 14, 25, 59, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 32, 32, 6, 10, 27, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 32, 32, 4, 6, 11, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 32, 32, 2, 2, 3, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 32, 32, 32, 28, 62, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { 32, 32, 14, 12, 29, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 32, 32, 6, 4, 12, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 32, 32, 4, 2, 2, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + { 16, 32, 6, 50, 50, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // Multi-view 3D convnet + { 1, 3, 8, 240, 320, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // 3D convet on video + { 1, 3, 16, 240, 320, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // 3D convet on video + { 1, 3, 8, 128, 171, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // 3D convet on video + { 1, 3, 16, 128, 171, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // 3D convet on video + { 1, 3, 8, 112, 112, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // 3D convet on video + { 1, 3, 16, 112, 112, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, // 3D convet on video + { 32, 1, 32, 32, 32, 1e-5, 
MIOPEN_WEIGHT_BIAS_T5}, // 32x32x32 based on VoxNet arch + { 32, 1, 14, 14, 14, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 32, 32, 14, 14, 14, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 32, 32, 12, 12, 12, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 32, 32, 6, 6, 6, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 256, 1, 32, 32, 32, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, // 32x32x32 based on VoxNet arch + { 256, 32, 14, 14, 14, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 256, 32, 12, 12, 12, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 256, 32, 6, 6, 6, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 512, 1, 32, 32, 32, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, // 32x32x32 based on VoxNet arch + { 512, 32, 14, 14, 14, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 512, 32, 12, 12, 12, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 512, 32, 6, 6, 6, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 32, 2, 32, 57, 125, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { 32, 32, 14, 25, 59, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 32, 32, 6, 10, 27, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 32, 32, 4, 6, 11, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 32, 32, 2, 2, 3, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 32, 32, 32, 28, 62, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { 32, 32, 14, 12, 29, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 32, 32, 6, 4, 12, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 32, 32, 4, 2, 2, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + { 16, 32, 6, 50, 50, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, // Multi-view 3D convnet + { 1, 3, 8, 240, 320, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, // 3D convet on video + { 1, 3, 16, 240, 320, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, // 3D convet on video + { 1, 3, 8, 128, 171, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, // 3D convet on video + { 1, 3, 16, 128, 171, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, // 3D convet on video + { 1, 3, 8, 112, 112, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, // 3D convet on video + { 1, 3, 16, 112, 112, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, // 3D convet on video + {32, 4, 0, 4, 256, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + {64, 4, 0, 4, 256, 1e-5, 
MIOPEN_ELEMENTWISE_AFFINE_T5}, + {32, 4, 0, 4, 256, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + {64, 4, 0, 4, 256, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + {32, 0, 0, 0, 256, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + {64, 0, 0, 0, 256, 1e-5, MIOPEN_ELEMENTWISE_AFFINE_T5}, + {32, 0, 0, 0, 256, 1e-5, MIOPEN_WEIGHT_BIAS_T5}, + {64, 0, 0, 0, 256, 1e-5, MIOPEN_WEIGHT_BIAS_T5} + }; + // clang-format on +} + +template +struct T5LayerNormTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + t5layernorm_config = GetParam(); + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + + eps = t5layernorm_config.eps; + ln_mode = t5layernorm_config.ln_mode; + + auto in_dim = t5layernorm_config.GetInput(); + x = tensor{in_dim}.generate(gen_value); + + std::vector inner_dim = {in_dim[in_dim.size() - 1]}; + + if(ln_mode == MIOPEN_ELEMENTWISE_AFFINE_T5) + { + auto gen_one = [&](auto...) { return 1; }; + weight = tensor{inner_dim}.generate(gen_one); + } + else + { + weight = tensor{inner_dim}.generate(gen_value); + } + + std::vector outer_dim; + + outer_dim = {in_dim.begin(), in_dim.end() - 1}; + + y = tensor{in_dim}; + rstd = tensor{outer_dim}; + std::fill(y.begin(), y.end(), std::numeric_limits::quiet_NaN()); + std::fill(rstd.begin(), rstd.end(), std::numeric_limits::quiet_NaN()); + + ref_y = tensor{in_dim}; + ref_rstd = tensor{outer_dim}; + std::fill(ref_y.begin(), ref_y.end(), std::numeric_limits::quiet_NaN()); + std::fill(ref_rstd.begin(), ref_rstd.end(), std::numeric_limits::quiet_NaN()); + + x_dev = handle.Write(x.data); + weight_dev = handle.Write(weight.data); + y_dev = handle.Write(y.data); + rstd_dev = handle.Write(rstd.data); + } + void RunTest() + { + auto&& handle = get_handle(); + + cpu_t5layernorm_forward(x, weight, ref_y, ref_rstd, eps, ln_mode); + + miopenStatus_t status; + status = miopen::T5LayerNormForward(handle, + x.desc, + x_dev.get(), + weight.desc, + weight_dev.get(), + y.desc, + 
y_dev.get(), + rstd.desc, + rstd_dev.get(), + ln_mode, + eps); + EXPECT_EQ(status, miopenStatusSuccess); + + y.data = handle.Read(y_dev, y.data.size()); + rstd.data = handle.Read(rstd_dev, rstd.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + // In the case of layernorm, there is a cumulative sum operation, and in the case of + // floating point operation, the result value can change if the order of the summed values + // is changed. So apply a threshold that is 10 times larger than other operations. + auto threshold = std::is_same::value ? 1.5e-5 : 8.2e-2; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + threshold *= 8.0; + auto error = miopen::rms_range(ref_y, y); + EXPECT_TRUE(miopen::range_distance(ref_y) == miopen::range_distance(y)); + EXPECT_TRUE(error < threshold) + << "Error y beyond tolerance Error:" << error << ", Threshold: " << threshold; + + error = miopen::rms_range(ref_rstd, rstd); + EXPECT_TRUE(miopen::range_distance(ref_rstd) == miopen::range_distance(rstd)); + EXPECT_TRUE(error < threshold * 4) << "Error rstd beyond tolerance Error:" << error + << ", Threshold x 4: " << threshold * 4; + } + T5LayerNormTestCase t5layernorm_config; + + tensor x; + tensor weight; + tensor y; + tensor rstd; + + tensor ref_y; + tensor ref_rstd; + + miopen::Allocator::ManageDataPtr x_dev; + miopen::Allocator::ManageDataPtr weight_dev; + miopen::Allocator::ManageDataPtr y_dev; + miopen::Allocator::ManageDataPtr rstd_dev; + + float eps; + miopenNormMode_t ln_mode; +}; + +template +struct T5LayerNormBwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + t5layernorm_config = GetParam(); + auto gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign(1e-2, 100); }; + + ln_mode = t5layernorm_config.ln_mode; + + auto in_dim = t5layernorm_config.GetInput(); + std::vector outer_dim = {in_dim.begin(), in_dim.end() - 1}; + + x = tensor{in_dim}.generate(gen_value); + dy = tensor{in_dim}.generate(gen_value); + rstd = tensor{outer_dim}.generate(gen_value); + + std::vector inner_dim = {in_dim[in_dim.size() - 1]}; + + if(ln_mode == MIOPEN_ELEMENTWISE_AFFINE_T5) + { + auto gen_one = [&](auto...) { return 1; }; + weight = tensor{inner_dim}.generate(gen_one); + } + else + { + weight = tensor{inner_dim}.generate(gen_value); + } + + dx = tensor{in_dim}; + dw = tensor{inner_dim}; + std::fill(dx.begin(), dx.end(), std::numeric_limits::quiet_NaN()); + std::fill(dw.begin(), dw.end(), std::numeric_limits::quiet_NaN()); + + ref_dx = tensor{in_dim}; + ref_dw = tensor{inner_dim}; + std::fill(ref_dx.begin(), ref_dx.end(), std::numeric_limits::quiet_NaN()); + std::fill(ref_dw.begin(), ref_dw.end(), std::numeric_limits::quiet_NaN()); + + std::vector workspace_dims; + printf("GetT5LayerNormBackwardWorkspaceSize\n"); + ws_sizeInBytes = miopen::GetT5LayerNormBackwardWorkspaceSize( + handle, dy.desc, x.desc, weight.desc, rstd.desc, dx.desc, dw.desc, ln_mode); + if(ws_sizeInBytes == static_cast(-1)) + GTEST_SKIP(); + + workspace_dims.push_back(ws_sizeInBytes / sizeof(T)); + if(ws_sizeInBytes != 0) + { + workspace = tensor{workspace_dims}; + std::fill(workspace.begin(), workspace.end(), std::numeric_limits::quiet_NaN()); + workspace_dev = handle.Write(workspace.data); + } + + x_dev = handle.Write(x.data); + weight_dev = handle.Write(weight.data); + rstd_dev = handle.Write(rstd.data); + dy_dev = handle.Write(dy.data); + dx_dev = handle.Write(dx.data); + dw_dev = handle.Write(dw.data); + } + void RunTest() + { + auto&& handle = get_handle(); + cpu_t5layernorm_backward(dy, x, weight, rstd, ref_dx, ln_mode); + cpu_t5layernorm_backward_weight(dy, x, rstd, ref_dw, ln_mode); + + miopenStatus_t status; + + 
status = miopen::T5LayerNormBackward(handle, + workspace_dev.get(), + ws_sizeInBytes, + dy.desc, + dy_dev.get(), + x.desc, + x_dev.get(), + weight.desc, + weight_dev.get(), + rstd.desc, + rstd_dev.get(), + dx.desc, + dx_dev.get(), + dw.desc, + dw_dev.get(), + ln_mode); + + EXPECT_EQ(status, miopenStatusSuccess); + + dx.data = handle.Read(dx_dev, dx.data.size()); + dw.data = handle.Read(dw_dev, dw.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + // In the case of layernorm, there is a cumulative sum operation, and in the case of + // floating point operation, the result value can change if the order of the summed values + // is changed. So apply a threshold that is 10 times larger than other operations. + auto threshold = std::is_same::value ? 1.5e-5 : 8.2e-2; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + threshold *= 8.0; + + auto error = miopen::rms_range(ref_dx, dx); + EXPECT_TRUE(miopen::range_distance(ref_dx) == miopen::range_distance(dx)); + EXPECT_TRUE(error < threshold) + << "Error dx beyond tolerance Error:" << error << ", Threshold: " << threshold; + error = miopen::rms_range(ref_dw, dw); + EXPECT_TRUE(miopen::range_distance(ref_dw) == miopen::range_distance(dw)); + EXPECT_TRUE(error < threshold * 2) + << "Error dw beyond tolerance Error:" << error << ", Threshold x 2: " << threshold * 2; + } + T5LayerNormTestCase t5layernorm_config; + + tensor x; + tensor weight; + tensor rstd; + tensor dy; + tensor dx; + tensor dw; + tensor workspace; + + tensor ref_dx; + tensor ref_dw; + + miopen::Allocator::ManageDataPtr x_dev; + miopen::Allocator::ManageDataPtr weight_dev; + miopen::Allocator::ManageDataPtr rstd_dev; + miopen::Allocator::ManageDataPtr dy_dev; + miopen::Allocator::ManageDataPtr dx_dev; + miopen::Allocator::ManageDataPtr dw_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + + size_t 
ws_sizeInBytes; + + miopenNormMode_t ln_mode; +}; diff --git a/test/perfdb.cpp b/test/perfdb.cpp index 6928125ba9..168a492a0c 100644 --- a/test/perfdb.cpp +++ b/test/perfdb.cpp @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -881,7 +882,7 @@ class DbMultiThreadedTest : public DbTest "Testing " << ArgsHelper::db_class::Get() << " for multithreaded write access..."); - std::mutex mutex; + std::shared_mutex mutex; std::vector threads; MIOPEN_LOG_CUSTOM(LoggingLevel::Default, "Test", "Initializing test data..."); @@ -893,12 +894,12 @@ class DbMultiThreadedTest : public DbTest const auto c = [&p]() MIOPEN_RETURNS(GetDbInstance(DbKinds::PerfDb, p, false)); { - std::unique_lock lock(mutex); + std::unique_lock lock(mutex); for(auto i = 0u; i < DBMultiThreadedTestWork::threads_count; i++) { auto thread_body = [c, &mutex, i]() { - (void)std::unique_lock(mutex); + std::shared_lock lock(mutex); DBMultiThreadedTestWork::WorkItem(i, c, "mt"); }; @@ -929,7 +930,7 @@ class DbMultiThreadedReadTest : public DbTest "Testing " << ArgsHelper::db_class::Get() << " for multithreaded read access..."); - std::mutex mutex; + std::shared_mutex mutex; std::vector threads; MIOPEN_LOG_CUSTOM(LoggingLevel::Default, "Test", "Initializing test data..."); @@ -941,12 +942,12 @@ class DbMultiThreadedReadTest : public DbTest threads.reserve(DBMultiThreadedTestWork::threads_count); { - std::unique_lock lock(mutex); + std::unique_lock lock(mutex); for(auto i = 0u; i < DBMultiThreadedTestWork::threads_count; i++) { threads.emplace_back([c, &mutex, i]() { - (void)std::unique_lock(mutex); + std::shared_lock lock(mutex); DBMultiThreadedTestWork::ReadWorkItem(i, c, "mt"); }); } @@ -1345,7 +1346,7 @@ class DbMultiFileMultiThreadedReadTest : public DbMultiFileTest MIOPEN_LOG_CUSTOM( LoggingLevel::Default, "Test", "Testing db for multifile multithreaded read access..."); - std::mutex mutex; + std::shared_mutex mutex; std::vector threads; 
MIOPEN_LOG_CUSTOM(LoggingLevel::Default, "Test", "Initializing test data..."); @@ -1361,12 +1362,12 @@ class DbMultiFileMultiThreadedReadTest : public DbMultiFileTest threads.reserve(DBMultiThreadedTestWork::threads_count); { - std::unique_lock lock(mutex); + std::unique_lock lock(mutex); for(auto i = 0u; i < DBMultiThreadedTestWork::threads_count; i++) { threads.emplace_back([c, &mutex, i]() { - (void)std::unique_lock(mutex); + std::shared_lock lock(mutex); DBMultiThreadedTestWork::ReadWorkItem(i, c, "mt"); }); } @@ -1391,7 +1392,7 @@ class DbMultiFileMultiThreadedTest : public DbMultiFileTest "Test", "Testing db for multifile multithreaded write access..."); - std::mutex mutex; + std::shared_mutex mutex; std::vector threads; MIOPEN_LOG_CUSTOM(LoggingLevel::Default, "Test", "Initializing test data..."); @@ -1406,12 +1407,12 @@ class DbMultiFileMultiThreadedTest : public DbMultiFileTest }; { - std::unique_lock lock(mutex); + std::unique_lock lock(mutex); for(auto i = 0u; i < DBMultiThreadedTestWork::threads_count; i++) { threads.emplace_back([c, &mutex, i]() { - (void)std::unique_lock(mutex); + std::shared_lock lock(mutex); DBMultiThreadedTestWork::WorkItem(i, c, "mt"); }); } diff --git a/test/sqlite_perfdb.cpp b/test/sqlite_perfdb.cpp index 7c361cbe78..c2eddb8aa7 100644 --- a/test/sqlite_perfdb.cpp +++ b/test/sqlite_perfdb.cpp @@ -751,7 +751,7 @@ class DbMultiThreadedTest : public DbTest std::cout << "Testing db for multithreaded write access..." << std::endl; ResetDb(); - std::mutex mutex; + std::shared_mutex mutex; std::vector threads; std::cout << "Initializing test data..." 
<< std::endl; @@ -763,12 +763,12 @@ class DbMultiThreadedTest : public DbTest const auto c = [&p]() { return SQLitePerfDb(DbKinds::PerfDb, p, false); }; { - std::unique_lock lock(mutex); + std::unique_lock lock(mutex); for(auto i = 0u; i < DBMultiThreadedTestWork::threads_count; i++) { threads.emplace_back([c, &mutex, i]() { - (void)std::unique_lock(mutex); + std::shared_lock lock(mutex); DBMultiThreadedTestWork::WorkItem(i, c, "mt"); }); } @@ -790,7 +790,7 @@ class DbMultiThreadedReadTest : public DbTest { std::cout << "Testing db for multithreaded read access..." << std::endl; - std::mutex mutex; + std::shared_mutex mutex; std::vector threads; std::cout << "Initializing test data..." << std::endl; @@ -802,12 +802,12 @@ class DbMultiThreadedReadTest : public DbTest threads.reserve(DBMultiThreadedTestWork::threads_count); { - std::unique_lock lock(mutex); + std::unique_lock lock(mutex); for(auto i = 0u; i < DBMultiThreadedTestWork::threads_count; i++) { threads.emplace_back([c, &mutex, i]() { - (void)std::unique_lock(mutex); + std::shared_lock lock(mutex); DBMultiThreadedTestWork::ReadWorkItem(i, c, "mt"); }); } @@ -1181,7 +1181,7 @@ class DbMultiFileMultiThreadedReadTest : public DbMultiFileTest { std::cout << "Testing db for multifile multithreaded read access..." << std::endl; - std::mutex mutex; + std::shared_mutex mutex; std::vector threads; std::cout << "Initializing test data..." 
<< std::endl; @@ -1197,12 +1197,12 @@ class DbMultiFileMultiThreadedReadTest : public DbMultiFileTest threads.reserve(DBMultiThreadedTestWork::threads_count); { - std::unique_lock lock(mutex); + std::unique_lock lock(mutex); for(auto i = 0u; i < DBMultiThreadedTestWork::threads_count; i++) { threads.emplace_back([c, &mutex, i]() { - (void)std::unique_lock(mutex); + std::shared_lock lock(mutex); DBMultiThreadedTestWork::ReadWorkItem(i, c, "mt"); }); } @@ -1224,7 +1224,7 @@ class DbMultiFileMultiThreadedTest : public DbMultiFileTest std::cout << "Testing db for multifile multithreaded write access..." << std::endl; ResetDb(); - std::mutex mutex; + std::shared_mutex mutex; std::vector threads; std::cout << "Initializing test data..." << std::endl; @@ -1239,12 +1239,12 @@ class DbMultiFileMultiThreadedTest : public DbMultiFileTest }; { - std::unique_lock lock(mutex); + std::unique_lock lock(mutex); for(auto i = 0u; i < DBMultiThreadedTestWork::threads_count; i++) { threads.emplace_back([c, &mutex, i]() { - (void)std::unique_lock(mutex); + std::shared_lock lock(mutex); DBMultiThreadedTestWork::WorkItem(i, c, "mt"); }); }