diff --git a/samples/performance/16bit_arithmetic/16bit_arithmetic.cpp b/samples/performance/16bit_arithmetic/16bit_arithmetic.cpp index 5ff7db02d..ba4154f5a 100644 --- a/samples/performance/16bit_arithmetic/16bit_arithmetic.cpp +++ b/samples/performance/16bit_arithmetic/16bit_arithmetic.cpp @@ -156,14 +156,18 @@ bool KHR16BitArithmeticSample::prepare(const vkb::ApplicationOptions &options) vkb::ShaderVariant variant; if (supports_push_constant16) { - variant.add_define("PUSH_CONSTANT_16"); + auto &module_fp16 = + device.get_resource_cache().request_shader_module(VK_SHADER_STAGE_COMPUTE_BIT, + vkb::ShaderSource{"16bit_arithmetic/compute_buffer_fp16.comp"}, variant); + compute_layout_fp16 = &device.get_resource_cache().request_pipeline_layout({&module_fp16}); + } + else + { + auto &module_fp16 = + device.get_resource_cache().request_shader_module(VK_SHADER_STAGE_COMPUTE_BIT, + vkb::ShaderSource{"16bit_arithmetic/compute_buffer_fp16_fallback.comp"}, variant); + compute_layout_fp16 = &device.get_resource_cache().request_pipeline_layout({&module_fp16}); } - - const char *shader = "16bit_arithmetic/compute_buffer_fp16.comp"; - auto &module_fp16 = - device.get_resource_cache().request_shader_module(VK_SHADER_STAGE_COMPUTE_BIT, - vkb::ShaderSource{shader}, variant); - compute_layout_fp16 = &device.get_resource_cache().request_pipeline_layout({&module_fp16}); } else { diff --git a/shaders/16bit_arithmetic/compute_buffer_fp16.comp b/shaders/16bit_arithmetic/compute_buffer_fp16.comp index 7e7aa5a6b..f92909a3f 100644 --- a/shaders/16bit_arithmetic/compute_buffer_fp16.comp +++ b/shaders/16bit_arithmetic/compute_buffer_fp16.comp @@ -1,5 +1,5 @@ #version 450 -/* Copyright (c) 2020-2021, Arm Limited and Contributors +/* Copyright (c) 2020-2024, Arm Limited and Contributors * * SPDX-License-Identifier: Apache-2.0 * @@ -24,89 +24,82 @@ layout(local_size_x = 8, local_size_y = 8) in; -layout(constant_id = 0) const uint WIDTH = 1; +layout(constant_id = 0) const uint WIDTH = 1; layout(constant_id = 1) const uint HEIGHT = 1; layout(set = 0, binding = 0) readonly buffer SSBO { - // It is possible to use native 16-bit types in SSBOs and UBOs. We could use uvec2 here and unpack manually. - // The key feature of 16-bit storage is to allow scalar access to 16-bit values however. - // Avoiding extra unpacking and packing can also be useful. - f16vec4 blob_data[]; + // It is possible to use native 16-bit types in SSBOs and UBOs. We could use uvec2 here and unpack manually. + // The key feature of 16-bit storage is to allow scalar access to 16-bit values however. + // Avoiding extra unpacking and packing can also be useful. + f16vec4 blob_data[]; }; layout(rgba16f, set = 0, binding = 1) writeonly uniform mediump image2D o_results; layout(push_constant) uniform Registers { - // Push constants can also be 16-bit. This can also be very useful since push constant space is so limited! -#ifdef PUSH_CONSTANT_16 - uint16_t num_blobs; - float16_t seed; - i16vec2 range; -#else - // Fallback for implementations which do not support PushConstant16. - uint num_blobs; - float seed; - ivec2 range; -#endif -} registers; + uint16_t num_blobs; + float16_t seed; + i16vec2 range; +} +registers; // This is very arbitrary. Expends a ton of arithmetic to compute // something that looks similar to a lens flare. f16vec4 compute_blob(f16vec2 pos, f16vec4 blob, float16_t seed) { - f16vec2 offset = pos - blob.xy; - f16vec4 rg_offset = offset.xxyy * f16vec4(0.95hf, 1.0hf, 0.95hf, 1.0hf); - f16vec4 bs_offset = offset.xxyy * f16vec4(1.05hf, 1.1hf + seed, 1.05hf, 1.1hf + seed); - - f16vec4 rg_dot = rg_offset * rg_offset; - f16vec4 bs_dot = bs_offset * bs_offset; - - // Dot products can be somewhat awkward in FP16, since the result is a scalar 16-bit value, and we don't want that. - // To that end, we compute at least two dot products side by side, and rg_offset and bs_offset are swizzled - // such that we avoid swizzling across a 32-bit boundary. - f16vec4 dots = f16vec4(rg_dot.xy + rg_dot.zw, bs_dot.xy + bs_dot.zw) * blob.w; - - // Now we have square distances to blob center. - - // Gotta have some FMAs, right? :D - dots = dots * dots + dots; - dots = dots * dots + dots; - dots = dots * dots + dots; - dots = dots * dots + dots; - dots = dots * dots + dots; - dots = dots * dots + dots; - - f16vec4 parabolas = max(f16vec4(1.0hf, 1.0hf, 1.0hf, 0.9hf) - dots, f16vec4(0.0hf)); - parabolas -= parabolas.w; - parabolas = max(parabolas, f16vec4(0.0hf)); - return parabolas; + f16vec2 offset = pos - blob.xy; + f16vec4 rg_offset = offset.xxyy * f16vec4(0.95hf, 1.0hf, 0.95hf, 1.0hf); + f16vec4 bs_offset = offset.xxyy * f16vec4(1.05hf, 1.1hf + seed, 1.05hf, 1.1hf + seed); + + f16vec4 rg_dot = rg_offset * rg_offset; + f16vec4 bs_dot = bs_offset * bs_offset; + + // Dot products can be somewhat awkward in FP16, since the result is a scalar 16-bit value, and we don't want that. + // To that end, we compute at least two dot products side by side, and rg_offset and bs_offset are swizzled + // such that we avoid swizzling across a 32-bit boundary. + f16vec4 dots = f16vec4(rg_dot.xy + rg_dot.zw, bs_dot.xy + bs_dot.zw) * blob.w; + + // Now we have square distances to blob center. + + // Gotta have some FMAs, right? :D + dots = dots * dots + dots; + dots = dots * dots + dots; + dots = dots * dots + dots; + dots = dots * dots + dots; + dots = dots * dots + dots; + dots = dots * dots + dots; + + f16vec4 parabolas = max(f16vec4(1.0hf, 1.0hf, 1.0hf, 0.9hf) - dots, f16vec4(0.0hf)); + parabolas -= parabolas.w; + parabolas = max(parabolas, f16vec4(0.0hf)); + return parabolas; } void main() { - uint num_blobs = uint(registers.num_blobs); + uint num_blobs = uint(registers.num_blobs); - float x = float(gl_GlobalInvocationID.x) / float(WIDTH) - 0.5; - float y = float(gl_GlobalInvocationID.y) / float(HEIGHT) - 0.5; - f16vec2 pos = f16vec2(x, y); - f16vec4 result = f16vec4(0.0hf); - float16_t seed = float16_t(registers.seed); - ivec2 range = ivec2(registers.range); + float x = float(gl_GlobalInvocationID.x) / float(WIDTH) - 0.5; + float y = float(gl_GlobalInvocationID.y) / float(HEIGHT) - 0.5; + f16vec2 pos = f16vec2(x, y); + f16vec4 result = f16vec4(0.0hf); + float16_t seed = float16_t(registers.seed); + ivec2 range = ivec2(registers.range); - const float16_t EXPAND_FACTOR = 0.3hf; - float16_t stride = seed * EXPAND_FACTOR; + const float16_t EXPAND_FACTOR = 0.3hf; + float16_t stride = seed * EXPAND_FACTOR; - for (uint i = 0; i < num_blobs; i++) - { - f16vec4 blob = blob_data[i]; + for (uint i = 0; i < num_blobs; i++) + { + f16vec4 blob = blob_data[i]; - // Get as much mileage out of the buffer load as possible. - for (int y = -range.y; y <= range.y; y++) - for (int x = -range.x; x <= range.x; x++) - result += compute_blob(pos + stride * f16vec2(x, y), blob, seed); - } + // Get as much mileage out of the buffer load as possible. + for (int y = -range.y; y <= range.y; y++) + for (int x = -range.x; x <= range.x; x++) + result += compute_blob(pos + stride * f16vec2(x, y), blob, seed); + } - imageStore(o_results, ivec2(gl_GlobalInvocationID.xy), result); + imageStore(o_results, ivec2(gl_GlobalInvocationID.xy), result); } diff --git a/shaders/16bit_arithmetic/compute_buffer_fp16_fallback.comp b/shaders/16bit_arithmetic/compute_buffer_fp16_fallback.comp new file mode 100644 index 000000000..876e8547a --- /dev/null +++ b/shaders/16bit_arithmetic/compute_buffer_fp16_fallback.comp @@ -0,0 +1,106 @@ +#version 450 +/* Copyright (c) 2020-2024, Arm Limited and Contributors + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Allows us to use float16_t for arithmetic purposes. +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require + +// Allows us to use int16_t, uint16_t and float16_t for buffers. +#extension GL_EXT_shader_16bit_storage : require + +layout(local_size_x = 8, local_size_y = 8) in; + +layout(constant_id = 0) const uint WIDTH = 1; +layout(constant_id = 1) const uint HEIGHT = 1; + +layout(set = 0, binding = 0) readonly buffer SSBO +{ + // It is possible to use native 16-bit types in SSBOs and UBOs. We could use uvec2 here and unpack manually. + // The key feature of 16-bit storage is to allow scalar access to 16-bit values however. + // Avoiding extra unpacking and packing can also be useful. + f16vec4 blob_data[]; +}; + +layout(rgba16f, set = 0, binding = 1) writeonly uniform mediump image2D o_results; + +layout(push_constant) uniform Registers +{ + // Fallback for implementations which do not support PushConstant16. + uint num_blobs; + float seed; + ivec2 range; +} +registers; + +// This is very arbitrary. Expends a ton of arithmetic to compute +// something that looks similar to a lens flare. +f16vec4 compute_blob(f16vec2 pos, f16vec4 blob, float16_t seed) +{ + f16vec2 offset = pos - blob.xy; + f16vec4 rg_offset = offset.xxyy * f16vec4(0.95hf, 1.0hf, 0.95hf, 1.0hf); + f16vec4 bs_offset = offset.xxyy * f16vec4(1.05hf, 1.1hf + seed, 1.05hf, 1.1hf + seed); + + f16vec4 rg_dot = rg_offset * rg_offset; + f16vec4 bs_dot = bs_offset * bs_offset; + + // Dot products can be somewhat awkward in FP16, since the result is a scalar 16-bit value, and we don't want that. + // To that end, we compute at least two dot products side by side, and rg_offset and bs_offset are swizzled + // such that we avoid swizzling across a 32-bit boundary. + f16vec4 dots = f16vec4(rg_dot.xy + rg_dot.zw, bs_dot.xy + bs_dot.zw) * blob.w; + + // Now we have square distances to blob center. + + // Gotta have some FMAs, right? :D + dots = dots * dots + dots; + dots = dots * dots + dots; + dots = dots * dots + dots; + dots = dots * dots + dots; + dots = dots * dots + dots; + dots = dots * dots + dots; + + f16vec4 parabolas = max(f16vec4(1.0hf, 1.0hf, 1.0hf, 0.9hf) - dots, f16vec4(0.0hf)); + parabolas -= parabolas.w; + parabolas = max(parabolas, f16vec4(0.0hf)); + return parabolas; +} + +void main() +{ + uint num_blobs = uint(registers.num_blobs); + + float x = float(gl_GlobalInvocationID.x) / float(WIDTH) - 0.5; + float y = float(gl_GlobalInvocationID.y) / float(HEIGHT) - 0.5; + f16vec2 pos = f16vec2(x, y); + f16vec4 result = f16vec4(0.0hf); + float16_t seed = float16_t(registers.seed); + ivec2 range = ivec2(registers.range); + + const float16_t EXPAND_FACTOR = 0.3hf; + float16_t stride = seed * EXPAND_FACTOR; + + for (uint i = 0; i < num_blobs; i++) + { + f16vec4 blob = blob_data[i]; + + // Get as much mileage out of the buffer load as possible. + for (int y = -range.y; y <= range.y; y++) + for (int x = -range.x; x <= range.x; x++) + result += compute_blob(pos + stride * f16vec2(x, y), blob, seed); + } + + imageStore(o_results, ivec2(gl_GlobalInvocationID.xy), result); +}