From 0daa41cfbd3179326a4d91512b724ce643828f1b Mon Sep 17 00:00:00 2001
From: Tom Atkinson
Date: Tue, 19 Nov 2024 21:36:36 +0000
Subject: [PATCH] Remove shader variants

Instead of compiling compute_buffer_fp16.comp with an optional
PUSH_CONSTANT_16 define, provide a standalone fallback shader and
request whichever one matches the device's support for 16-bit push
constants.

---
 .../16bit_arithmetic/16bit_arithmetic.cpp     |  18 +--
 .../16bit_arithmetic/compute_buffer_fp16.comp | 117 ++++++++----------
 .../compute_buffer_fp16_fallback.comp         | 106 ++++++++++++++++
 3 files changed, 172 insertions(+), 69 deletions(-)
 create mode 100644 shaders/16bit_arithmetic/compute_buffer_fp16_fallback.comp

diff --git a/samples/performance/16bit_arithmetic/16bit_arithmetic.cpp b/samples/performance/16bit_arithmetic/16bit_arithmetic.cpp
index 5ff7db02d..ba4154f5a 100644
--- a/samples/performance/16bit_arithmetic/16bit_arithmetic.cpp
+++ b/samples/performance/16bit_arithmetic/16bit_arithmetic.cpp
@@ -156,14 +156,18 @@ bool KHR16BitArithmeticSample::prepare(const vkb::ApplicationOptions &options)
 	vkb::ShaderVariant variant;
 	if (supports_push_constant16)
 	{
-		variant.add_define("PUSH_CONSTANT_16");
+		auto &module_fp16 =
+		    device.get_resource_cache().request_shader_module(VK_SHADER_STAGE_COMPUTE_BIT,
+		                                                      vkb::ShaderSource{"16bit_arithmetic/compute_buffer_fp16.comp"}, variant);
+		compute_layout_fp16 = &device.get_resource_cache().request_pipeline_layout({&module_fp16});
+	}
+	else
+	{
+		auto &module_fp16 =
+		    device.get_resource_cache().request_shader_module(VK_SHADER_STAGE_COMPUTE_BIT,
+		                                                      vkb::ShaderSource{"16bit_arithmetic/compute_buffer_fp16_fallback.comp"}, variant);
+		compute_layout_fp16 = &device.get_resource_cache().request_pipeline_layout({&module_fp16});
 	}
-
-	const char *shader = "16bit_arithmetic/compute_buffer_fp16.comp";
-	auto &module_fp16 =
-	    device.get_resource_cache().request_shader_module(VK_SHADER_STAGE_COMPUTE_BIT,
-	                                                      vkb::ShaderSource{shader}, variant);
-	compute_layout_fp16 = &device.get_resource_cache().request_pipeline_layout({&module_fp16});
 }
 else
 {
diff --git a/shaders/16bit_arithmetic/compute_buffer_fp16.comp b/shaders/16bit_arithmetic/compute_buffer_fp16.comp
index 7e7aa5a6b..f92909a3f 100644
--- a/shaders/16bit_arithmetic/compute_buffer_fp16.comp
+++ b/shaders/16bit_arithmetic/compute_buffer_fp16.comp
@@ -1,5 +1,5 @@
 #version 450
-/* Copyright (c) 2020-2021, Arm Limited and Contributors
+/* Copyright (c) 2020-2024, Arm Limited and Contributors
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,89 +24,82 @@
 
 layout(local_size_x = 8, local_size_y = 8) in;
 
-layout(constant_id = 0) const uint WIDTH = 1;
+layout(constant_id = 0) const uint WIDTH  = 1;
 layout(constant_id = 1) const uint HEIGHT = 1;
 
 layout(set = 0, binding = 0) readonly buffer SSBO
 {
-	// It is possible to use native 16-bit types in SSBOs and UBOs. We could use uvec2 here and unpack manually.
-	// The key feature of 16-bit storage is to allow scalar access to 16-bit values however.
-	// Avoiding extra unpacking and packing can also be useful.
-	f16vec4 blob_data[];
+    // It is possible to use native 16-bit types in SSBOs and UBOs. We could use uvec2 here and unpack manually.
+    // The key feature of 16-bit storage is to allow scalar access to 16-bit values however.
+    // Avoiding extra unpacking and packing can also be useful.
+    f16vec4 blob_data[];
 };
 
 layout(rgba16f, set = 0, binding = 1) writeonly uniform mediump image2D o_results;
 
 layout(push_constant) uniform Registers
 {
-	// Push constants can also be 16-bit. This can also be very useful since push constant space is so limited!
-#ifdef PUSH_CONSTANT_16
-	uint16_t num_blobs;
-	float16_t seed;
-	i16vec2 range;
-#else
-	// Fallback for implementations which do not support PushConstant16.
-	uint num_blobs;
-	float seed;
-	ivec2 range;
-#endif
-} registers;
+    uint16_t num_blobs;
+    float16_t seed;
+    i16vec2 range;
+}
+registers;
 
 // This is very arbitrary. Expends a ton of arithmetic to compute
 // something that looks similar to a lens flare.
 f16vec4 compute_blob(f16vec2 pos, f16vec4 blob, float16_t seed)
 {
-	f16vec2 offset = pos - blob.xy;
-	f16vec4 rg_offset = offset.xxyy * f16vec4(0.95hf, 1.0hf, 0.95hf, 1.0hf);
-	f16vec4 bs_offset = offset.xxyy * f16vec4(1.05hf, 1.1hf + seed, 1.05hf, 1.1hf + seed);
-
-	f16vec4 rg_dot = rg_offset * rg_offset;
-	f16vec4 bs_dot = bs_offset * bs_offset;
-
-	// Dot products can be somewhat awkward in FP16, since the result is a scalar 16-bit value, and we don't want that.
-	// To that end, we compute at least two dot products side by side, and rg_offset and bs_offset are swizzled
-	// such that we avoid swizzling across a 32-bit boundary.
-	f16vec4 dots = f16vec4(rg_dot.xy + rg_dot.zw, bs_dot.xy + bs_dot.zw) * blob.w;
-
-	// Now we have square distances to blob center.
-
-	// Gotta have some FMAs, right? :D
-	dots = dots * dots + dots;
-	dots = dots * dots + dots;
-	dots = dots * dots + dots;
-	dots = dots * dots + dots;
-	dots = dots * dots + dots;
-	dots = dots * dots + dots;
-
-	f16vec4 parabolas = max(f16vec4(1.0hf, 1.0hf, 1.0hf, 0.9hf) - dots, f16vec4(0.0hf));
-	parabolas -= parabolas.w;
-	parabolas = max(parabolas, f16vec4(0.0hf));
-	return parabolas;
+    f16vec2 offset = pos - blob.xy;
+    f16vec4 rg_offset = offset.xxyy * f16vec4(0.95hf, 1.0hf, 0.95hf, 1.0hf);
+    f16vec4 bs_offset = offset.xxyy * f16vec4(1.05hf, 1.1hf + seed, 1.05hf, 1.1hf + seed);
+
+    f16vec4 rg_dot = rg_offset * rg_offset;
+    f16vec4 bs_dot = bs_offset * bs_offset;
+
+    // Dot products can be somewhat awkward in FP16, since the result is a scalar 16-bit value, and we don't want that.
+    // To that end, we compute at least two dot products side by side, and rg_offset and bs_offset are swizzled
+    // such that we avoid swizzling across a 32-bit boundary.
+    f16vec4 dots = f16vec4(rg_dot.xy + rg_dot.zw, bs_dot.xy + bs_dot.zw) * blob.w;
+
+    // Now we have square distances to blob center.
+
+    // Gotta have some FMAs, right? :D
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+
+    f16vec4 parabolas = max(f16vec4(1.0hf, 1.0hf, 1.0hf, 0.9hf) - dots, f16vec4(0.0hf));
+    parabolas -= parabolas.w;
+    parabolas = max(parabolas, f16vec4(0.0hf));
+    return parabolas;
 }
 
 void main()
 {
-	uint num_blobs = uint(registers.num_blobs);
+    uint num_blobs = uint(registers.num_blobs);
 
-	float x = float(gl_GlobalInvocationID.x) / float(WIDTH) - 0.5;
-	float y = float(gl_GlobalInvocationID.y) / float(HEIGHT) - 0.5;
-	f16vec2 pos = f16vec2(x, y);
-	f16vec4 result = f16vec4(0.0hf);
-	float16_t seed = float16_t(registers.seed);
-	ivec2 range = ivec2(registers.range);
+    float x = float(gl_GlobalInvocationID.x) / float(WIDTH) - 0.5;
+    float y = float(gl_GlobalInvocationID.y) / float(HEIGHT) - 0.5;
+    f16vec2 pos = f16vec2(x, y);
+    f16vec4 result = f16vec4(0.0hf);
+    float16_t seed = float16_t(registers.seed);
+    ivec2 range = ivec2(registers.range);
 
-	const float16_t EXPAND_FACTOR = 0.3hf;
-	float16_t stride = seed * EXPAND_FACTOR;
+    const float16_t EXPAND_FACTOR = 0.3hf;
+    float16_t stride = seed * EXPAND_FACTOR;
 
-	for (uint i = 0; i < num_blobs; i++)
-	{
-		f16vec4 blob = blob_data[i];
+    for (uint i = 0; i < num_blobs; i++)
+    {
+        f16vec4 blob = blob_data[i];
 
-		// Get as much mileage out of the buffer load as possible.
-		for (int y = -range.y; y <= range.y; y++)
-			for (int x = -range.x; x <= range.x; x++)
-				result += compute_blob(pos + stride * f16vec2(x, y), blob, seed);
-	}
+        // Get as much mileage out of the buffer load as possible.
+        for (int y = -range.y; y <= range.y; y++)
+            for (int x = -range.x; x <= range.x; x++)
+                result += compute_blob(pos + stride * f16vec2(x, y), blob, seed);
+    }
 
-	imageStore(o_results, ivec2(gl_GlobalInvocationID.xy), result);
+    imageStore(o_results, ivec2(gl_GlobalInvocationID.xy), result);
 }
diff --git a/shaders/16bit_arithmetic/compute_buffer_fp16_fallback.comp b/shaders/16bit_arithmetic/compute_buffer_fp16_fallback.comp
new file mode 100644
index 000000000..876e8547a
--- /dev/null
+++ b/shaders/16bit_arithmetic/compute_buffer_fp16_fallback.comp
@@ -0,0 +1,106 @@
+#version 450
+/* Copyright (c) 2020-2024, Arm Limited and Contributors
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Allows us to use float16_t for arithmetic purposes.
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+
+// Allows us to use int16_t, uint16_t and float16_t for buffers.
+#extension GL_EXT_shader_16bit_storage : require
+
+layout(local_size_x = 8, local_size_y = 8) in;
+
+layout(constant_id = 0) const uint WIDTH  = 1;
+layout(constant_id = 1) const uint HEIGHT = 1;
+
+layout(set = 0, binding = 0) readonly buffer SSBO
+{
+    // It is possible to use native 16-bit types in SSBOs and UBOs. We could use uvec2 here and unpack manually.
+    // The key feature of 16-bit storage is to allow scalar access to 16-bit values however.
+    // Avoiding extra unpacking and packing can also be useful.
+    f16vec4 blob_data[];
+};
+
+layout(rgba16f, set = 0, binding = 1) writeonly uniform mediump image2D o_results;
+
+layout(push_constant) uniform Registers
+{
+    // Fallback for implementations which do not support PushConstant16.
+    uint num_blobs;
+    float seed;
+    ivec2 range;
+}
+registers;
+
+// This is very arbitrary. Expends a ton of arithmetic to compute
+// something that looks similar to a lens flare.
+f16vec4 compute_blob(f16vec2 pos, f16vec4 blob, float16_t seed)
+{
+    f16vec2 offset = pos - blob.xy;
+    f16vec4 rg_offset = offset.xxyy * f16vec4(0.95hf, 1.0hf, 0.95hf, 1.0hf);
+    f16vec4 bs_offset = offset.xxyy * f16vec4(1.05hf, 1.1hf + seed, 1.05hf, 1.1hf + seed);
+
+    f16vec4 rg_dot = rg_offset * rg_offset;
+    f16vec4 bs_dot = bs_offset * bs_offset;
+
+    // Dot products can be somewhat awkward in FP16, since the result is a scalar 16-bit value, and we don't want that.
+    // To that end, we compute at least two dot products side by side, and rg_offset and bs_offset are swizzled
+    // such that we avoid swizzling across a 32-bit boundary.
+    f16vec4 dots = f16vec4(rg_dot.xy + rg_dot.zw, bs_dot.xy + bs_dot.zw) * blob.w;
+
+    // Now we have square distances to blob center.
+
+    // Gotta have some FMAs, right? :D
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+    dots = dots * dots + dots;
+
+    f16vec4 parabolas = max(f16vec4(1.0hf, 1.0hf, 1.0hf, 0.9hf) - dots, f16vec4(0.0hf));
+    parabolas -= parabolas.w;
+    parabolas = max(parabolas, f16vec4(0.0hf));
+    return parabolas;
+}
+
+void main()
+{
+    uint num_blobs = uint(registers.num_blobs);
+
+    float x = float(gl_GlobalInvocationID.x) / float(WIDTH) - 0.5;
+    float y = float(gl_GlobalInvocationID.y) / float(HEIGHT) - 0.5;
+    f16vec2 pos = f16vec2(x, y);
+    f16vec4 result = f16vec4(0.0hf);
+    float16_t seed = float16_t(registers.seed);
+    ivec2 range = ivec2(registers.range);
+
+    const float16_t EXPAND_FACTOR = 0.3hf;
+    float16_t stride = seed * EXPAND_FACTOR;
+
+    for (uint i = 0; i < num_blobs; i++)
+    {
+        f16vec4 blob = blob_data[i];
+
+        // Get as much mileage out of the buffer load as possible.
+        for (int y = -range.y; y <= range.y; y++)
+            for (int x = -range.x; x <= range.x; x++)
+                result += compute_blob(pos + stride * f16vec2(x, y), blob, seed);
+    }
+
+    imageStore(o_results, ivec2(gl_GlobalInvocationID.xy), result);
+}
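
Note (not part of the patch): the supports_push_constant16 flag that the
sample branches on corresponds to the storagePushConstant16 feature bit of
VkPhysicalDevice16BitStorageFeatures (VK_KHR_16bit_storage, promoted to core
in Vulkan 1.1). A minimal sketch of how such a flag can be queried follows;
the variable names are illustrative and are not the sample's actual code.

    // Illustrative sketch: query 16-bit storage features from a physical
    // device. Assumes physical_device is a valid VkPhysicalDevice and the
    // instance supports Vulkan 1.1 (or VK_KHR_get_physical_device_properties2).
    VkPhysicalDevice16BitStorageFeatures storage_16bit{};
    storage_16bit.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES;

    VkPhysicalDeviceFeatures2 features2{};
    features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
    features2.pNext = &storage_16bit;

    // Fills in both the core feature struct and the chained 16-bit storage struct.
    vkGetPhysicalDeviceFeatures2(physical_device, &features2);

    // True only when 16-bit types may be used in push constant blocks.
    bool supports_push_constant16 = storage_16bit.storagePushConstant16 == VK_TRUE;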