arb_shader_image_load_store: add additional coherency test

The existing coherency test isn't a good match for the AMD GCN execution model.
author: Nicolai Hähnle <nicolai.haehnle@amd.com> 2016-03-12 21:21:56 -0500
committer: Nicolai Hähnle <nicolai.haehnle@amd.com> 2016-04-14 16:42:13 -0500
commit: 25a8564bea9d58a58fb3cfad842a7045d4edb66a (patch)
tree: 6218ec960cb40a0ec8e9c7076b3c04e97fb37585
parent: 41567ddfb67b1a665617d239e22d2e1f5d56087d (diff)
1 files changed, 90 insertions, 0 deletions
diff --git a/tests/spec/arb_shader_image_load_store/execution/coherency-extra.shader_test b/tests/spec/arb_shader_image_load_store/execution/coherency-extra.shader_test
new file mode 100644
index 000000000..f718cd2a6
--- /dev/null
+++ b/tests/spec/arb_shader_image_load_store/execution/coherency-extra.shader_test
@@ -0,0 +1,90 @@
+# Additional coherency test that can demonstrate failures in an incorrect
+# coherency implementation for AMD GCN, unlike arb_shader_image_load_store-coherency.
+#
+# The real problem with coherency in AMD GCN is separate, non-coherent L1
+# caches, i.e. when a shader execution writes to an image in a CU that uses
+# one L1 cache, and a different shader execution reads from the image
+# in a CU with a different L1 cache.
+#
+# This test uses atomic accesses to a control texture to select the very first
+# fragment shader thread as a writer thread which keeps changing a data
+# texture in a tight loop. All other threads become reader threads which
+# report success if they see two different values of the same texture.
+#
+# This test can produce a false negative (false failure) in two cases:
+#  1) The timeout value ITERS is too low,
+#  2) There is no (or insufficient) parallelism in the implementation, and
+#     therefore the writer thread must finish before most of the reader threads
+#     get a chance to run.
+#
+
+[require]
+GL >= 3.3
+GLSL >= 3.30
+GL_ARB_shader_image_load_store
+SIZE 256 256
+
+[vertex shader passthrough]
+
+[fragment shader]
+#version 330
+#extension GL_ARB_shader_image_load_store: enable
+
+// Change this to 0 to get a control test that should fail on hardware
+// without coherent L1 caches.
+//
+// Need volatile instead of just coherent to prevent overly smart compilers
+// from moving the imageLoad/imageStore out of the loop.
+#if 1
+volatile
+#endif
+layout(r32i) uniform iimage2D tex; // data image: texel (0, 0) is stored by the writer and polled by the readers in main()
+volatile layout(r32i) uniform iimage2D ctrl; // control image: texel (0, 0) hands out ids, texel (0, 1) counts check-ins
+out vec4 outcolor; // main() sets this to green when the invocation finishes, red when it times out
+
+// Add a timeout so that an incorrect coherency implementation doesn't hang
+// the GPU. If this timeout is too low, you can get false negative results
+// because the writer thread quits before all reader threads have
+// executed.
+#define ITERS 100000
+
+void main()
+{
+	int id = imageAtomicAdd(ctrl, ivec2(0, 0), 1); // ticket from the control image; the invocation holding id 0 acts as the writer below
+	int orig = imageLoad(tex, ivec2(0, 0)).x; // baseline value that reader invocations compare later loads against
+	bool done = false;
+
+	outcolor = vec4(0.0, 0.0, 0.0, 1.0); // start from opaque black; exactly one colour channel is raised after the loop
+
+	for (int iter = 0; iter < ITERS && !done; ++iter) { // ITERS bounds the spin so a non-coherent implementation cannot hang
+		if (id == 0) {
+			imageStore(tex, ivec2(0, 0), ivec4(iter)); // writer: store a fresh value on every iteration
+			if (imageLoad(ctrl, ivec2(0, 1)).x >= 256 * 256) // 256 * 256 matches SIZE 256 256: one check-in per fragment
+				done = true;
+		} else {
+			int current = imageLoad(tex, ivec2(0, 0)).x;
+			if (current != orig) // reader: a differing value means a store from the writer became visible
+				done = true;
+		}
+
+		if (done || (id == 0 && iter == 0)) // readers check in once when done; the writer checks in on its first pass and again when done
+			imageAtomicAdd(ctrl, ivec2(0, 1), 1);
+	}
+
+	if (done)
+		outcolor.y = 1.0; // green: finished before the ITERS timeout
+	else
+		outcolor.x = 1.0; // red: loop ran all ITERS iterations without finishing
+}
+
+[test]
+texture integer 0 (1, 2) (0, 0) GL_R32I
+image texture 0 GL_R32I
+texture integer 1 (1, 1) (0, 0) GL_R32I
+image texture 1 GL_R32I
+
+uniform int ctrl 0
+uniform int tex 1
+draw rect -1 -1 2 2
+
+probe all rgba 0.0 1.0 0.0 1.0
author	Nicolai Hähnle <nicolai.haehnle@amd.com>	2016-03-12 21:21:56 -0500
committer	Nicolai Hähnle <nicolai.haehnle@amd.com>	2016-04-14 16:42:13 -0500
commit	25a8564bea9d58a58fb3cfad842a7045d4edb66a (patch)
tree	6218ec960cb40a0ec8e9c7076b3c04e97fb37585
parent	41567ddfb67b1a665617d239e22d2e1f5d56087d (diff)