summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Arsenault <arsenm2@gmail.com>2017-09-18 18:53:24 -0700
committerJan Vesely <jan.vesely@rutgers.edu>2017-09-20 12:59:10 -0400
commite408ce1f2bff23121670a8206258c80bb3d9befd (patch)
tree0772aafed8f269fa335685fa9d9ab41acf999cd6
parent7f1f0f40dca636301d740ab08b1de3df6c6206a5 (diff)
cl: Add tests for function calls
Passes on ROCm, I haven't tried clover recently. Last time I did it errored because the AsmParser wasn't properly initialized. v2: Fix non-unique test names, Wrap noinline in unguarded macro, use prettier test names, use device_regex (effectively restricting to ROCm) Reviewed-by: Jan Vesely <jan.vesely@rutgers.edu>
-rw-r--r--tests/cl/program/execute/call-clobbers-amdgcn.cl67
-rw-r--r--tests/cl/program/execute/calls-struct.cl179
-rw-r--r--tests/cl/program/execute/calls-workitem-id.cl77
-rw-r--r--tests/cl/program/execute/calls.cl607
-rw-r--r--tests/cl/program/execute/tail-calls.cl305
5 files changed, 1235 insertions, 0 deletions
diff --git a/tests/cl/program/execute/call-clobbers-amdgcn.cl b/tests/cl/program/execute/call-clobbers-amdgcn.cl
new file mode 100644
index 000000000..18e657ce3
--- /dev/null
+++ b/tests/cl/program/execute/call-clobbers-amdgcn.cl
@@ -0,0 +1,67 @@
+/*!
+
+[config]
+name: amdgcn call clobbers
+clc_version_min: 10
+device_regex: gfx[\d]*
+
+[test]
+name: callee saved sgpr
+kernel_name: call_clobber_s40
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 0xabcd1234
+
+[test]
+name: callee saved vgpr
+kernel_name: call_clobber_v40
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 0xabcd1234
+
+!*/
+
+#ifndef __AMDGCN__
+#error This test is only for amdgcn
+#endif
+
+__attribute__((noinline))
+void clobber_s40()
+{
+ __asm volatile("s_mov_b32 s40, 0xdead" : : : "s40");
+}
+
+kernel void call_clobber_s40(__global int* ret)
+{
+ __asm volatile("s_mov_b32 s40, 0xabcd1234" : : : "s40");
+
+ clobber_s40();
+
+ int tmp;
+
+ __asm volatile("v_mov_b32 %0, s40"
+ : "=v"(tmp)
+ :
+ : "s40");
+ *ret = tmp;
+}
+
+__attribute__((noinline))
+void clobber_v40()
+{
+ __asm volatile("v_mov_b32 v40, 0xdead" : : : "v40");
+}
+
+kernel void call_clobber_v40(__global int* ret)
+{
+ __asm volatile("v_mov_b32 v40, 0xabcd1234" : : : "v40");
+
+ clobber_v40();
+
+ int tmp;
+ __asm volatile("v_mov_b32 %0, v40"
+ : "=v"(tmp)
+ :
+ : "v40");
+ *ret = tmp;
+}
diff --git a/tests/cl/program/execute/calls-struct.cl b/tests/cl/program/execute/calls-struct.cl
new file mode 100644
index 000000000..04f769dac
--- /dev/null
+++ b/tests/cl/program/execute/calls-struct.cl
@@ -0,0 +1,179 @@
+/*!
+
+[config]
+name: calls with structs
+clc_version_min: 10
+
+[test]
+name: byval struct
+kernel_name: call_i32_func_byval_Char_IntArray
+dimensions: 1
+global_size: 16 0 0
+
+arg_out: 0 buffer int[16] \
+ 1021 1022 1023 1024 1025 1026 1027 1028 \
+ 1029 1030 1031 1032 1033 1034 1035 1036
+
+arg_out: 1 buffer int[16] \
+ 14 14 14 14 \
+ 14 14 14 14 \
+ 14 14 14 14 \
+ 14 14 14 14 \
+
+arg_in: 2 buffer int[16] \
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+
+
+[test]
+name: sret struct
+kernel_name: call_sret_Char_IntArray_func
+dimensions: 1
+global_size: 16 0 0
+
+arg_out: 0 buffer int[16] \
+ 921 922 923 924 925 926 927 928 \
+ 929 930 931 932 933 934 935 936
+
+arg_in: 1 buffer int[16] \
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+
+
+[test]
+name: byval struct and sret struct
+kernel_name: call_sret_Char_IntArray_func_byval_Char_IntArray
+dimensions: 1
+global_size: 16 0 0
+
+arg_out: 0 buffer int[16] \
+ 86 87 88 89 \
+ 90 91 92 93 \
+ 94 95 96 97 \
+ 98 99 100 101
+
+arg_out: 1 buffer int[16] \
+ 134 135 136 137 \
+ 138 139 140 141 \
+ 142 143 144 145 \
+ 146 147 148 149
+
+arg_in: 2 buffer int[16] \
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+
+!*/
+
+#define NOINLINE __attribute__((noinline))
+
+typedef struct ByVal_Char_IntArray {
+ char c;
+ int i[4];
+} ByVal_Char_IntArray;
+
+NOINLINE
+int i32_func_byval_Char_IntArray(ByVal_Char_IntArray st)
+{
+ st.i[0] += 100;
+
+ int sum = 0;
+ for (int i = 0; i < 4; ++i)
+ {
+ sum += st.i[i];
+ }
+
+ sum += st.c;
+ return sum;
+}
+
+kernel void call_i32_func_byval_Char_IntArray(global int* out0,
+ global int* out1,
+ global int* input)
+{
+ ByVal_Char_IntArray st;
+ st.c = 15;
+
+ int id = get_global_id(0);
+
+ int val = input[id];
+ st.i[0] = 14;
+ st.i[1] = -8;
+ st.i[2] = val;
+ st.i[3] = 900;
+
+ int result = i32_func_byval_Char_IntArray(st);
+ out0[id] = result;
+ out1[id] = st.i[0];
+}
+
+NOINLINE
+ByVal_Char_IntArray sret_Char_IntArray_func(global int* input, int id)
+{
+ ByVal_Char_IntArray st;
+ st.c = 15;
+
+ int val = input[id];
+ st.i[0] = 14;
+ st.i[1] = -8;
+ st.i[2] = val;
+ st.i[3] = 900;
+
+ return st;
+}
+
+kernel void call_sret_Char_IntArray_func(global int* output, global int* input)
+{
+ int id = get_global_id(0);
+ ByVal_Char_IntArray st = sret_Char_IntArray_func(input, id);
+
+ int sum = 0;
+ for (int i = 0; i < 4; ++i)
+ {
+ sum += st.i[i];
+ }
+
+ sum += st.c;
+ output[id] = sum;
+}
+
+NOINLINE
+ByVal_Char_IntArray sret_Char_IntArray_func_byval_Char_IntArray(ByVal_Char_IntArray st)
+{
+ st.c += 15;
+
+ st.i[0] += 14;
+ st.i[1] -= 8;
+ st.i[2] += 9;
+ st.i[3] += 18;
+
+ return st;
+}
+
+kernel void call_sret_Char_IntArray_func_byval_Char_IntArray(global int* output0,
+ global int* output1,
+ global int* input)
+{
+ int id = get_global_id(0);
+
+ volatile ByVal_Char_IntArray st0;
+ st0.c = -20;
+
+ int val = input[id];
+ st0.i[0] = 14;
+ st0.i[1] = -8;
+ st0.i[2] = val;
+ st0.i[3] = 100;
+
+ ByVal_Char_IntArray st1 = sret_Char_IntArray_func_byval_Char_IntArray(st0);
+
+ int sum0 = 0;
+ int sum1 = 0;
+ for (int i = 0; i < 4; ++i)
+ {
+ sum0 += st0.i[i];
+ sum1 += st1.i[i];
+ }
+
+ sum0 += st0.c;
+ sum1 += st1.c;
+
+ output0[id] = sum0;
+ output1[id] = sum1;
+}
diff --git a/tests/cl/program/execute/calls-workitem-id.cl b/tests/cl/program/execute/calls-workitem-id.cl
new file mode 100644
index 000000000..7edfad7e9
--- /dev/null
+++ b/tests/cl/program/execute/calls-workitem-id.cl
@@ -0,0 +1,77 @@
+/*!
+
+[config]
+name: calls workitem IDs
+clc_version_min: 10
+
+[test]
+name: Callee function use get_global_id(0)
+kernel_name: kernel_call_pass_get_global_id_0
+dimensions: 1
+global_size: 64 0 0
+arg_out: 0 buffer uint[64] \
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 \
+ 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 \
+ 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 \
+ 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
+
+[test]
+name: Callee function use get_global_id 0..2
+kernel_name: kernel_call_pass_get_global_id_012
+dimensions: 3
+global_size: 8 4 2
+arg_out: 0 buffer uint[64] \
+ 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 \
+ 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 \
+ 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 \
+ 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
+
+arg_out: 1 buffer uint[64] \
+ 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 \
+ 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 \
+ 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 \
+ 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
+
+arg_out: 2 buffer uint[64] \
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+
+!*/
+
+#define NOINLINE __attribute__((noinline))
+
+NOINLINE
+void func_get_global_id_0(volatile global uint* out)
+{
+ uint gid = get_global_id(0);
+ out[gid] = gid;
+}
+
+kernel void kernel_call_pass_get_global_id_0(global uint *out)
+{
+ func_get_global_id_0(out);
+}
+
+NOINLINE
+void func_get_global_id_012(volatile global uint* out0,
+ volatile global uint* out1,
+ volatile global uint* out2)
+{
+ uint id0 = get_global_id(0);
+ uint id1 = get_global_id(1);
+ uint id2 = get_global_id(2);
+ uint flat_id = (id2 * get_global_size(1) + id1) * get_global_size(0) + id0;
+
+ out0[flat_id] = id0;
+ out1[flat_id] = id1;
+ out2[flat_id] = id2;
+}
+
+kernel void kernel_call_pass_get_global_id_012(global uint *out0,
+ global uint *out1,
+ global uint *out2)
+{
+ func_get_global_id_012(out0, out1, out2);
+}
diff --git a/tests/cl/program/execute/calls.cl b/tests/cl/program/execute/calls.cl
new file mode 100644
index 000000000..f4f55be31
--- /dev/null
+++ b/tests/cl/program/execute/calls.cl
@@ -0,0 +1,607 @@
+/*!
+
+[config]
+name: calls
+clc_version_min: 10
+
+[test]
+name: Call void_func_void
+kernel_name: call_void_func_void
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 12345
+
+[test]
+name: Call i32_func_void
+kernel_name: call_i32_func_void
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 0x12345
+
+[test]
+name: Call i64_func_void
+kernel_name: call_i64_func_void
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer long[1] 0x100000000000
+
+
+[test]
+name: Call call_i32_func_void_callee_stack
+kernel_name: call_i32_func_void_callee_stack
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 290
+
+[test]
+name: Call call_i32_func_p0i32_i32_caller_stack
+kernel_name: call_i32_func_p0i32_i32_caller_stack
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 175
+
+[test]
+name: Call call_i32_func_p0i32_i32_indirect_kernel_stack
+kernel_name: call_i32_func_p0i32_i32_indirect_kernel_stack
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 241
+
+[test]
+name: Call call_i32_func_p0i32_i32_indirect_function_stack
+kernel_name: call_i32_func_p0i32_i32_indirect_function_stack
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 291
+
+[test]
+name: callee stack corruption
+kernel_name: kernel_call_nested_stack_usage
+dimensions: 1
+global_size: 10 0 0
+
+arg_out: 0 buffer int4[10] \
+ 53 48 156 160 \
+ 84 248 102 150 \
+ 102 56 217 106 \
+ 100 123 151 139 \
+ 80 150 135 163 \
+ 223 99 117 199 \
+ 187 262 223 169 \
+ 277 129 73 121 \
+ 162 165 138 137 \
+ 204 207 223 145 \
+
+
+arg_in: 1 buffer int4[10] \
+ 0 13 76 46 \
+ 4 74 33 63 \
+ 26 9 95 7 \
+ 41 54 47 29 \
+ 15 68 38 39 \
+ 91 43 14 95 \
+ 44 83 69 70 \
+ 89 54 14 45 \
+ 77 63 21 21 \
+ 64 70 80 70
+
+arg_in: 2 buffer int4[10] \
+ 53 22 4 68 \
+ 76 100 36 24 \
+ 50 38 27 92 \
+ 18 15 57 81 \
+ 50 14 59 85 \
+ 41 13 89 9 \
+ 99 96 85 29 \
+ 99 21 45 31 \
+ 8 39 96 95 \
+ 76 67 63 5
+
+[test]
+name: nested calls
+kernel_name: kernel_nested_calls
+dimensions: 1
+global_size: 4 0 0
+
+arg_out: 0 buffer int[4] \
+ 1 7 155 -4
+
+arg_in: 1 buffer int[4] \
+ 0 100 1234 -912
+
+arg_in: 2 buffer int[4] \
+ 1 4 2 45
+
+
+[test]
+name: Kernel call stack argument
+kernel_name: kernel_call_stack_arg
+dimensions: 1
+global_size: 10 0 0
+
+
+arg_out: 0 buffer int4[10] \
+ 11440 1348 29304 16698 \
+ 47975 3626 30850 13224 \
+ 8235 30495 31995 1455 \
+ 16048 40512 33992 7028 \
+ 9450 5356 21330 23130 \
+ 21120 35186 52896 49968 \
+ 34083 28520 0 0 \
+ 12384 41492 4420 17880 \
+ 37310 19320 37518 13175 \
+ 23852 16014 22734 24284 \
+
+
+arg_in: 1 buffer int4[10] \
+ 0 13 76 46 \
+ 63 76 100 36 \
+ 27 92 53 46 \
+ 53 50 96 75 \
+ 99 41 14 57 \
+ 35 45 81 94 \
+ 80 71 74 1 \
+ 78 73 32 42 \
+ 60 17 83 15 \
+ 13 53 31 59
+
+arg_in: 2 buffer int4[10] \
+ 53 22 4 68 \
+ 24 99 72 76 \
+ 95 5 76 77 \
+ 56 89 63 85 \
+ 25 49 46 97 \
+ 65 21 68 91 \
+ 89 53 46 6 \
+ 68 68 20 84 \
+ 99 25 23 10 \
+ 52 43 26 37
+
+arg_in: 3 buffer int4[10] \
+ 68 94 38 52 \
+ 65 7 63 89 \
+ 83 12 1 69 \
+ 16 21 72 13 \
+ 12 20 32 63 \
+ 25 86 47 51 \
+ 72 49 67 68 \
+ 71 83 9 8 \
+ 22 64 70 80 \
+ 39 45 48 39
+
+arg_in: 4 buffer int4[10] \
+ 83 3 5 53 \
+ 27 44 77 48 \
+ 87 63 74 73 \
+ 9 27 0 41 \
+ 12 65 62 81 \
+ 60 82 76 46 \
+ 20 92 87 89 \
+ 77 63 21 21 \
+ 70 76 67 63 \
+ 28 7 37 25
+
+arg_in: 5 buffer int4[10] \
+ 67 0 38 6 \
+ 24 27 36 16 \
+ 100 89 23 30 \
+ 2 71 94 24 \
+ 25 48 39 20 \
+ 96 63 44 83 \
+ 54 14 45 99 \
+ 8 39 96 95 \
+ 5 60 22 32 \
+ 67 68 51 73
+
+arg_in: 6 buffer int4[10] \
+ 42 69 59 93 \
+ 49 90 91 6 \
+ 35 51 59 85 \
+ 18 32 89 65 \
+ 2 91 43 14 \
+ 69 70 99 96 \
+ 21 45 31 51 \
+ 39 27 69 28 \
+ 70 11 77 53 \
+ 72 95 46 94
+
+arg_in: 7 buffer int4[10] \
+ 85 53 9 66 \
+ 91 50 52 32 \
+ 41 84 27 41 \
+ 15 68 38 39 \
+ 95 41 13 89 \
+ 85 29 54 51 \
+ 89 44 47 81 \
+ 78 79 42 28 \
+ 55 59 33 71 \
+ 32 46 52 66
+
+arg_in: 8 buffer int4[10] \
+ 42 70 91 76 \
+ 99 49 26 9 \
+ 54 47 29 18 \
+ 50 14 59 85 \
+ 9 16 7 36 \
+ 10 41 58 88 \
+ 36 21 100 15 \
+ 19 1 19 99 \
+ 14 16 49 86 \
+ 40 61 99 15
+
+arg_in: 9 buffer int4[10] \
+ 26 4 74 33 \
+ 95 7 50 38 \
+ 15 57 81 3 \
+ 59 96 56 14 \
+ 25 13 79 45 \
+ 44 73 87 72 \
+ 63 62 0 0 \
+ 24 82 13 40 \
+ 82 56 74 31 \
+ 67 34 54 52
+
+!*/
+
+// The inline asm is necessary to defeat interprocedural sparse
+// conditional constant propagation eliminating some of the trivial
+// calls.
+#ifdef __AMDGCN__
+#define USE_ASM 1
+#endif
+
+#define NOINLINE __attribute__((noinline))
+
+NOINLINE
+void void_func_void(void)
+{
+#if USE_ASM
+ __asm("");
+#endif
+}
+
+kernel void call_void_func_void(__global int* ret)
+{
+ void_func_void();
+ *ret = 12345;
+}
+
+NOINLINE
+int i32_func_void(void)
+{
+ int ret;
+#if USE_ASM
+ __asm("v_mov_b32 %0, 0x12345" : "=v"(ret));
+#else
+ ret = 0x12345;
+#endif
+
+ return ret;
+}
+
+kernel void call_i32_func_void(__global int* ret)
+{
+ *ret = i32_func_void();
+}
+
+NOINLINE
+long i64_func_void(void)
+{
+ long ret;
+#if USE_ASM
+ __asm("v_lshlrev_b64 %0, 44, 1" : "=v"(ret));
+#else
+ ret = 1ull << 44;
+#endif
+ return ret;
+}
+
+kernel void call_i64_func_void(__global long* ret)
+{
+ *ret = i64_func_void();
+}
+
+
+NOINLINE
+int i32_func_void_callee_stack(void)
+{
+ int ret;
+#if USE_ASM
+ __asm("v_mov_b32 %0, 0x64" : "=v"(ret));
+#else
+ ret = 0x64;
+#endif
+
+ volatile int alloca[20];
+
+ for (int i = 0; i < 20; ++i)
+ {
+ alloca[i] = i;
+ }
+
+ for (int i = 0; i < 20; ++i)
+ {
+ ret += alloca[i];
+ }
+
+ return ret;
+}
+
+kernel void call_i32_func_void_callee_stack(__global int* ret)
+{
+ volatile int alloca[10];
+
+ for (int i = 0; i < 10; ++i)
+ {
+ alloca[i] = 0xffff;
+ }
+
+
+ *ret = i32_func_void_callee_stack();
+}
+
+NOINLINE
+int i32_func_p0i32_i32_caller_stack(volatile int* stack, int n)
+{
+ int ret;
+#if USE_ASM
+ __asm("v_mov_b32 %0, 0x64" : "=v"(ret));
+#else
+ ret = 0x64;
+#endif
+
+ for (int i = 0; i < n; ++i)
+ {
+ ret += stack[i];
+ }
+
+ return ret;
+}
+
+kernel void call_i32_func_p0i32_i32_caller_stack(__global int* ret)
+{
+ volatile int alloca[10];
+
+ for (int i = 0; i < 10; ++i)
+ {
+ alloca[i] = 3 + i;
+ }
+
+ *ret = i32_func_p0i32_i32_caller_stack(alloca, 10);
+}
+
+NOINLINE
+int i32_func_p0i32_i32_indirect_stack(volatile int* stack, int n)
+{
+ int ret;
+#if USE_ASM
+ __asm("v_mov_b32 %0, 0x64" : "=v"(ret));
+#else
+ ret = 0x64;
+#endif
+ for (int i = 0; i < n; ++i)
+ {
+ ret += stack[i];
+ }
+
+ return ret;
+}
+
+// Access a stack object in the parent kernel's frame.
+NOINLINE
+int i32_func_p0i32_i32_pass_kernel_stack(volatile int* stack, int n)
+{
+ int ret;
+#if USE_ASM
+ __asm("v_mov_b32 %0, 0x42" : "=v"(ret));
+#else
+ ret = 0x42;
+#endif
+
+ volatile int local_object[10];
+ for (int i = 0; i < 10; ++i)
+ local_object[i] = -1;
+
+ ret += i32_func_p0i32_i32_indirect_stack(stack, n);
+
+ return ret;
+}
+
+kernel void call_i32_func_p0i32_i32_indirect_kernel_stack(volatile __global int* ret)
+{
+ volatile int alloca[10];
+
+ for (int i = 0; i < 10; ++i)
+ {
+ alloca[i] = 3 + i;
+ }
+
+ *ret = i32_func_p0i32_i32_pass_kernel_stack(alloca, 10);
+}
+
+// Access a stack object in a parent non-kernel function's stack frame.
+NOINLINE
+int i32_func_void_pass_function_stack()
+{
+ int ret;
+#if USE_ASM
+ __asm("v_mov_b32 %0, 0x42" : "=v"(ret));
+#else
+ ret = 0x42;
+#endif
+
+ volatile int local_object[10];
+ for (int i = 0; i < 10; ++i)
+ local_object[i] = 8 + i;
+
+ ret += i32_func_p0i32_i32_indirect_stack(local_object, 10);
+ return ret;
+}
+
+kernel void call_i32_func_p0i32_i32_indirect_function_stack(__global int* ret)
+{
+ *ret = i32_func_void_pass_function_stack();
+}
+
+NOINLINE
+int4 v4i32_func_v4i32_v4i32_stack(int4 arg0, int4 arg1)
+{
+ // Force stack usage.
+ volatile int4 args[8] = { arg0, arg1 };
+
+ int4 total = 0;
+ for (int i = 0; i < 8; ++i)
+ {
+ total += args[i];
+ }
+
+ return total;
+}
+
+// Make sure using stack in a callee function from a callee function
+// doesn't corrupt caller's stack objects.
+NOINLINE
+int4 nested_stack_usage_v4i32_func_v4i32_v4i32(int4 arg0, int4 arg1)
+{
+ volatile int stack_object[4];
+ for (int i = 0; i < 4; ++i) {
+ const int test_val = 0x04030200 | i;
+ stack_object[i] = test_val;
+ }
+
+ arg0 *= 2;
+
+ int4 result = v4i32_func_v4i32_v4i32_stack(arg0, arg1);
+
+ // Check for stack corruption
+ for (int i = 0; i < 4; ++i)
+ {
+ const int test_val = 0x04030200 | i;
+ if (stack_object[i] != test_val)
+ result = -1;
+ }
+
+ return result;
+}
+
+kernel void kernel_call_nested_stack_usage(global int4* output,
+ global int4* input0,
+ global int4* input1)
+{
+ int id = get_global_id(0);
+ output[id] = nested_stack_usage_v4i32_func_v4i32_v4i32(
+ input0[id],
+ input1[id]);
+}
+
+NOINLINE
+int func_div_add(int x, int y)
+{
+ return x / y + 4;
+}
+
+NOINLINE
+int call_i32_func_i32_i32(int x, int y, volatile int* ptr)
+{
+ int tmp = func_div_add(x, y) >> 2;
+ return tmp + *ptr;
+}
+
+kernel void kernel_nested_calls(global int* output,
+ global int* input0,
+ global int* input1)
+{
+ int id = get_global_id(0);
+ volatile int zero = 0;
+ output[id] = call_i32_func_i32_i32(input0[id], input1[id], &zero);
+}
+
+NOINLINE
+int4 v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32(
+ int4 arg0, int4 arg1, int4 arg2, int4 arg3,
+ int4 arg4, int4 arg5, int4 arg6, int4 arg7,
+ int4 arg8)
+{
+ // Try to make sure we can't clobber the incoming stack arguments
+ // with local stack objects.
+ volatile int4 args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 };
+ volatile int4 last_arg = arg8;
+
+ int4 total = 0;
+ for (int i = 0; i < 8; ++i)
+ {
+ total += args[i];
+ }
+
+ return total * last_arg;
+}
+
+ // Test argument passed on stack, but doesn't use byval.
+NOINLINE
+int4 stack_arg_v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32(
+ int4 arg0, int4 arg1, int4 arg2, int4 arg3,
+ int4 arg4, int4 arg5, int4 arg6, int4 arg7,
+ int4 arg8)
+{
+ volatile int stack_object[8];
+ for (int i = 0; i < 8; ++i) {
+ const int test_val = 0x04030200 | i;
+ stack_object[i] = test_val;
+ }
+
+ arg0 *= 2;
+
+ int4 result = v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32(
+ arg0, arg1, arg2, arg3, arg4,
+ arg5, arg6, arg7, arg8);
+
+ // Check for stack corruption.
+ for (int i = 0; i < 8; ++i)
+ {
+ const int test_val = 0x04030200 | i;
+ if (stack_object[i] != test_val)
+ result = -1;
+ }
+
+ return result;
+}
+
+kernel void kernel_call_stack_arg(global int4* output,
+ global int4* input0,
+ global int4* input1,
+ global int4* input2,
+ global int4* input3,
+ global int4* input4,
+ global int4* input5,
+ global int4* input6,
+ global int4* input7,
+ global int4* input8)
+{
+ int id = get_global_id(0);
+
+ volatile int stack_object[8];
+ for (int i = 0; i < 8; ++i) {
+ const int test_val = 0x05060700 | i;
+ stack_object[i] = test_val;
+ }
+
+ output[id] = stack_arg_v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32(
+ input0[id],
+ input1[id],
+ input2[id],
+ input3[id],
+ input4[id],
+ input5[id],
+ input6[id],
+ input7[id],
+ input8[id]);
+
+ // Check for stack corruption.
+ for (int i = 0; i < 8; ++i)
+ {
+ const int test_val = 0x05060700 | i;
+ if (stack_object[i] != test_val)
+ output[id] = -1;
+ }
+
+}
diff --git a/tests/cl/program/execute/tail-calls.cl b/tests/cl/program/execute/tail-calls.cl
new file mode 100644
index 000000000..cf33373cc
--- /dev/null
+++ b/tests/cl/program/execute/tail-calls.cl
@@ -0,0 +1,305 @@
+/*!
+
+[config]
+name: tail calls
+clc_version_min: 10
+dimensions: 1
+
+[test]
+name: Basic tail call
+kernel_name: kernel_call_tailcall
+global_size: 4 0 0
+
+arg_out: 0 buffer int[4] \
+ 4 11 107 -12
+
+arg_in: 1 buffer int[4] \
+ 0 100 1234 -912
+
+arg_in: 2 buffer int[4] \
+ 1 4 2 45
+
+[test]
+name: Tail call with more arguments than caller
+kernel_name: kernel_call_tailcall_extra_arg
+global_size: 4 0 0
+
+arg_out: 0 buffer int[4] \
+ 2 112 1340 -882
+
+arg_in: 1 buffer int[4] \
+ 0 100 1234 -912
+
+arg_in: 2 buffer int[4] \
+ 1 4 2 45
+
+[test]
+name: Tail call with fewer arguments than acller
+kernel_name: kernel_call_tailcall_fewer_args
+global_size: 4 0 0
+
+arg_out: 0 buffer int[4] \
+ 4 8 81 -10
+
+arg_in: 1 buffer int[4] \
+ 0 100 1234 -912
+
+arg_in: 2 buffer int[4] \
+ 1 4 2 45
+
+arg_in: 3 buffer int[4] \
+ 3 8 4 9
+
+[test]
+name: Tail call with stack passed argument
+kernel_name: kernel_call_tailcall_stack_passed_args
+global_size: 10 0 0
+
+arg_out: 0 buffer int4[10] \
+ 11440 8762 10296 13156 \
+ 19649 31311 18081 24745 \
+ 10476 11772 17766 11070 \
+ 22165 18005 28665 35945 \
+ 624 938 768 990 \
+ 30618 28791 30240 31815 \
+ 49851 47676 46806 47676 \
+ 4400 4272 3392 2632 \
+ 10582 8712 8514 7854 \
+ 19737 21199 23865 18533 \
+
+
+arg_in: 1 buffer int4[10] \
+ 0 13 76 46 \
+ 4 74 33 63 \
+ 26 9 95 7 \
+ 41 54 47 29 \
+ 15 68 38 39 \
+ 91 43 14 95 \
+ 44 83 69 70 \
+ 89 54 14 45 \
+ 77 63 21 21 \
+ 64 70 80 70
+
+arg_in: 2 buffer int4[10] \
+ 53 22 4 68 \
+ 76 100 36 24 \
+ 50 38 27 92 \
+ 18 15 57 81 \
+ 50 14 59 85 \
+ 41 13 89 9 \
+ 99 96 85 29 \
+ 99 21 45 31 \
+ 8 39 96 95 \
+ 76 67 63 5
+
+arg_in: 3 buffer int4[10] \
+ 68 94 38 52 \
+ 99 72 76 65 \
+ 53 46 95 5 \
+ 3 53 50 96 \
+ 59 96 56 14 \
+ 16 7 36 25 \
+ 54 51 10 41 \
+ 51 89 44 47 \
+ 39 27 69 28 \
+ 60 22 32 70
+
+arg_in: 4 buffer int4[10] \
+ 83 3 5 53 \
+ 7 63 89 27 \
+ 76 77 83 12 \
+ 75 56 89 63 \
+ 99 41 14 57 \
+ 13 79 45 35 \
+ 58 88 44 73 \
+ 81 36 21 100 \
+ 78 79 42 28 \
+ 11 77 53 55
+
+arg_in: 5 buffer int4[10] \
+ 67 0 38 6 \
+ 44 77 48 24 \
+ 1 69 87 63 \
+ 85 16 21 72 \
+ 25 49 46 97 \
+ 45 81 94 65 \
+ 87 72 80 71 \
+ 15 63 62 0 \
+ 19 1 19 99 \
+ 59 33 71 14
+
+arg_in: 6 buffer int4[10] \
+ 42 69 59 93 \
+ 27 36 16 49 \
+ 74 73 100 89 \
+ 13 9 27 0 \
+ 12 20 32 63 \
+ 21 68 91 25 \
+ 74 1 89 53 \
+ 0 78 73 32 \
+ 24 82 13 40 \
+ 16 49 86 82
+
+arg_in: 7 buffer int4[10] \
+ 85 53 9 66 \
+ 90 91 6 91 \
+ 23 30 35 51 \
+ 41 2 71 94 \
+ 12 65 62 81 \
+ 86 47 51 60 \
+ 46 6 72 49 \
+ 42 68 68 20 \
+ 60 17 83 15 \
+ 56 74 31 13
+
+arg_in: 8 buffer int4[10] \
+ 42 70 91 76 \
+ 50 52 32 99 \
+ 59 85 41 84 \
+ 24 18 32 89 \
+ 25 48 39 20 \
+ 82 76 46 96 \
+ 67 68 20 92 \
+ 84 71 83 9 \
+ 99 25 23 10 \
+ 53 31 59 52
+
+arg_in: 9 buffer int[10] \
+ 26 \
+ 49 \
+ 27 \
+ 65 \
+ 2 \
+ 63 \
+ 87 \
+ 8 \
+ 22 \
+ 43
+
+!*/
+
+#define NOINLINE __attribute__((noinline))
+
+NOINLINE
+int i32_func_i32_i32(int x, int y)
+{
+ return x / y + 4;
+}
+
+NOINLINE
+int i32_func_i32_i32_i32(int x, int y, int z)
+{
+ return x / y + z;
+}
+
+// Test a basic tail call
+NOINLINE
+int tailcall_i32_func_i32_i32(int x, int y)
+{
+ x += 5;
+ y += 10;
+ return i32_func_i32_i32(x, y);
+}
+
+// Test a basic tail call with more arguments in the callee than
+// caller.
+NOINLINE
+int tailcall_i32_func_i32_i32_extra_arg(int x, int y)
+{
+ int z = x + y + 1;
+ x += 5;
+ y += 10;
+ return i32_func_i32_i32_i32(x, y, z);
+}
+
+// Test a basic tail call with fewere arguments in the callee than
+// caller.
+NOINLINE
+int tailcall_i32_func_i32_i32_i32_fewer_args(int x, int y, int z)
+{
+ x += 5;
+ y += 10;
+ return i32_func_i32_i32(x, y + z);
+}
+
+kernel void kernel_call_tailcall(global int* output,
+ global int* input0,
+ global int* input1)
+{
+ int id = get_global_id(0);
+ output[id] = tailcall_i32_func_i32_i32(input0[id], input1[id]);
+}
+
+kernel void kernel_call_tailcall_extra_arg(global int* output,
+ global int* input0,
+ global int* input1)
+{
+ int id = get_global_id(0);
+ output[id] = tailcall_i32_func_i32_i32_extra_arg(input0[id], input1[id]);
+}
+
+kernel void kernel_call_tailcall_fewer_args(global int* output,
+ global int* input0,
+ global int* input1,
+ global int* input2)
+{
+ int id = get_global_id(0);
+ output[id] = tailcall_i32_func_i32_i32_i32_fewer_args(input0[id], input1[id], input2[id]);
+}
+
+NOINLINE
+int4 v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_i32(
+ int4 arg0, int4 arg1, int4 arg2, int4 arg3,
+ int4 arg4, int4 arg5, int4 arg6, int4 arg7,
+ int arg8)
+{
+ // Try to make sure we can't clobber the incoming stack arguments
+ // with local stack objects.
+ volatile int4 args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 };
+ volatile int scalar_arg = arg8;
+
+ int4 total = 0;
+ for (int i = 0; i < 8; ++i)
+ {
+ total += args[i];
+ }
+
+ return total * scalar_arg;
+}
+
+// Test a basic tail call
+NOINLINE
+int4 tailcall_v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_i32(
+ int4 arg0, int4 arg1, int4 arg2, int4 arg3,
+ int4 arg4, int4 arg5, int4 arg6, int4 arg7,
+ int arg8)
+{
+ arg0 *= 2;
+ return v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_i32(
+ arg0, arg1, arg2, arg3, arg4,
+ arg5, arg6, arg7, arg8);
+}
+
+kernel void kernel_call_tailcall_stack_passed_args(global int4* output,
+ global int4* input0,
+ global int4* input1,
+ global int4* input2,
+ global int4* input3,
+ global int4* input4,
+ global int4* input5,
+ global int4* input6,
+ global int4* input7,
+ global int* input8)
+{
+ int id = get_global_id(0);
+ output[id] = tailcall_v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_i32(
+ input0[id],
+ input1[id],
+ input2[id],
+ input3[id],
+ input4[id],
+ input5[id],
+ input6[id],
+ input7[id],
+ input8[id]);
+}