1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
/*
Sub-group size test, following the OpenCL cl_intel_subgroups extension.
The global and local sizes are defined as follows:
globals[0] = 4;
globals[1] = 9;
globals[2] = 16;
locals[0] = 2;
locals[1] = 3;
locals[2] = 4;
*/
#define udebug 0
#include "utest_helper.hpp"
/*
 * Runs the "builtin_sub_group_size" kernel over 1-D, 2-D and 3-D NDRanges
 * and compares every entry of buf[0] with the sub-group size expected at
 * that position.
 *
 * For the 1-based dimension index d the local size is d + 1 and the global
 * size is (d + 1) * (d + 1); dimensions above the one being tested are set
 * to 0.  The expectation is derived from the maximum sub-group size the
 * runtime reports for the chosen local size: within each run of local_sz
 * consecutive entries, the positions covered by whole sub-groups expect
 * max_sub_sz and the remaining positions expect local_sz % max_sub_sz.
 *
 * Returns immediately when cl_check_subgroups() reports false.
 */
static void builtin_sub_group_size(void)
{
  if (!cl_check_subgroups())
    return;

  // Setup kernel and buffers
  size_t dim, i, local_sz = 1, buf_len = 1;
  OCL_CREATE_KERNEL("builtin_sub_group_size");
  size_t max_sub_sz = 0;
  // 576 = 4 * 9 * 16, the number of work-items of the largest (3-D) range.
  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int) * 576, NULL);
  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);

  for (dim = 1; dim <= 3; dim++)
  {
    buf_len = 1;
    local_sz = 1;

    // Active dimensions: local size i + 1, global size (i + 1)^2.
    for (i = 1; i <= dim; i++)
    {
      locals[i - 1] = i + 1;
      globals[i - 1] = (i + 1) * (i + 1);
      buf_len *= ((i + 1) * (i + 1));
      local_sz *= i + 1;
    }
    // Dimensions that are not part of this range are cleared.
    for (i = dim + 1; i <= 3; i++)
    {
      globals[i - 1] = 0;
      locals[i - 1] = 0;
    }

    max_sub_sz = 0;
    OCL_CALL(utestclGetKernelSubGroupInfoKHR, kernel, device,
             CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,
             sizeof(size_t) * dim, locals,
             sizeof(size_t), &max_sub_sz, NULL);
    // max_sub_sz is used as a divisor below; a zero value would be a
    // division by zero rather than a clean test failure.
    OCL_ASSERT(max_sub_sz > 0);

    // Run the kernel
    OCL_NDRANGE( dim );
    // Check the status instead of silently ignoring a failed finish.
    OCL_CALL(clFinish, queue);

    // Loop-invariant split of a work-group into whole sub-groups and a tail.
    const size_t full_items = local_sz / max_sub_sz * max_sub_sz;
    const size_t tail_sz = local_sz % max_sub_sz;

    OCL_MAP_BUFFER(0);
    for (i = 0; i < buf_len; i++) {
      size_t expect_sz = (i % local_sz) < full_items ? max_sub_sz : tail_sz;
#if udebug
      // The buffer is read as uint32_t, so print it as unsigned.
      printf("%zu get %u, expect %zu\n", i, (unsigned)((uint32_t*)buf_data[0])[i], expect_sz);
#endif
      OCL_ASSERT( ((uint32_t*)buf_data[0])[i] == expect_sz);
    }
    OCL_UNMAP_BUFFER(0);
  }
}
// Hands builtin_sub_group_size to the unit-test framework; presumably this
// macro registers the function as a runnable test case (macro definition is
// not visible here -- confirm against the test helper header).
MAKE_UTEST_FROM_FUNCTION(builtin_sub_group_size);
|