blob: af751b77dd8ccc4c4a062887078e2cee003b42e4 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
#define COMPILER_OVERFLOW_ADD(TYPE, FUNC) \
kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst) \
{ \
__global TYPE* A = &src0[get_global_id(0)]; \
__global TYPE* B = &src1[get_global_id(0)]; \
__global TYPE* C = &dst[get_global_id(0)]; \
*C = *A + *B; \
TYPE carry = -convert_##TYPE(*C < *B); \
\
(*C).y += carry.x; \
carry.y += ((*C).y < carry.x); \
(*C).z += carry.y; \
\
carry.z += ((*C).z < carry.y); \
(*C).w += carry.z; \
carry.w += ((*C).w < carry.z); \
}
COMPILER_OVERFLOW_ADD(ulong4, add)
COMPILER_OVERFLOW_ADD(uint4, add)
COMPILER_OVERFLOW_ADD(ushort4, add)
COMPILER_OVERFLOW_ADD(uchar4, add)
#define COMPILER_OVERFLOW_SUB(TYPE, FUNC) \
kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst) \
{ \
__global TYPE* A = &src0[get_global_id(0)]; \
__global TYPE* B = &src1[get_global_id(0)]; \
__global TYPE* C = &dst[get_global_id(0)]; \
TYPE borrow; \
unsigned result; \
size_t num = sizeof(*A)/sizeof((*A)[0]); \
for (uint i = 0; i < num; i++ ) {\
borrow[i] = __builtin_usub_overflow((*A)[i], (*B)[i], &result); \
(*C)[i] = result; \
}\
\
for (uint i = 0; i < num-1; i++ ) {\
borrow[i+1] += (*C)[i+1] < borrow[i];(*C)[i+1] -= borrow[i]; \
}\
\
}
COMPILER_OVERFLOW_SUB(uint4, sub)
|