diff options
author | Ruiling Song <ruiling.song@intel.com> | 2015-02-15 16:09:07 +0800 |
---|---|---|
committer | Zhigang Gong <zhigang.gong@intel.com> | 2015-02-25 16:35:59 +0800 |
commit | dd4874bb3e4909225a36660360dfb8e2dcdc0d47 (patch) | |
tree | 1b5847d4fb58c59b699c38f52605ae2883a03b8f | |
parent | e276fc4c5d7e1cda4ce9fa9397501dba75c527ed (diff) |
libocl: Directly scalarize built-in with vector input.
This reverts the following commit:
"Re-apply "improve the build performance of vector type built-in function.""
commitId: 06cce8178649759e12a3a353f0550189d371871b.
I finally decided to do this because, although the kind of program shown below has fewer
instructions and a shorter compile time, it also introduces extra memory accesses,
which cause bad run-time performance if the loop is not unrolled. If the loop
is unrolled, it is similar to the scalarized version.
OVERLOADABLE float16 func (float16 param0)
{
union{
float va[16];
float16 vv16;
}uret;
union{
float pa[16];
float16 pv16;
}usrc0;
usrc0.pv16 = param0;
for(int i =0; i < 16; i++)
uret.va[i] = func(usrc0.pa[i]);
return uret.vv16;
}
I did some experiments on the affected built-ins. I fixed the GPU frequency at 1050
and increased the input data to 862000. The results are shown below (the scalarized
version clearly performs better):
builtin_asinh_float16:
loop version: 200ms
scalarized version: 150ms
builtin_sinh_float16:
loop version: 250ms
scalarized version: 160ms
This patch also reduces the generation of large integers. Although we support
large integer legalization, I find it is sometimes hard to legalize them efficiently,
for example large integer LE/GT comparisons.
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
-rwxr-xr-x | backend/src/libocl/script/gen_vector.py | 45 |
1 files changed, 6 insertions, 39 deletions
diff --git a/backend/src/libocl/script/gen_vector.py b/backend/src/libocl/script/gen_vector.py index 291dd879..ffc573aa 100755 --- a/backend/src/libocl/script/gen_vector.py +++ b/backend/src/libocl/script/gen_vector.py @@ -289,42 +289,9 @@ class builtinProto(): formatStr += ';' self.append(formatStr) return formatStr - if self.functionName != 'select' and ptypeSeqs[0] == ptypeSeqs[self.paramCount-1] and ptype[1] > 4: - formatStr += '\n{ \n union{' - formatStr = self.append(formatStr, ' {0} va[{1}];'.format(vtype[0], vtype[1])) - formatStr = self.append(formatStr, ' {0}{1} vv{2};'.format(vtype[0], vtype[1], vtype[1])) - formatStr += '\n }uret;' - formatStr += '\n union{' - formatStr = self.append(formatStr, ' {0} pa[{1}];'.format(ptype[0], ptype[1])) - formatStr = self.append(formatStr, ' {0}{1} pv{2};'.format(ptype[0], ptype[1], ptype[1])) - formatStr += '\n }' - for n in range(0, self.paramCount): - formatStr += 'usrc{0}'.format(n) - if n+1 != self.paramCount: - formatStr +=', ' - formatStr += ';' - - for n in range(0, self.paramCount): - formatStr = self.append(formatStr, ' usrc{0}.pv{1} = param{2};'.format(n, ptype[1], n)) - formatStr = self.append(formatStr, ' for(int i =0; i < {0}; i++)'.format(ptype[1])) - formatStr += '\n uret.va[i] = ' - if self.prefix == 'relational' and self.functionName != 'bitselect' and self.functionName != 'select': - formatStr += '-' - formatStr += '{0}('.format(self.functionName) - - for n in range(0, self.paramCount): - formatStr += 'usrc{0}.pa[i]'.format(n) - if n+1 != self.paramCount: - formatStr +=', ' - formatStr += ');' - formatStr = self.append(formatStr, ' return uret.vv{0};'.format(vtype[1])) - formatStr += '\n}' - formatStr = self.append(formatStr) - return formatStr - else: - formatStr = self.append(formatStr, '{{return ({0}{1})('.format(vtype[0], vtype[1])) - self.indent = len(formatStr) - for j in range(0, vtype[1]): + formatStr = self.append(formatStr, '{{return ({0}{1})('.format(vtype[0], vtype[1])) + self.indent = 
len(formatStr) + for j in range(0, vtype[1]): if (j != 0): formatStr += ',' if (j + 1) % 2 == 0: @@ -359,10 +326,10 @@ class builtinProto(): formatStr += ')' - formatStr += '); }\n' - self.append(formatStr) + formatStr += '); }\n' + self.append(formatStr) - return formatStr + return formatStr def output(self): for line in self.outputStr: |