libocl: Directly scalarize built-in with vector input.

This reverts the following commit: "Re-apply "improve the build performance of vector type built-in function."" (commit ID: 06cce8178649759e12a3a353f0550189d371871b).

I finally decided to do this because, although the kind of program shown below has fewer instructions and a shorter compile time, it also introduces extra memory accesses, which cause bad run-time performance if the loop is not unrolled. If the loop is unrolled, the result is similar to the scalarized version.

OVERLOADABLE float16 func (float16 param0)
{
  union{
    float va[16];
    float16 vv16;
  }uret;
  union{
    float pa[16];
    float16 pv16;
  }usrc0;
  usrc0.pv16 = param0;
  for(int i =0; i < 16; i++)
    uret.va[i] = func(usrc0.pa[i]);
  return uret.vv16;
}

I did some experiments on the affected built-ins. I fixed the GPU frequency at 1050 and increased the input data to 862000. The results are below (the scalarized version clearly has better performance):

builtin_asinh_float16:
  loop version: 200ms
  scalarized version: 150ms
builtin_sinh_float16:
  loop version: 250ms
  scalarized version: 160ms

This patch also reduces the generation of large integers. Although we support large integer legalization, I find it is sometimes hard to legalize them in a very efficient way, for example large integer LE/GT.

Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
author: Ruiling Song <ruiling.song@intel.com> 2015-02-15 16:09:07 +0800
committer: Zhigang Gong <zhigang.gong@intel.com> 2015-02-25 16:35:59 +0800
commit: dd4874bb3e4909225a36660360dfb8e2dcdc0d47 (patch)
tree: 1b5847d4fb58c59b699c38f52605ae2883a03b8f
parent: e276fc4c5d7e1cda4ce9fa9397501dba75c527ed (diff)
1 files changed, 6 insertions, 39 deletions
diff --git a/backend/src/libocl/script/gen_vector.py b/backend/src/libocl/script/gen_vector.py
index 291dd879..ffc573aa 100755
--- a/backend/src/libocl/script/gen_vector.py
+++ b/backend/src/libocl/script/gen_vector.py
@@ -289,42 +289,9 @@ class builtinProto():
             formatStr += ';'
             self.append(formatStr)
             return formatStr
-        if self.functionName != 'select' and ptypeSeqs[0] == ptypeSeqs[self.paramCount-1] and ptype[1] > 4:
-            formatStr += '\n{ \n  union{'
-            formatStr = self.append(formatStr, '    {0} va[{1}];'.format(vtype[0], vtype[1]))
-            formatStr = self.append(formatStr, '    {0}{1} vv{2};'.format(vtype[0], vtype[1], vtype[1]))
-            formatStr += '\n  }uret;'
-            formatStr += '\n  union{'
-            formatStr = self.append(formatStr, '    {0} pa[{1}];'.format(ptype[0], ptype[1]))
-            formatStr = self.append(formatStr, '    {0}{1} pv{2};'.format(ptype[0], ptype[1], ptype[1]))
-            formatStr += '\n  }'
-            for n in range(0, self.paramCount):
-              formatStr += 'usrc{0}'.format(n)
-              if n+1 != self.paramCount:
-                formatStr +=', '
-            formatStr += ';'
-
-            for n in range(0, self.paramCount):
-              formatStr = self.append(formatStr, '  usrc{0}.pv{1} = param{2};'.format(n, ptype[1], n))
-            formatStr = self.append(formatStr, '  for(int i =0; i < {0}; i++)'.format(ptype[1]))
-            formatStr += '\n    uret.va[i] = '
-            if self.prefix == 'relational' and self.functionName != 'bitselect' and self.functionName != 'select':
-              formatStr += '-'
-            formatStr += '{0}('.format(self.functionName)
-
-            for n in range(0, self.paramCount):
-              formatStr += 'usrc{0}.pa[i]'.format(n)
-              if n+1 != self.paramCount:
-                formatStr +=', '
-            formatStr += ');'
-            formatStr = self.append(formatStr, ' return uret.vv{0};'.format(vtype[1]))
-            formatStr += '\n}'
-            formatStr = self.append(formatStr)
-            return formatStr
-        else:
-          formatStr = self.append(formatStr, '{{return ({0}{1})('.format(vtype[0], vtype[1]))
-          self.indent = len(formatStr)
-          for j in range(0, vtype[1]):
+        formatStr = self.append(formatStr, '{{return ({0}{1})('.format(vtype[0], vtype[1]))
+        self.indent = len(formatStr)
+        for j in range(0, vtype[1]):
             if (j != 0):
                 formatStr += ','
                 if (j + 1) % 2 == 0:
@@ -359,10 +326,10 @@ class builtinProto():
 
             formatStr += ')'
 
-          formatStr += '); }\n'
-          self.append(formatStr)
+        formatStr += '); }\n'
+        self.append(formatStr)
 
-          return formatStr
+        return formatStr
 
     def output(self):
         for line in self.outputStr:
author	Ruiling Song <ruiling.song@intel.com>	2015-02-15 16:09:07 +0800
committer	Zhigang Gong <zhigang.gong@intel.com>	2015-02-25 16:35:59 +0800
commit	dd4874bb3e4909225a36660360dfb8e2dcdc0d47 (patch)
tree	1b5847d4fb58c59b699c38f52605ae2883a03b8f
parent	e276fc4c5d7e1cda4ce9fa9397501dba75c527ed (diff)