From f07a899158d1f38b231e9a073a6ff0053b04016f Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <peter@pcc.me.uk>
Date: Wed, 5 Dec 2012 07:39:02 +0000
Subject: Fix build against recent versions of Clang.  Based on patch by
 Alastair Donaldson!

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@169362 91177308-0d34-0410-b5e6-96231b3b80d8
---
 compile-test.sh | 2 +-
 configure.py    | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/compile-test.sh b/compile-test.sh
index 7461811..47c7f38 100755
--- a/compile-test.sh
+++ b/compile-test.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
 
-clang -ccc-host-triple nvptx--nvidiacl -Iptx-nvidiacl/include -Igeneric/include -Xclang -mlink-bitcode-file -Xclang nvptx--nvidiacl/lib/builtins.bc -include clc/clc.h -Dcl_clang_storage_class_specifiers -Dcl_khr_fp64 "$@"
+clang -target nvptx--nvidiacl -Iptx-nvidiacl/include -Igeneric/include -Xclang -mlink-bitcode-file -Xclang nvptx--nvidiacl/lib/builtins.bc -include clc/clc.h -Dcl_clang_storage_class_specifiers -Dcl_khr_fp64 "$@"
diff --git a/configure.py b/configure.py
index 546bbd3..9ae49b7 100755
--- a/configure.py
+++ b/configure.py
@@ -35,7 +35,8 @@ def llvm_config(args):
     sys.exit(1)
 
 llvm_bindir = llvm_config(['--bindir'])
-llvm_core_libs = llvm_config(['--ldflags', '--libs', 'core', 'bitreader', 'bitwriter'])
+llvm_core_libs = llvm_config(['--libs', 'core', 'bitreader', 'bitwriter']) + ' ' + \
+                 llvm_config(['--ldflags'])
 llvm_cxxflags = llvm_config(['--cxxflags']) + ' -fno-exceptions -fno-rtti'
 
 llvm_clang = os.path.join(llvm_bindir, 'clang')
@@ -97,7 +98,7 @@ for target in targets:
   install_files += [(incdir, incdir[len(srcdir)+1:]) for incdir in incdirs]
 
   # The rule for building a .bc file for the specified architecture using clang.
-  clang_bc_flags = "-ccc-host-triple %s -I`dirname $in` %s " \
+  clang_bc_flags = "-target %s -I`dirname $in` %s " \
                    "-Dcl_clang_storage_class_specifiers " \
                    "-Dcl_khr_fp64 " \
                    "-emit-llvm" % (target, clang_cl_includes)
-- 
cgit v1.2.3


From c1a78086615f94e8ba901d15a1b091e0df1c0fb2 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Tue, 1 Jan 2013 10:00:19 +0000
Subject: Update the copyright credits -- Happy new year 2013!

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@171342 91177308-0d34-0410-b5e6-96231b3b80d8
---
 LICENSE.TXT | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/LICENSE.TXT b/LICENSE.TXT
index e4df97b..30aab2f 100644
--- a/LICENSE.TXT
+++ b/LICENSE.TXT
@@ -11,7 +11,7 @@ Full text of the relevant licenses is included below.
 
 ==============================================================================
 
-Copyright (c) 2011-2012 by the contributors listed in CREDITS.TXT
+Copyright (c) 2011-2013 by the contributors listed in CREDITS.TXT
 
 All rights reserved.
 
@@ -43,7 +43,7 @@ SOFTWARE.
 
 ==============================================================================
 
-Copyright (c) 2011-2012 by the contributors listed in CREDITS.TXT
+Copyright (c) 2011-2013 by the contributors listed in CREDITS.TXT
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
-- 
cgit v1.2.3


From 2b198dcbab2303ebdb9892df1a0e2ea4f56cc235 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:18:59 +0000
Subject: r600: Initial support

This includes a get_global_id() implementation and function stubs for
the other workitem and synchronization functions.

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184975 91177308-0d34-0410-b5e6-96231b3b80d8
---
 configure.py                                |  2 +-
 r600/include/clc/synchronization/barrier.h  |  2 ++
 r600/include/clc/workitem/get_global_id.h   |  1 +
 r600/include/clc/workitem/get_global_size.h |  3 +++
 r600/include/clc/workitem/get_group_id.h    |  3 +++
 r600/include/clc/workitem/get_local_id.h    |  3 +++
 r600/include/clc/workitem/get_local_size.h  |  3 +++
 r600/include/clc/workitem/get_num_groups.h  |  3 +++
 r600/lib/SOURCES                            |  1 +
 r600/lib/workitem/get_global_id.cl          | 10 ++++++++++
 10 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 r600/include/clc/synchronization/barrier.h
 create mode 100644 r600/include/clc/workitem/get_global_id.h
 create mode 100644 r600/include/clc/workitem/get_global_size.h
 create mode 100644 r600/include/clc/workitem/get_group_id.h
 create mode 100644 r600/include/clc/workitem/get_local_id.h
 create mode 100644 r600/include/clc/workitem/get_local_size.h
 create mode 100644 r600/include/clc/workitem/get_num_groups.h
 create mode 100644 r600/lib/SOURCES
 create mode 100644 r600/lib/workitem/get_global_id.cl

diff --git a/configure.py b/configure.py
index 9ae49b7..4f63c5b 100755
--- a/configure.py
+++ b/configure.py
@@ -43,7 +43,7 @@ llvm_clang = os.path.join(llvm_bindir, 'clang')
 llvm_link = os.path.join(llvm_bindir, 'llvm-link')
 llvm_opt = os.path.join(llvm_bindir, 'opt')
 
-default_targets = ['nvptx--nvidiacl', 'nvptx64--nvidiacl']
+default_targets = ['nvptx--nvidiacl', 'nvptx64--nvidiacl', 'r600--']
 
 targets = args
 if not targets:
diff --git a/r600/include/clc/synchronization/barrier.h b/r600/include/clc/synchronization/barrier.h
new file mode 100644
index 0000000..7f150d4
--- /dev/null
+++ b/r600/include/clc/synchronization/barrier.h
@@ -0,0 +1,2 @@
+_CLC_INLINE void barrier(cl_mem_fence_flags flags) {
+}
diff --git a/r600/include/clc/workitem/get_global_id.h b/r600/include/clc/workitem/get_global_id.h
new file mode 100644
index 0000000..b61450f
--- /dev/null
+++ b/r600/include/clc/workitem/get_global_id.h
@@ -0,0 +1 @@
+size_t get_global_id(uint dim);
diff --git a/r600/include/clc/workitem/get_global_size.h b/r600/include/clc/workitem/get_global_size.h
new file mode 100644
index 0000000..afd9ae1
--- /dev/null
+++ b/r600/include/clc/workitem/get_global_size.h
@@ -0,0 +1,3 @@
+_CLC_INLINE size_t get_global_size(uint dim) {
+  return 0;
+}
diff --git a/r600/include/clc/workitem/get_group_id.h b/r600/include/clc/workitem/get_group_id.h
new file mode 100644
index 0000000..6862dba
--- /dev/null
+++ b/r600/include/clc/workitem/get_group_id.h
@@ -0,0 +1,3 @@
+_CLC_INLINE size_t get_group_id(uint dim) {
+  return 0;
+}
diff --git a/r600/include/clc/workitem/get_local_id.h b/r600/include/clc/workitem/get_local_id.h
new file mode 100644
index 0000000..22749cd
--- /dev/null
+++ b/r600/include/clc/workitem/get_local_id.h
@@ -0,0 +1,3 @@
+_CLC_INLINE size_t get_local_id(uint dim) {
+  return 0;
+}
diff --git a/r600/include/clc/workitem/get_local_size.h b/r600/include/clc/workitem/get_local_size.h
new file mode 100644
index 0000000..51d9762
--- /dev/null
+++ b/r600/include/clc/workitem/get_local_size.h
@@ -0,0 +1,3 @@
+_CLC_INLINE size_t get_local_size(uint dim) {
+  return 0;
+}
diff --git a/r600/include/clc/workitem/get_num_groups.h b/r600/include/clc/workitem/get_num_groups.h
new file mode 100644
index 0000000..fe1f343
--- /dev/null
+++ b/r600/include/clc/workitem/get_num_groups.h
@@ -0,0 +1,3 @@
+_CLC_INLINE size_t get_num_groups(uint dim) {
+  return 0;
+}
diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
new file mode 100644
index 0000000..0844030
--- /dev/null
+++ b/r600/lib/SOURCES
@@ -0,0 +1 @@
+workitem/get_global_id.cl
diff --git a/r600/lib/workitem/get_global_id.cl b/r600/lib/workitem/get_global_id.cl
new file mode 100644
index 0000000..9b0bd94
--- /dev/null
+++ b/r600/lib/workitem/get_global_id.cl
@@ -0,0 +1,10 @@
+#include <clc/clc.h>
+
+_CLC_DEF size_t get_global_id(uint dim) {
+  switch (dim) {
+  case 0:  return __builtin_r600_read_tgid_x()*__builtin_r600_read_ngroups_x()+__builtin_r600_read_tidig_x();
+  case 1:  return __builtin_r600_read_tgid_y()*__builtin_r600_read_ngroups_y()+__builtin_r600_read_tidig_y();
+  case 2:  return __builtin_r600_read_tgid_z()*__builtin_r600_read_ngroups_z()+__builtin_r600_read_tidig_z();
+  default: return 0;
+  }
+}
-- 
cgit v1.2.3


From 98d740bd592b44da1394434961cd42695e7e79f2 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:19:39 +0000
Subject: r600: Fix get_global_id implementation

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184976 91177308-0d34-0410-b5e6-96231b3b80d8
---
 r600/lib/workitem/get_global_id.cl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/r600/lib/workitem/get_global_id.cl b/r600/lib/workitem/get_global_id.cl
index 9b0bd94..671f657 100644
--- a/r600/lib/workitem/get_global_id.cl
+++ b/r600/lib/workitem/get_global_id.cl
@@ -2,9 +2,9 @@
 
 _CLC_DEF size_t get_global_id(uint dim) {
   switch (dim) {
-  case 0:  return __builtin_r600_read_tgid_x()*__builtin_r600_read_ngroups_x()+__builtin_r600_read_tidig_x();
-  case 1:  return __builtin_r600_read_tgid_y()*__builtin_r600_read_ngroups_y()+__builtin_r600_read_tidig_y();
-  case 2:  return __builtin_r600_read_tgid_z()*__builtin_r600_read_ngroups_z()+__builtin_r600_read_tidig_z();
+  case 0:  return __builtin_r600_read_tgid_x()*__builtin_r600_read_local_size_x()+__builtin_r600_read_tidig_x();
+  case 1:  return __builtin_r600_read_tgid_y()*__builtin_r600_read_local_size_y()+__builtin_r600_read_tidig_y();
+  case 2:  return __builtin_r600_read_tgid_z()*__builtin_r600_read_local_size_z()+__builtin_r600_read_tidig_z();
   default: return 0;
   }
 }
-- 
cgit v1.2.3


From 760192413b4fd5f67d14a79c5c74d7f6fcea354c Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:19:44 +0000
Subject: r600: Add get_global_size() implementation

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184977 91177308-0d34-0410-b5e6-96231b3b80d8
---
 r600/include/clc/workitem/get_global_size.h |  4 +---
 r600/lib/SOURCES                            |  1 +
 r600/lib/workitem/get_global_size.cl        | 10 ++++++++++
 3 files changed, 12 insertions(+), 3 deletions(-)
 create mode 100644 r600/lib/workitem/get_global_size.cl

diff --git a/r600/include/clc/workitem/get_global_size.h b/r600/include/clc/workitem/get_global_size.h
index afd9ae1..8d7b9a1 100644
--- a/r600/include/clc/workitem/get_global_size.h
+++ b/r600/include/clc/workitem/get_global_size.h
@@ -1,3 +1 @@
-_CLC_INLINE size_t get_global_size(uint dim) {
-  return 0;
-}
+size_t get_global_size(uint dim);
diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
index 0844030..644d2f3 100644
--- a/r600/lib/SOURCES
+++ b/r600/lib/SOURCES
@@ -1 +1,2 @@
 workitem/get_global_id.cl
+workitem/get_global_size.cl
diff --git a/r600/lib/workitem/get_global_size.cl b/r600/lib/workitem/get_global_size.cl
new file mode 100644
index 0000000..4e47bbe
--- /dev/null
+++ b/r600/lib/workitem/get_global_size.cl
@@ -0,0 +1,10 @@
+#include <clc/clc.h>
+
+_CLC_DEF size_t get_global_size(uint dim) {
+  switch (dim) {
+  case 0: return __builtin_r600_read_global_size_x();
+  case 1: return __builtin_r600_read_global_size_y();
+  case 2: return __builtin_r600_read_global_size_z();
+  default: return 1;
+  }
+}
-- 
cgit v1.2.3


From 6af470663c821d5fc88798792aa2b9c84f636d6b Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:19:50 +0000
Subject: Move R600 headers into generic directory

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184978 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/workitem/get_global_id.h   | 2 +-
 generic/include/clc/workitem/get_global_size.h | 2 +-
 r600/include/clc/synchronization/barrier.h     | 2 --
 r600/include/clc/workitem/get_global_id.h      | 1 -
 r600/include/clc/workitem/get_global_size.h    | 1 -
 r600/include/clc/workitem/get_group_id.h       | 3 ---
 r600/include/clc/workitem/get_local_id.h       | 3 ---
 r600/include/clc/workitem/get_local_size.h     | 3 ---
 r600/include/clc/workitem/get_num_groups.h     | 3 ---
 9 files changed, 2 insertions(+), 18 deletions(-)
 delete mode 100644 r600/include/clc/synchronization/barrier.h
 delete mode 100644 r600/include/clc/workitem/get_global_id.h
 delete mode 100644 r600/include/clc/workitem/get_global_size.h
 delete mode 100644 r600/include/clc/workitem/get_group_id.h
 delete mode 100644 r600/include/clc/workitem/get_local_id.h
 delete mode 100644 r600/include/clc/workitem/get_local_size.h
 delete mode 100644 r600/include/clc/workitem/get_num_groups.h

diff --git a/generic/include/clc/workitem/get_global_id.h b/generic/include/clc/workitem/get_global_id.h
index 92759f1..b61450f 100644
--- a/generic/include/clc/workitem/get_global_id.h
+++ b/generic/include/clc/workitem/get_global_id.h
@@ -1 +1 @@
-_CLC_DECL size_t get_global_id(uint dim);
+size_t get_global_id(uint dim);
diff --git a/generic/include/clc/workitem/get_global_size.h b/generic/include/clc/workitem/get_global_size.h
index 2f83705..8d7b9a1 100644
--- a/generic/include/clc/workitem/get_global_size.h
+++ b/generic/include/clc/workitem/get_global_size.h
@@ -1 +1 @@
-_CLC_DECL size_t get_global_size(uint dim);
+size_t get_global_size(uint dim);
diff --git a/r600/include/clc/synchronization/barrier.h b/r600/include/clc/synchronization/barrier.h
deleted file mode 100644
index 7f150d4..0000000
--- a/r600/include/clc/synchronization/barrier.h
+++ /dev/null
@@ -1,2 +0,0 @@
-_CLC_INLINE void barrier(cl_mem_fence_flags flags) {
-}
diff --git a/r600/include/clc/workitem/get_global_id.h b/r600/include/clc/workitem/get_global_id.h
deleted file mode 100644
index b61450f..0000000
--- a/r600/include/clc/workitem/get_global_id.h
+++ /dev/null
@@ -1 +0,0 @@
-size_t get_global_id(uint dim);
diff --git a/r600/include/clc/workitem/get_global_size.h b/r600/include/clc/workitem/get_global_size.h
deleted file mode 100644
index 8d7b9a1..0000000
--- a/r600/include/clc/workitem/get_global_size.h
+++ /dev/null
@@ -1 +0,0 @@
-size_t get_global_size(uint dim);
diff --git a/r600/include/clc/workitem/get_group_id.h b/r600/include/clc/workitem/get_group_id.h
deleted file mode 100644
index 6862dba..0000000
--- a/r600/include/clc/workitem/get_group_id.h
+++ /dev/null
@@ -1,3 +0,0 @@
-_CLC_INLINE size_t get_group_id(uint dim) {
-  return 0;
-}
diff --git a/r600/include/clc/workitem/get_local_id.h b/r600/include/clc/workitem/get_local_id.h
deleted file mode 100644
index 22749cd..0000000
--- a/r600/include/clc/workitem/get_local_id.h
+++ /dev/null
@@ -1,3 +0,0 @@
-_CLC_INLINE size_t get_local_id(uint dim) {
-  return 0;
-}
diff --git a/r600/include/clc/workitem/get_local_size.h b/r600/include/clc/workitem/get_local_size.h
deleted file mode 100644
index 51d9762..0000000
--- a/r600/include/clc/workitem/get_local_size.h
+++ /dev/null
@@ -1,3 +0,0 @@
-_CLC_INLINE size_t get_local_size(uint dim) {
-  return 0;
-}
diff --git a/r600/include/clc/workitem/get_num_groups.h b/r600/include/clc/workitem/get_num_groups.h
deleted file mode 100644
index fe1f343..0000000
--- a/r600/include/clc/workitem/get_num_groups.h
+++ /dev/null
@@ -1,3 +0,0 @@
-_CLC_INLINE size_t get_num_groups(uint dim) {
-  return 0;
-}
-- 
cgit v1.2.3


From 0404dcc845c9a45f0315f2ef85dea3a720351f67 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:19:54 +0000
Subject: PTX: move implementations of work-item and synchronisation functions
 to lib, and add header files in generic.  Incorporates a patch by Tom
 Stellard!

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184979 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/workitem/get_global_id.h   | 2 +-
 generic/include/clc/workitem/get_global_size.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/generic/include/clc/workitem/get_global_id.h b/generic/include/clc/workitem/get_global_id.h
index b61450f..92759f1 100644
--- a/generic/include/clc/workitem/get_global_id.h
+++ b/generic/include/clc/workitem/get_global_id.h
@@ -1 +1 @@
-size_t get_global_id(uint dim);
+_CLC_DECL size_t get_global_id(uint dim);
diff --git a/generic/include/clc/workitem/get_global_size.h b/generic/include/clc/workitem/get_global_size.h
index 8d7b9a1..2f83705 100644
--- a/generic/include/clc/workitem/get_global_size.h
+++ b/generic/include/clc/workitem/get_global_size.h
@@ -1 +1 @@
-size_t get_global_size(uint dim);
+_CLC_DECL size_t get_global_size(uint dim);
-- 
cgit v1.2.3


From 86951a58f495168cfddb3cad84d87f94cf2f2aa3 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:00 +0000
Subject: R600: Replace cl implementations with LLVM IR implementation

This allows libclc to be built for R600 with upstream clang and LLVM.

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184980 91177308-0d34-0410-b5e6-96231b3b80d8
---
 r600/lib/SOURCES                     |  6 ++++--
 r600/lib/workitem/get_global_id.cl   | 10 ----------
 r600/lib/workitem/get_global_size.cl | 10 ----------
 r600/lib/workitem/get_global_size.ll | 18 ++++++++++++++++++
 r600/lib/workitem/get_group_id.ll    | 18 ++++++++++++++++++
 r600/lib/workitem/get_local_id.ll    | 18 ++++++++++++++++++
 r600/lib/workitem/get_local_size.ll  | 18 ++++++++++++++++++
 7 files changed, 76 insertions(+), 22 deletions(-)
 delete mode 100644 r600/lib/workitem/get_global_id.cl
 delete mode 100644 r600/lib/workitem/get_global_size.cl
 create mode 100644 r600/lib/workitem/get_global_size.ll
 create mode 100644 r600/lib/workitem/get_group_id.ll
 create mode 100644 r600/lib/workitem/get_local_id.ll
 create mode 100644 r600/lib/workitem/get_local_size.ll

diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
index 644d2f3..af8c8c8 100644
--- a/r600/lib/SOURCES
+++ b/r600/lib/SOURCES
@@ -1,2 +1,4 @@
-workitem/get_global_id.cl
-workitem/get_global_size.cl
+workitem/get_group_id.ll
+workitem/get_local_size.ll
+workitem/get_local_id.ll
+workitem/get_global_size.ll
diff --git a/r600/lib/workitem/get_global_id.cl b/r600/lib/workitem/get_global_id.cl
deleted file mode 100644
index 671f657..0000000
--- a/r600/lib/workitem/get_global_id.cl
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <clc/clc.h>
-
-_CLC_DEF size_t get_global_id(uint dim) {
-  switch (dim) {
-  case 0:  return __builtin_r600_read_tgid_x()*__builtin_r600_read_local_size_x()+__builtin_r600_read_tidig_x();
-  case 1:  return __builtin_r600_read_tgid_y()*__builtin_r600_read_local_size_y()+__builtin_r600_read_tidig_y();
-  case 2:  return __builtin_r600_read_tgid_z()*__builtin_r600_read_local_size_z()+__builtin_r600_read_tidig_z();
-  default: return 0;
-  }
-}
diff --git a/r600/lib/workitem/get_global_size.cl b/r600/lib/workitem/get_global_size.cl
deleted file mode 100644
index 4e47bbe..0000000
--- a/r600/lib/workitem/get_global_size.cl
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <clc/clc.h>
-
-_CLC_DEF size_t get_global_size(uint dim) {
-  switch (dim) {
-  case 0: return __builtin_r600_read_global_size_x();
-  case 1: return __builtin_r600_read_global_size_y();
-  case 2: return __builtin_r600_read_global_size_z();
-  default: return 1;
-  }
-}
diff --git a/r600/lib/workitem/get_global_size.ll b/r600/lib/workitem/get_global_size.ll
new file mode 100644
index 0000000..ac2d08d
--- /dev/null
+++ b/r600/lib/workitem/get_global_size.ll
@@ -0,0 +1,18 @@
+declare i32 @llvm.r600.read.global.size.x() nounwind readnone
+declare i32 @llvm.r600.read.global.size.y() nounwind readnone
+declare i32 @llvm.r600.read.global.size.z() nounwind readnone
+
+define i32 @get_global_size(i32 %dim) nounwind readnone alwaysinline { ; global size along %dim (0=x, 1=y, 2=z)
+  switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
+x_dim:
+  %x = call i32 @llvm.r600.read.global.size.x() nounwind readnone
+  ret i32 %x
+y_dim:
+  %y = call i32 @llvm.r600.read.global.size.y() nounwind readnone
+  ret i32 %y
+z_dim:
+  %z = call i32 @llvm.r600.read.global.size.z() nounwind readnone
+  ret i32 %z
+default: ; out-of-range dimension: the size is 1, as in the .cl implementation this file replaces
+  ret i32 1
+}
diff --git a/r600/lib/workitem/get_group_id.ll b/r600/lib/workitem/get_group_id.ll
new file mode 100644
index 0000000..5131648
--- /dev/null
+++ b/r600/lib/workitem/get_group_id.ll
@@ -0,0 +1,18 @@
+declare i32 @llvm.r600.read.tgid.x() nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() nounwind readnone
+declare i32 @llvm.r600.read.tgid.z() nounwind readnone
+
+define i32 @get_group_id(i32 %dim) nounwind readnone alwaysinline { ; work-group index along %dim (0=x, 1=y, 2=z)
+  switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
+x_dim: ; tgid is the group index: get_global_id was computed as tgid * local_size + tidig
+  %x = call i32 @llvm.r600.read.tgid.x() nounwind readnone
+  ret i32 %x
+y_dim:
+  %y = call i32 @llvm.r600.read.tgid.y() nounwind readnone
+  ret i32 %y
+z_dim:
+  %z = call i32 @llvm.r600.read.tgid.z() nounwind readnone
+  ret i32 %z
+default: ; out-of-range dimension: group id is 0
+  ret i32 0
+}
diff --git a/r600/lib/workitem/get_local_id.ll b/r600/lib/workitem/get_local_id.ll
new file mode 100644
index 0000000..ac5522a
--- /dev/null
+++ b/r600/lib/workitem/get_local_id.ll
@@ -0,0 +1,18 @@
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() nounwind readnone
+declare i32 @llvm.r600.read.tidig.z() nounwind readnone
+
+define i32 @get_local_id(i32 %dim) nounwind readnone alwaysinline { ; forwards to the tidig.{x,y,z} intrinsic selected by %dim
+  switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
+x_dim: ; %dim == 0
+  %x = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  ret i32 %x
+y_dim: ; %dim == 1
+  %y = call i32 @llvm.r600.read.tidig.y() nounwind readnone
+  ret i32 %y
+z_dim: ; %dim == 2
+  %z = call i32 @llvm.r600.read.tidig.z() nounwind readnone
+  ret i32 %z
+default: ; any other %dim yields 0
+  ret i32 0
+}
diff --git a/r600/lib/workitem/get_local_size.ll b/r600/lib/workitem/get_local_size.ll
new file mode 100644
index 0000000..6a71f75
--- /dev/null
+++ b/r600/lib/workitem/get_local_size.ll
@@ -0,0 +1,18 @@
+declare i32 @llvm.r600.read.local.size.x() nounwind readnone
+declare i32 @llvm.r600.read.local.size.y() nounwind readnone
+declare i32 @llvm.r600.read.local.size.z() nounwind readnone
+
+define i32 @get_local_size(i32 %dim) nounwind readnone alwaysinline { ; work-group size along %dim (0=x, 1=y, 2=z)
+  switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
+x_dim: ; local.size, not tgid: tgid is the group index, which belongs to get_group_id
+  %x = call i32 @llvm.r600.read.local.size.x() nounwind readnone
+  ret i32 %x
+y_dim:
+  %y = call i32 @llvm.r600.read.local.size.y() nounwind readnone
+  ret i32 %y
+z_dim:
+  %z = call i32 @llvm.r600.read.local.size.z() nounwind readnone
+  ret i32 %z
+default: ; out-of-range dimension: OpenCL defines the local size as 1
+  ret i32 1
+}
-- 
cgit v1.2.3


From b234c94b0ea349c66c62eb669171d729104e5737 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:03 +0000
Subject: Make libclc conform more closely to the Linux FHS.

- Introduce a versioning scheme
- Add --libexecdir, --includedir and --pkgconfigdir and prefill them as well as --prefix
- Build all targets by default
- Create libclc.pc and install it in $pkgconfigdir
- Use clang++ instead of c++
- Rename builtins.bc to built_libs/$triple.bc and install them in $libexecdir
- Includes are installed recursively to $includedir
- Finally add $(DESTDIR) for 'make install'

Patch by: Johannes Obermayr

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184981 91177308-0d34-0410-b5e6-96231b3b80d8
---
 configure.py | 65 +++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 51 insertions(+), 14 deletions(-)

diff --git a/configure.py b/configure.py
index 4f63c5b..a35537a 100755
--- a/configure.py
+++ b/configure.py
@@ -4,6 +4,10 @@ def c_compiler_rule(b, name, description, compiler, flags):
   command = "%s -MMD -MF $out.d %s -c -o $out $in" % (compiler, flags)
   b.rule(name, command, description + " $out", depfile="$out.d")
 
+version_major = 0;
+version_minor = 0;
+version_patch = 1;
+
 from optparse import OptionParser
 import os
 from subprocess import *
@@ -19,12 +23,34 @@ p.add_option('--with-llvm-config', metavar='PATH',
              help='use given llvm-config script')
 p.add_option('--prefix', metavar='PATH',
              help='install to given prefix')
+p.add_option('--libexecdir', metavar='PATH',
+             help='install *.bc to given dir')
+p.add_option('--includedir', metavar='PATH',
+             help='install include files to given dir')
+p.add_option('--pkgconfigdir', metavar='PATH',
+             help='install clc.pc to given dir')
 p.add_option('-g', metavar='GENERATOR', default='make',
              help='use given generator (default: make)')
 (options, args) = p.parse_args()
 
 llvm_config_exe = options.with_llvm_config or "llvm-config"
 
+prefix = options.prefix
+if not prefix:
+  prefix = '/usr/local'
+
+libexecdir = options.libexecdir
+if not libexecdir:
+  libexecdir = os.path.join(prefix, 'lib/clc')
+
+includedir = options.includedir
+if not includedir:
+  includedir = os.path.join(prefix, 'include')
+
+pkgconfigdir = options.pkgconfigdir
+if not pkgconfigdir:
+  pkgconfigdir = os.path.join(prefix, 'lib/pkgconfig')
+
 def llvm_config(args):
   try:
     proc = Popen([llvm_config_exe] + args, stdout=PIPE)
@@ -58,8 +84,8 @@ b.rule("LLVM_LINK", command = llvm_link + " -o $out $in",
 b.rule("OPT", command = llvm_opt + " -O3 -o $out $in",
        description = 'OPT $out')
 
-c_compiler_rule(b, "LLVM_TOOL_CXX", 'CXX', 'c++', llvm_cxxflags)
-b.rule("LLVM_TOOL_LINK", "c++ -o $out $in %s" % llvm_core_libs, 'LINK $out')
+c_compiler_rule(b, "LLVM_TOOL_CXX", 'LLVM-CXX', 'clang++', llvm_cxxflags)
+b.rule("LLVM_TOOL_LINK", "clang++ -o $out $in %s" % llvm_core_libs, 'LINK $out')
 
 prepare_builtins = os.path.join('utils', 'prepare-builtins')
 b.build(os.path.join('utils', 'prepare-builtins.o'), "LLVM_TOOL_CXX",
@@ -73,9 +99,15 @@ b.rule("PREPARE_BUILTINS", "%s -o $out $in" % prepare_builtins,
 manifest_deps = set([sys.argv[0], os.path.join(srcdir, 'build', 'metabuild.py'),
                      os.path.join(srcdir, 'build', 'ninja_syntax.py')])
 
-install_files = []
+install_files_bc = []
 install_deps = []
 
+# Create libclc.pc
+clc = open('libclc.pc', 'w')
+clc.write('includedir=%(inc)s\nlibexecdir=%(lib)s\n\nName: libclc\nDescription: Library requirements of the OpenCL C programming language\nVersion: %(maj)s.%(min)s.%(pat)s\nCflags: -I${includedir}\nLibs: -L${libexecdir}' %
+{'inc': includedir, 'lib': libexecdir, 'maj': version_major, 'min': version_minor, 'pat': version_patch})
+clc.close()
+
 for target in targets:
   (t_arch, t_vendor, t_os) = target.split('-')
   archs = [t_arch]
@@ -95,7 +127,6 @@ for target in targets:
                    [os.path.join(srcdir, subdir, 'lib') for subdir in subdirs])
 
   clang_cl_includes = ' '.join(["-I%s" % incdir for incdir in incdirs])
-  install_files += [(incdir, incdir[len(srcdir)+1:]) for incdir in incdirs]
 
   # The rule for building a .bc file for the specified architecture using clang.
   clang_bc_flags = "-target %s -I`dirname $in` %s " \
@@ -126,22 +157,28 @@ for target in targets:
 
   builtins_link_bc = os.path.join(target, 'lib', 'builtins.link.bc')
   builtins_opt_bc = os.path.join(target, 'lib', 'builtins.opt.bc')
-  builtins_bc = os.path.join(target, 'lib', 'builtins.bc')
+  builtins_bc = os.path.join('built_libs', target + '.bc')
   b.build(builtins_link_bc, "LLVM_LINK", objects)
   b.build(builtins_opt_bc, "OPT", builtins_link_bc)
   b.build(builtins_bc, "PREPARE_BUILTINS", builtins_opt_bc, prepare_builtins)
-  install_files.append((builtins_bc, builtins_bc))
+  install_files_bc.append((builtins_bc, builtins_bc))
   install_deps.append(builtins_bc)
   b.default(builtins_bc)
 
-if options.prefix:
-  install_cmd = ' && '.join(['mkdir -p %(dst)s && cp -r %(src)s %(dst)s' % 
-                             {'src': file,
-                              'dst': os.path.join(options.prefix,
-                                                  os.path.dirname(dest))}
-                             for (file, dest) in install_files])
-  b.rule('install', command = install_cmd, description = 'INSTALL')
-  b.build('install', 'install', install_deps)
+
+install_cmd = ' && '.join(['mkdir -p $(DESTDIR)/%(dst)s && cp -r %(src)s $(DESTDIR)/%(dst)s' % 
+                           {'src': file,
+                            'dst': libexecdir}
+                           for (file, dest) in install_files_bc])
+install_cmd = ' && '.join(['%(old)s && mkdir -p $(DESTDIR)/%(dst)s && cp -r generic/include/clc $(DESTDIR)/%(dst)s' %
+                           {'old': install_cmd,
+                            'dst': includedir}])
+install_cmd = ' && '.join(['%(old)s && mkdir -p $(DESTDIR)/%(dst)s && cp -r libclc.pc $(DESTDIR)/%(dst)s' %
+                           {'old': install_cmd, 
+                            'dst': pkgconfigdir}])
+  
+b.rule('install', command = install_cmd, description = 'INSTALL')
+b.build('install', 'install', install_deps)
 
 b.rule("configure", command = ' '.join(sys.argv), description = 'CONFIGURE',
        generator = True)
-- 
cgit v1.2.3


From 513ac41c593d84a18de40761118d85d0b4dc73a8 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:05 +0000
Subject: Allow targets to override generic implementations

Targets can override generic implementations by adding a file called
OVERRIDES in $(TARGET_DIR)/lib and listing the generic implementations
that it wants to override.  For example, to override get_group_id() and
get_global_size() you would add these lines to the OVERRIDES file:

workitem/get_group_id.cl
workitem/get_global_size.cl

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184982 91177308-0d34-0410-b5e6-96231b3b80d8
---
 configure.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/configure.py b/configure.py
index a35537a..ec443ad 100755
--- a/configure.py
+++ b/configure.py
@@ -142,6 +142,14 @@ for target in targets:
   for libdir in libdirs:
     subdir_list_file = os.path.join(libdir, 'SOURCES')
     manifest_deps.add(subdir_list_file)
+    override_list_file = os.path.join(libdir, 'OVERRIDES')
+
+    # Add target overrides
+    if os.path.exists(override_list_file):
+      for override in open(override_list_file).readlines():
+        override = override.rstrip()
+        sources_seen.add(override)
+
     for src in open(subdir_list_file).readlines():
       src = src.rstrip()
       if src not in sources_seen:
-- 
cgit v1.2.3


From 655cb98d3733a8a052a57188b9888ff85fbbeda0 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:08 +0000
Subject: r600: Add overrides file

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184983 91177308-0d34-0410-b5e6-96231b3b80d8
---
 r600/lib/OVERRIDES | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 r600/lib/OVERRIDES

diff --git a/r600/lib/OVERRIDES b/r600/lib/OVERRIDES
new file mode 100644
index 0000000..3f941d8
--- /dev/null
+++ b/r600/lib/OVERRIDES
@@ -0,0 +1,2 @@
+workitem/get_group_id.cl
+workitem/get_global_size.cl
-- 
cgit v1.2.3


From ba36ac1c11290f2985d9f1ddd59ec39221516b39 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:12 +0000
Subject: Fix typo in include/clc/geometric/length.inc

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184984 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/geometric/length.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/generic/include/clc/geometric/length.inc b/generic/include/clc/geometric/length.inc
index 8ee8bf3..a9fa2d5 100644
--- a/generic/include/clc/geometric/length.inc
+++ b/generic/include/clc/geometric/length.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL float length(FLOATN p0);
+_CLC_OVERLOAD _CLC_DECL FLOAT length(FLOATN p0);
-- 
cgit v1.2.3


From 35e27a92509a2acb10a8dab85b969818403dfb46 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:15 +0000
Subject: Use brackets around include files in length.cl and normalize.cl

These functions were not being compiled

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184985 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/lib/geometric/length.cl    | 2 +-
 generic/lib/geometric/normalize.cl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/generic/lib/geometric/length.cl b/generic/lib/geometric/length.cl
index cbe84a0..e26f2b8 100644
--- a/generic/lib/geometric/length.cl
+++ b/generic/lib/geometric/length.cl
@@ -4,5 +4,5 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif
 
-#define BODY "length.inc"
+#define BODY <length.inc>
 #include <clc/geometric/floatn.inc>
diff --git a/generic/lib/geometric/normalize.cl b/generic/lib/geometric/normalize.cl
index e5a521f..70d255d 100644
--- a/generic/lib/geometric/normalize.cl
+++ b/generic/lib/geometric/normalize.cl
@@ -4,5 +4,5 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif
 
-#define BODY "normalize.inc"
+#define BODY <normalize.inc>
 #include <clc/geometric/floatn.inc>
-- 
cgit v1.2.3


From b08be4f10886231672844974d8671ddd47276dc7 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:18 +0000
Subject: Remove the static keyword from the _CLC_INLINE macro

static functions are not allowed in OpenCL C

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184986 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clcfunc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/generic/include/clc/clcfunc.h b/generic/include/clc/clcfunc.h
index 46067fc..5f166c5 100644
--- a/generic/include/clc/clcfunc.h
+++ b/generic/include/clc/clcfunc.h
@@ -1,4 +1,4 @@
 #define _CLC_OVERLOAD __attribute__((overloadable))
 #define _CLC_DECL
 #define _CLC_DEF __attribute__((always_inline))
-#define _CLC_INLINE __attribute__((always_inline)) static inline
+#define _CLC_INLINE __attribute__((always_inline)) inline
-- 
cgit v1.2.3


From 0fd3463974cf8b2aad009ee97dd240222ffd7065 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:25 +0000
Subject: Implement fmax() and fmin() builtins

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184987 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h                |  2 ++
 generic/include/clc/math/binary_decl.inc |  6 ++++++
 generic/include/clc/math/fmax.h          | 11 +++++++++++
 generic/include/clc/math/fmin.h          | 11 +++++++++++
 generic/include/clc/math/gentype.inc     |  4 ++++
 generic/lib/SOURCES                      |  2 ++
 generic/lib/math/binary_impl.inc         | 18 ++++++++++++++++++
 generic/lib/math/fmax.cl                 | 11 +++++++++++
 generic/lib/math/fmin.cl                 | 11 +++++++++++
 9 files changed, 76 insertions(+)
 create mode 100644 generic/include/clc/math/binary_decl.inc
 create mode 100644 generic/include/clc/math/fmax.h
 create mode 100644 generic/include/clc/math/fmin.h
 create mode 100644 generic/lib/math/binary_impl.inc
 create mode 100644 generic/lib/math/fmax.cl
 create mode 100644 generic/lib/math/fmin.cl

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 315693b..c917a46 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -38,6 +38,8 @@
 #include <clc/math/fabs.h>
 #include <clc/math/floor.h>
 #include <clc/math/fma.h>
+#include <clc/math/fmax.h>
+#include <clc/math/fmin.h>
 #include <clc/math/hypot.h>
 #include <clc/math/log.h>
 #include <clc/math/log2.h>
diff --git a/generic/include/clc/math/binary_decl.inc b/generic/include/clc/math/binary_decl.inc
new file mode 100644
index 0000000..1a49e26
--- /dev/null
+++ b/generic/include/clc/math/binary_decl.inc
@@ -0,0 +1,6 @@
+_CLC_OVERLOAD _CLC_DECL GENTYPE FUNCTION(GENTYPE a, GENTYPE b);
+_CLC_OVERLOAD _CLC_DECL GENTYPE FUNCTION(GENTYPE a, float b);
+
+#ifdef cl_khr_fp64
+_CLC_OVERLOAD _CLC_DECL GENTYPE FUNCTION(GENTYPE a, double b);
+#endif
diff --git a/generic/include/clc/math/fmax.h b/generic/include/clc/math/fmax.h
new file mode 100644
index 0000000..d26e5d6
--- /dev/null
+++ b/generic/include/clc/math/fmax.h
@@ -0,0 +1,11 @@
+#undef fmax
+#define fmax __clc_fmax
+
+#define BODY <clc/math/binary_decl.inc>
+#define FUNCTION __clc_fmax
+
+#include <clc/math/gentype.inc>
+
+#undef BODY
+#undef FUNCTION
+
diff --git a/generic/include/clc/math/fmin.h b/generic/include/clc/math/fmin.h
new file mode 100644
index 0000000..3506aef
--- /dev/null
+++ b/generic/include/clc/math/fmin.h
@@ -0,0 +1,11 @@
+#undef fmin
+#define fmin __clc_fmin
+
+#define BODY <clc/math/binary_decl.inc>
+#define FUNCTION __clc_fmin
+
+#include <clc/math/gentype.inc>
+
+#undef BODY
+#undef FUNCTION
+
diff --git a/generic/include/clc/math/gentype.inc b/generic/include/clc/math/gentype.inc
index 4506920..b525c4b 100644
--- a/generic/include/clc/math/gentype.inc
+++ b/generic/include/clc/math/gentype.inc
@@ -1,6 +1,8 @@
 #define GENTYPE float
+#define SCALAR
 #include BODY
 #undef GENTYPE
+#undef SCALAR
 
 #define GENTYPE float2
 #include BODY
@@ -23,9 +25,11 @@
 #undef GENTYPE
 
 #ifdef cl_khr_fp64
+#define SCALAR
 #define GENTYPE double
 #include BODY
 #undef GENTYPE
+#undef SCALAR
 
 #define GENTYPE double2
 #include BODY
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index d29ca1f..86c008b 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -10,6 +10,8 @@ integer/add_sat_impl.ll
 integer/sub_sat.cl
 integer/sub_sat.ll
 integer/sub_sat_impl.ll
+math/fmax.cl
+math/fmin.cl
 math/hypot.cl
 math/mad.cl
 relational/any.cl
diff --git a/generic/lib/math/binary_impl.inc b/generic/lib/math/binary_impl.inc
new file mode 100644
index 0000000..e4b1e5f
--- /dev/null
+++ b/generic/lib/math/binary_impl.inc
@@ -0,0 +1,18 @@
+
+#ifndef SCALAR
+
+_CLC_OVERLOAD _CLC_DEF GENTYPE FUNCTION(GENTYPE x, GENTYPE y) {
+  return FUNCTION_IMPL(x, y);
+}
+
+#endif
+
+_CLC_OVERLOAD _CLC_DEF GENTYPE FUNCTION(GENTYPE x, double y) {
+  GENTYPE vec_y = (GENTYPE) (y);
+  return FUNCTION_IMPL(x, vec_y);
+}
+
+_CLC_OVERLOAD _CLC_DEF GENTYPE FUNCTION(GENTYPE x, float y) {
+  GENTYPE vec_y = (GENTYPE) (y);
+  return FUNCTION_IMPL(x, vec_y);
+}
diff --git a/generic/lib/math/fmax.cl b/generic/lib/math/fmax.cl
new file mode 100644
index 0000000..68a67ac
--- /dev/null
+++ b/generic/lib/math/fmax.cl
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define FUNCTION __clc_fmax
+#define FUNCTION_IMPL(x, y) ((x) < (y) ? (y) : (x))
+
+#define BODY <binary_impl.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/lib/math/fmin.cl b/generic/lib/math/fmin.cl
new file mode 100644
index 0000000..cac188e
--- /dev/null
+++ b/generic/lib/math/fmin.cl
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define FUNCTION __clc_fmin
+#define FUNCTION_IMPL(x, y) ((y) < (x) ? (y) : (x))
+
+#define BODY <binary_impl.inc>
+#include <clc/math/gentype.inc>
-- 
cgit v1.2.3


From b77eb2d4a7c4d59793ec33752cb1d5ca573b793d Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:30 +0000
Subject: Implement ceil() builtin

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184988 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h       | 1 +
 generic/include/clc/math/ceil.h | 6 ++++++
 2 files changed, 7 insertions(+)
 create mode 100644 generic/include/clc/math/ceil.h

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index c917a46..4394c9e 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -33,6 +33,7 @@
 
 /* 6.11.2 Math Functions */
 #include <clc/math/cos.h>
+#include <clc/math/ceil.h>
 #include <clc/math/exp.h>
 #include <clc/math/exp2.h>
 #include <clc/math/fabs.h>
diff --git a/generic/include/clc/math/ceil.h b/generic/include/clc/math/ceil.h
new file mode 100644
index 0000000..b8e4b08
--- /dev/null
+++ b/generic/include/clc/math/ceil.h
@@ -0,0 +1,6 @@
+#undef ceil
+#define ceil __clc_ceil
+
+#define FUNCTION __clc_ceil
+#define INTRINSIC "llvm.ceil"
+#include <clc/math/unary_intrin.inc>
-- 
cgit v1.2.3


From 58ccf6f97d28b81871dc8a85550970b8a9f451d0 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:32 +0000
Subject: Fix build with LLVM 3.3

Patch by: Niels Ole Salscheider

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184989 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/prepare-builtins.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/utils/prepare-builtins.cpp b/utils/prepare-builtins.cpp
index ae7731b..be1624b 100644
--- a/utils/prepare-builtins.cpp
+++ b/utils/prepare-builtins.cpp
@@ -1,9 +1,9 @@
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
-- 
cgit v1.2.3


From 9aba71dfdcb47ea1529362290c9a13bd27ca0d5f Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:35 +0000
Subject: configure: fix out-of-source build

Patch by: Niels Ole Salscheider

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184990 91177308-0d34-0410-b5e6-96231b3b80d8
---
 configure.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/configure.py b/configure.py
index ec443ad..3dd9836 100755
--- a/configure.py
+++ b/configure.py
@@ -178,9 +178,10 @@ install_cmd = ' && '.join(['mkdir -p $(DESTDIR)/%(dst)s && cp -r %(src)s $(DESTD
                            {'src': file,
                             'dst': libexecdir}
                            for (file, dest) in install_files_bc])
-install_cmd = ' && '.join(['%(old)s && mkdir -p $(DESTDIR)/%(dst)s && cp -r generic/include/clc $(DESTDIR)/%(dst)s' %
+install_cmd = ' && '.join(['%(old)s && mkdir -p $(DESTDIR)/%(dst)s && cp -r %(srcdir)s/generic/include/clc $(DESTDIR)/%(dst)s' %
                            {'old': install_cmd,
-                            'dst': includedir}])
+                            'dst': includedir,
+                            'srcdir': srcdir}])
 install_cmd = ' && '.join(['%(old)s && mkdir -p $(DESTDIR)/%(dst)s && cp -r libclc.pc $(DESTDIR)/%(dst)s' %
                            {'old': install_cmd, 
                             'dst': pkgconfigdir}])
-- 
cgit v1.2.3


From 7dc9242ba50572a73003bcada7a993b5eaa787a5 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:38 +0000
Subject: configure: Enable building separate libraries for target variants

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184991 91177308-0d34-0410-b5e6-96231b3b80d8
---
 configure.py | 117 +++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 73 insertions(+), 44 deletions(-)

diff --git a/configure.py b/configure.py
index 3dd9836..79cc4df 100755
--- a/configure.py
+++ b/configure.py
@@ -69,6 +69,17 @@ llvm_clang = os.path.join(llvm_bindir, 'clang')
 llvm_link = os.path.join(llvm_bindir, 'llvm-link')
 llvm_opt = os.path.join(llvm_bindir, 'opt')
 
+available_targets = {
+  'r600--' : { 'devices' :
+               [{'gpu' : 'cedar',   'aliases' : ['palm', 'sumo', 'sumo2', 'redwood', 'juniper']},
+                {'gpu' : 'cypress', 'aliases' : ['hemlock']},
+                {'gpu' : 'barts',   'aliases' : ['turks', 'caicos']},
+                {'gpu' : 'cayman',  'aliases' : ['aruba']},
+                {'gpu' : 'tahiti',  'aliases' : ['pitcairn', 'verde', 'oland']}]},
+  'nvptx--nvidiacl'   : { 'devices' : [{'gpu' : '', 'aliases' : []}] },
+  'nvptx64--nvidiacl' : { 'devices' : [{'gpu' : '', 'aliases' : []}] }
+}
+
 default_targets = ['nvptx--nvidiacl', 'nvptx64--nvidiacl', 'r600--']
 
 targets = args
@@ -128,50 +139,68 @@ for target in targets:
 
   clang_cl_includes = ' '.join(["-I%s" % incdir for incdir in incdirs])
 
-  # The rule for building a .bc file for the specified architecture using clang.
-  clang_bc_flags = "-target %s -I`dirname $in` %s " \
-                   "-Dcl_clang_storage_class_specifiers " \
-                   "-Dcl_khr_fp64 " \
-                   "-emit-llvm" % (target, clang_cl_includes)
-  clang_bc_rule = "CLANG_CL_BC_" + target
-  c_compiler_rule(b, clang_bc_rule, "LLVM-CC", llvm_clang, clang_bc_flags)
-  
-  objects = []
-  sources_seen = set()
-
-  for libdir in libdirs:
-    subdir_list_file = os.path.join(libdir, 'SOURCES')
-    manifest_deps.add(subdir_list_file)
-    override_list_file = os.path.join(libdir, 'OVERRIDES')
-
-    # Add target overrides
-    if os.path.exists(override_list_file):
-      for override in open(override_list_file).readlines():
-        override = override.rstrip()
-        sources_seen.add(override)
-
-    for src in open(subdir_list_file).readlines():
-      src = src.rstrip()
-      if src not in sources_seen:
-        sources_seen.add(src)
-        obj = os.path.join(target, 'lib', src + '.bc')
-        objects.append(obj)
-        src_file = os.path.join(libdir, src)
-        ext = os.path.splitext(src)[1]
-        if ext == '.ll':
-          b.build(obj, 'LLVM_AS', src_file)
-        else:
-          b.build(obj, clang_bc_rule, src_file)
-
-  builtins_link_bc = os.path.join(target, 'lib', 'builtins.link.bc')
-  builtins_opt_bc = os.path.join(target, 'lib', 'builtins.opt.bc')
-  builtins_bc = os.path.join('built_libs', target + '.bc')
-  b.build(builtins_link_bc, "LLVM_LINK", objects)
-  b.build(builtins_opt_bc, "OPT", builtins_link_bc)
-  b.build(builtins_bc, "PREPARE_BUILTINS", builtins_opt_bc, prepare_builtins)
-  install_files_bc.append((builtins_bc, builtins_bc))
-  install_deps.append(builtins_bc)
-  b.default(builtins_bc)
+  for device in available_targets[target]['devices']:
+    # The rule for building a .bc file for the specified architecture using clang.
+    clang_bc_flags = "-target %s -I`dirname $in` %s " \
+                     "-Dcl_clang_storage_class_specifiers " \
+                     "-Dcl_khr_fp64 " \
+                     "-emit-llvm" % (target, clang_cl_includes)
+    if device['gpu'] != '':
+      clang_bc_flags += ' -mcpu=' + device['gpu']
+    clang_bc_rule = "CLANG_CL_BC_" + target
+    c_compiler_rule(b, clang_bc_rule, "LLVM-CC", llvm_clang, clang_bc_flags)
+
+    objects = []
+    sources_seen = set()
+
+    if device['gpu'] == '':
+      full_target_name = target
+      obj_suffix = ''
+    else:
+      full_target_name = device['gpu'] + '-' + target
+      obj_suffix = '.' + device['gpu']
+
+    for libdir in libdirs:
+      subdir_list_file = os.path.join(libdir, 'SOURCES')
+      manifest_deps.add(subdir_list_file)
+      override_list_file = os.path.join(libdir, 'OVERRIDES')
+
+      # Add target overrides
+      if os.path.exists(override_list_file):
+        for override in open(override_list_file).readlines():
+          override = override.rstrip()
+          sources_seen.add(override)
+
+      for src in open(subdir_list_file).readlines():
+        src = src.rstrip()
+        if src not in sources_seen:
+          sources_seen.add(src)
+          obj = os.path.join(target, 'lib', src + obj_suffix + '.bc')
+          objects.append(obj)
+          src_file = os.path.join(libdir, src)
+          ext = os.path.splitext(src)[1]
+          if ext == '.ll':
+            b.build(obj, 'LLVM_AS', src_file)
+          else:
+            b.build(obj, clang_bc_rule, src_file)
+
+    builtins_link_bc = os.path.join(target, 'lib', 'builtins.link' + obj_suffix + '.bc')
+    builtins_opt_bc = os.path.join(target, 'lib', 'builtins.opt' + obj_suffix + '.bc')
+    builtins_bc = os.path.join('built_libs', full_target_name + '.bc')
+    b.build(builtins_link_bc, "LLVM_LINK", objects)
+    b.build(builtins_opt_bc, "OPT", builtins_link_bc)
+    b.build(builtins_bc, "PREPARE_BUILTINS", builtins_opt_bc, prepare_builtins)
+    install_files_bc.append((builtins_bc, builtins_bc))
+    install_deps.append(builtins_bc)
+    for alias in device['aliases']:
+      b.rule("CREATE_ALIAS", "ln -fs %s $out" % os.path.basename(builtins_bc)
+             ,"CREATE-ALIAS $out")
+
+      alias_file = os.path.join('built_libs', alias + '-' + target + '.bc')
+      b.build(alias_file, "CREATE_ALIAS", builtins_bc)
+      install_files_bc.append((alias_file, alias_file))
+      install_deps.append(alias_file)
+    b.default(builtins_bc)
 
 
 install_cmd = ' && '.join(['mkdir -p $(DESTDIR)/%(dst)s && cp -r %(src)s $(DESTDIR)/%(dst)s' % 
-- 
cgit v1.2.3


From 485cf99dbb8ef2e08c3a7d5c970a556546df73a6 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:46 +0000
Subject: libclc: Add max() builtin function

Adds this function for both integer and floating-point data types.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184992 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h           | 2 ++
 generic/include/clc/integer/max.h   | 2 ++
 generic/include/clc/integer/max.inc | 1 +
 generic/include/clc/math/max.h      | 2 ++
 generic/include/clc/math/max.inc    | 1 +
 generic/lib/SOURCES                 | 2 ++
 generic/lib/integer/max.cl          | 4 ++++
 generic/lib/integer/max.inc         | 3 +++
 generic/lib/math/max.cl             | 8 ++++++++
 generic/lib/math/max.inc            | 3 +++
 10 files changed, 28 insertions(+)
 create mode 100644 generic/include/clc/integer/max.h
 create mode 100644 generic/include/clc/integer/max.inc
 create mode 100644 generic/include/clc/math/max.h
 create mode 100644 generic/include/clc/math/max.inc
 create mode 100644 generic/lib/integer/max.cl
 create mode 100644 generic/lib/integer/max.inc
 create mode 100644 generic/lib/math/max.cl
 create mode 100644 generic/lib/math/max.inc

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 4394c9e..f6668a3 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -45,6 +45,7 @@
 #include <clc/math/log.h>
 #include <clc/math/log2.h>
 #include <clc/math/mad.h>
+#include <clc/math/max.h>
 #include <clc/math/pow.h>
 #include <clc/math/sin.h>
 #include <clc/math/sqrt.h>
@@ -63,6 +64,7 @@
 #include <clc/integer/abs.h>
 #include <clc/integer/abs_diff.h>
 #include <clc/integer/add_sat.h>
+#include <clc/integer/max.h>
 #include <clc/integer/sub_sat.h>
 
 /* 6.11.5 Geometric Functions */
diff --git a/generic/include/clc/integer/max.h b/generic/include/clc/integer/max.h
new file mode 100644
index 0000000..e74a459
--- /dev/null
+++ b/generic/include/clc/integer/max.h
@@ -0,0 +1,2 @@
+#define BODY <clc/integer/max.inc>
+#include <clc/integer/gentype.inc>
diff --git a/generic/include/clc/integer/max.inc b/generic/include/clc/integer/max.inc
new file mode 100644
index 0000000..ce6c6d0
--- /dev/null
+++ b/generic/include/clc/integer/max.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL GENTYPE max(GENTYPE a, GENTYPE b);
diff --git a/generic/include/clc/math/max.h b/generic/include/clc/math/max.h
new file mode 100644
index 0000000..3d158f1
--- /dev/null
+++ b/generic/include/clc/math/max.h
@@ -0,0 +1,2 @@
+#define BODY <clc/math/max.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/include/clc/math/max.inc b/generic/include/clc/math/max.inc
new file mode 100644
index 0000000..ce6c6d0
--- /dev/null
+++ b/generic/include/clc/math/max.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL GENTYPE max(GENTYPE a, GENTYPE b);
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 86c008b..b593941 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -7,6 +7,7 @@ integer/abs.cl
 integer/add_sat.cl
 integer/add_sat.ll
 integer/add_sat_impl.ll
+integer/max.cl
 integer/sub_sat.cl
 integer/sub_sat.ll
 integer/sub_sat_impl.ll
@@ -14,6 +15,7 @@ math/fmax.cl
 math/fmin.cl
 math/hypot.cl
 math/mad.cl
+math/max.cl
 relational/any.cl
 workitem/get_global_id.cl
 workitem/get_global_size.cl
diff --git a/generic/lib/integer/max.cl b/generic/lib/integer/max.cl
new file mode 100644
index 0000000..89fec7c
--- /dev/null
+++ b/generic/lib/integer/max.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+
+#define BODY <max.inc>
+#include <clc/integer/gentype.inc>
diff --git a/generic/lib/integer/max.inc b/generic/lib/integer/max.inc
new file mode 100644
index 0000000..37409fc
--- /dev/null
+++ b/generic/lib/integer/max.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DEF GENTYPE max(GENTYPE a, GENTYPE b) {
+  return (a > b ? a : b);
+}
diff --git a/generic/lib/math/max.cl b/generic/lib/math/max.cl
new file mode 100644
index 0000000..d1254a7
--- /dev/null
+++ b/generic/lib/math/max.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define BODY <max.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/lib/math/max.inc b/generic/lib/math/max.inc
new file mode 100644
index 0000000..37409fc
--- /dev/null
+++ b/generic/lib/math/max.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DEF GENTYPE max(GENTYPE a, GENTYPE b) {
+  return (a > b ? a : b);
+}
-- 
cgit v1.2.3


From 56a70faadd8266175535c49f9296c0976662e9d2 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:50 +0000
Subject: libclc: Fix abs_diff builtin integer function

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184993 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/lib/SOURCES              | 1 +
 generic/lib/integer/abs_diff.inc | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index b593941..a97213b 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -4,6 +4,7 @@ geometric/dot.cl
 geometric/length.cl
 geometric/normalize.cl
 integer/abs.cl
+integer/abs_diff.cl
 integer/add_sat.cl
 integer/add_sat.ll
 integer/add_sat_impl.ll
diff --git a/generic/lib/integer/abs_diff.inc b/generic/lib/integer/abs_diff.inc
index 93efdba..6ad57ee 100644
--- a/generic/lib/integer/abs_diff.inc
+++ b/generic/lib/integer/abs_diff.inc
@@ -1,3 +1,3 @@
-_CLC_OVERLOAD _CLC_DEF UGENTYPE abs_diff(GENTYPE x) {
+_CLC_OVERLOAD _CLC_DEF UGENTYPE abs_diff(GENTYPE x, GENTYPE y) {
   return __builtin_astype((GENTYPE)(x > y ? x-y : y-x), UGENTYPE);
 }
-- 
cgit v1.2.3


From 213e719bf2c298d99efda128d87c80f0cc737aa9 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:20:56 +0000
Subject: libclc: Add clamp() builtin for integer/floating point

Created under a new shared/ directory for functions which are available for
both integer and floating point types.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184994 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h            |  3 +++
 generic/include/clc/shared/clamp.h   |  5 +++++
 generic/include/clc/shared/clamp.inc |  1 +
 generic/lib/SOURCES                  |  1 +
 generic/lib/shared/clamp.cl          | 11 +++++++++++
 generic/lib/shared/clamp.inc         |  3 +++
 6 files changed, 24 insertions(+)
 create mode 100644 generic/include/clc/shared/clamp.h
 create mode 100644 generic/include/clc/shared/clamp.inc
 create mode 100644 generic/lib/shared/clamp.cl
 create mode 100644 generic/lib/shared/clamp.inc

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index f6668a3..80ecd01 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -67,6 +67,9 @@
 #include <clc/integer/max.h>
 #include <clc/integer/sub_sat.h>
 
+/* 6.11.2 and 6.11.3 Shared Integer/Math Functions */
+#include <clc/shared/clamp.h>
+
 /* 6.11.5 Geometric Functions */
 #include <clc/geometric/cross.h>
 #include <clc/geometric/dot.h>
diff --git a/generic/include/clc/shared/clamp.h b/generic/include/clc/shared/clamp.h
new file mode 100644
index 0000000..5c2ebd0
--- /dev/null
+++ b/generic/include/clc/shared/clamp.h
@@ -0,0 +1,5 @@
+#define BODY <clc/shared/clamp.inc>
+#include <clc/integer/gentype.inc>
+
+#define BODY <clc/shared/clamp.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/include/clc/shared/clamp.inc b/generic/include/clc/shared/clamp.inc
new file mode 100644
index 0000000..3e3a435
--- /dev/null
+++ b/generic/include/clc/shared/clamp.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL GENTYPE clamp(GENTYPE x, GENTYPE y, GENTYPE z);
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index a97213b..0d477ba 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -18,5 +18,6 @@ math/hypot.cl
 math/mad.cl
 math/max.cl
 relational/any.cl
+shared/clamp.cl
 workitem/get_global_id.cl
 workitem/get_global_size.cl
diff --git a/generic/lib/shared/clamp.cl b/generic/lib/shared/clamp.cl
new file mode 100644
index 0000000..0e8d223
--- /dev/null
+++ b/generic/lib/shared/clamp.cl
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+#define BODY <clamp.inc>
+#include <clc/integer/gentype.inc>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define BODY <clamp.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/lib/shared/clamp.inc b/generic/lib/shared/clamp.inc
new file mode 100644
index 0000000..ed49b8e
--- /dev/null
+++ b/generic/lib/shared/clamp.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DEF GENTYPE clamp(GENTYPE x, GENTYPE y, GENTYPE z) {
+  return (x > z ? z : (x < y ? y : x));
+}
-- 
cgit v1.2.3


From 15ba684e4eb0d73bc208766869161f0af1f120ee Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:21:06 +0000
Subject: libclc: Move max builtin to shared/

max(x, y) is available for all integer and floating-point types.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184995 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h           |  3 +--
 generic/include/clc/integer/max.h   |  2 --
 generic/include/clc/integer/max.inc |  1 -
 generic/include/clc/math/max.h      |  2 --
 generic/include/clc/math/max.inc    |  1 -
 generic/include/clc/shared/max.h    |  5 +++++
 generic/include/clc/shared/max.inc  |  1 +
 generic/lib/SOURCES                 |  3 +--
 generic/lib/integer/max.cl          |  4 ----
 generic/lib/integer/max.inc         |  3 ---
 generic/lib/math/max.cl             |  8 --------
 generic/lib/math/max.inc            |  3 ---
 generic/lib/shared/max.cl           | 11 +++++++++++
 generic/lib/shared/max.inc          |  3 +++
 14 files changed, 22 insertions(+), 28 deletions(-)
 delete mode 100644 generic/include/clc/integer/max.h
 delete mode 100644 generic/include/clc/integer/max.inc
 delete mode 100644 generic/include/clc/math/max.h
 delete mode 100644 generic/include/clc/math/max.inc
 create mode 100644 generic/include/clc/shared/max.h
 create mode 100644 generic/include/clc/shared/max.inc
 delete mode 100644 generic/lib/integer/max.cl
 delete mode 100644 generic/lib/integer/max.inc
 delete mode 100644 generic/lib/math/max.cl
 delete mode 100644 generic/lib/math/max.inc
 create mode 100644 generic/lib/shared/max.cl
 create mode 100644 generic/lib/shared/max.inc

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 80ecd01..c3d7d59 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -45,7 +45,6 @@
 #include <clc/math/log.h>
 #include <clc/math/log2.h>
 #include <clc/math/mad.h>
-#include <clc/math/max.h>
 #include <clc/math/pow.h>
 #include <clc/math/sin.h>
 #include <clc/math/sqrt.h>
@@ -64,11 +63,11 @@
 #include <clc/integer/abs.h>
 #include <clc/integer/abs_diff.h>
 #include <clc/integer/add_sat.h>
-#include <clc/integer/max.h>
 #include <clc/integer/sub_sat.h>
 
 /* 6.11.2 and 6.11.3 Shared Integer/Math Functions */
 #include <clc/shared/clamp.h>
+#include <clc/shared/max.h>
 
 /* 6.11.5 Geometric Functions */
 #include <clc/geometric/cross.h>
diff --git a/generic/include/clc/integer/max.h b/generic/include/clc/integer/max.h
deleted file mode 100644
index e74a459..0000000
--- a/generic/include/clc/integer/max.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define BODY <clc/integer/max.inc>
-#include <clc/integer/gentype.inc>
diff --git a/generic/include/clc/integer/max.inc b/generic/include/clc/integer/max.inc
deleted file mode 100644
index ce6c6d0..0000000
--- a/generic/include/clc/integer/max.inc
+++ /dev/null
@@ -1 +0,0 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE max(GENTYPE a, GENTYPE b);
diff --git a/generic/include/clc/math/max.h b/generic/include/clc/math/max.h
deleted file mode 100644
index 3d158f1..0000000
--- a/generic/include/clc/math/max.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define BODY <clc/math/max.inc>
-#include <clc/math/gentype.inc>
diff --git a/generic/include/clc/math/max.inc b/generic/include/clc/math/max.inc
deleted file mode 100644
index ce6c6d0..0000000
--- a/generic/include/clc/math/max.inc
+++ /dev/null
@@ -1 +0,0 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE max(GENTYPE a, GENTYPE b);
diff --git a/generic/include/clc/shared/max.h b/generic/include/clc/shared/max.h
new file mode 100644
index 0000000..7967d4a
--- /dev/null
+++ b/generic/include/clc/shared/max.h
@@ -0,0 +1,5 @@
+#define BODY <clc/shared/max.inc>
+#include <clc/integer/gentype.inc>
+
+#define BODY <clc/shared/max.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/include/clc/shared/max.inc b/generic/include/clc/shared/max.inc
new file mode 100644
index 0000000..ce6c6d0
--- /dev/null
+++ b/generic/include/clc/shared/max.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL GENTYPE max(GENTYPE a, GENTYPE b);
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 0d477ba..f639c83 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -8,7 +8,6 @@ integer/abs_diff.cl
 integer/add_sat.cl
 integer/add_sat.ll
 integer/add_sat_impl.ll
-integer/max.cl
 integer/sub_sat.cl
 integer/sub_sat.ll
 integer/sub_sat_impl.ll
@@ -16,8 +15,8 @@ math/fmax.cl
 math/fmin.cl
 math/hypot.cl
 math/mad.cl
-math/max.cl
 relational/any.cl
 shared/clamp.cl
+shared/max.cl
 workitem/get_global_id.cl
 workitem/get_global_size.cl
diff --git a/generic/lib/integer/max.cl b/generic/lib/integer/max.cl
deleted file mode 100644
index 89fec7c..0000000
--- a/generic/lib/integer/max.cl
+++ /dev/null
@@ -1,4 +0,0 @@
-#include <clc/clc.h>
-
-#define BODY <max.inc>
-#include <clc/integer/gentype.inc>
diff --git a/generic/lib/integer/max.inc b/generic/lib/integer/max.inc
deleted file mode 100644
index 37409fc..0000000
--- a/generic/lib/integer/max.inc
+++ /dev/null
@@ -1,3 +0,0 @@
-_CLC_OVERLOAD _CLC_DEF GENTYPE max(GENTYPE a, GENTYPE b) {
-  return (a > b ? a : b);
-}
diff --git a/generic/lib/math/max.cl b/generic/lib/math/max.cl
deleted file mode 100644
index d1254a7..0000000
--- a/generic/lib/math/max.cl
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <clc/clc.h>
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#endif
-
-#define BODY <max.inc>
-#include <clc/math/gentype.inc>
diff --git a/generic/lib/math/max.inc b/generic/lib/math/max.inc
deleted file mode 100644
index 37409fc..0000000
--- a/generic/lib/math/max.inc
+++ /dev/null
@@ -1,3 +0,0 @@
-_CLC_OVERLOAD _CLC_DEF GENTYPE max(GENTYPE a, GENTYPE b) {
-  return (a > b ? a : b);
-}
diff --git a/generic/lib/shared/max.cl b/generic/lib/shared/max.cl
new file mode 100644
index 0000000..5a48537
--- /dev/null
+++ b/generic/lib/shared/max.cl
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+#define BODY <max.inc>
+#include <clc/integer/gentype.inc>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define BODY <max.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/lib/shared/max.inc b/generic/lib/shared/max.inc
new file mode 100644
index 0000000..37409fc
--- /dev/null
+++ b/generic/lib/shared/max.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DEF GENTYPE max(GENTYPE a, GENTYPE b) {
+  return (a > b ? a : b);
+}
-- 
cgit v1.2.3


From f1bf4b8307660f83653a5cab6f6fe1567a58f4a1 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:21:13 +0000
Subject: libclc: implement rotate builtin

This implementation does a lot of bit shifting and masking. Suffice it to say,
this is somewhat suboptimal, but it does appear to produce correct results
(after the piglit tests were corrected for sign-extension issues).

Someone who knows LLVM better than I do could rewrite this more efficiently.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184996 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h               |  1 +
 generic/include/clc/integer/gentype.inc | 11 +++++++++++
 generic/include/clc/integer/rotate.h    |  2 ++
 generic/include/clc/integer/rotate.inc  |  1 +
 generic/lib/SOURCES                     |  1 +
 generic/lib/integer/rotate.cl           |  4 ++++
 generic/lib/integer/rotate.inc          | 35 +++++++++++++++++++++++++++++++++
 7 files changed, 55 insertions(+)
 create mode 100644 generic/include/clc/integer/rotate.h
 create mode 100644 generic/include/clc/integer/rotate.inc
 create mode 100644 generic/lib/integer/rotate.cl
 create mode 100644 generic/lib/integer/rotate.inc

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index c3d7d59..72f518a 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -63,6 +63,7 @@
 #include <clc/integer/abs.h>
 #include <clc/integer/abs_diff.h>
 #include <clc/integer/add_sat.h>
+#include <clc/integer/rotate.h>
 #include <clc/integer/sub_sat.h>
 
 /* 6.11.2 and 6.11.3 Shared Integer/Math Functions */
diff --git a/generic/include/clc/integer/gentype.inc b/generic/include/clc/integer/gentype.inc
index 0b32efd..005b9af 100644
--- a/generic/include/clc/integer/gentype.inc
+++ b/generic/include/clc/integer/gentype.inc
@@ -1,3 +1,4 @@
+#define GENSIZE 8
 #define GENTYPE char
 #define UGENTYPE uchar
 #define SGENTYPE char
@@ -94,6 +95,9 @@
 #undef UGENTYPE
 #undef SGENTYPE
 
+#undef GENSIZE
+#define GENSIZE 16
+
 #define GENTYPE short
 #define UGENTYPE ushort
 #define SGENTYPE short
@@ -190,6 +194,9 @@
 #undef UGENTYPE
 #undef SGENTYPE
 
+#undef GENSIZE
+#define GENSIZE 32
+
 #define GENTYPE int
 #define UGENTYPE uint
 #define SGENTYPE int
@@ -286,6 +293,9 @@
 #undef UGENTYPE
 #undef SGENTYPE
 
+#undef GENSIZE
+#define GENSIZE 64
+
 #define GENTYPE long
 #define UGENTYPE ulong
 #define SGENTYPE long
@@ -382,4 +392,5 @@
 #undef UGENTYPE
 #undef SGENTYPE
 
+#undef GENSIZE
 #undef BODY
diff --git a/generic/include/clc/integer/rotate.h b/generic/include/clc/integer/rotate.h
new file mode 100644
index 0000000..e163bc8
--- /dev/null
+++ b/generic/include/clc/integer/rotate.h
@@ -0,0 +1,2 @@
+#define BODY <clc/integer/rotate.inc>
+#include <clc/integer/gentype.inc>
diff --git a/generic/include/clc/integer/rotate.inc b/generic/include/clc/integer/rotate.inc
new file mode 100644
index 0000000..5720e1c
--- /dev/null
+++ b/generic/include/clc/integer/rotate.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL GENTYPE rotate(GENTYPE x, GENTYPE y);
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index f639c83..495b3e7 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -8,6 +8,7 @@ integer/abs_diff.cl
 integer/add_sat.cl
 integer/add_sat.ll
 integer/add_sat_impl.ll
+integer/rotate.cl
 integer/sub_sat.cl
 integer/sub_sat.ll
 integer/sub_sat_impl.ll
diff --git a/generic/lib/integer/rotate.cl b/generic/lib/integer/rotate.cl
new file mode 100644
index 0000000..d7eff2b
--- /dev/null
+++ b/generic/lib/integer/rotate.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+
+#define BODY <rotate.inc>
+#include <clc/integer/gentype.inc>
diff --git a/generic/lib/integer/rotate.inc b/generic/lib/integer/rotate.inc
new file mode 100644
index 0000000..e83dd51
--- /dev/null
+++ b/generic/lib/integer/rotate.inc
@@ -0,0 +1,35 @@
+/**
+ * Not necessarily optimal... but it produces correct results (at least for int)
+ * If we're lucky, LLVM will recognize the pattern and produce rotate
+ * instructions:
+ * http://llvm.1065342.n5.nabble.com/rotate-td47679.html
+ * 
+ * Eventually, someone should feel free to implement an llvm-specific version
+ */
+
+_CLC_OVERLOAD _CLC_DEF GENTYPE rotate(GENTYPE x, GENTYPE n){
+    //Try to avoid extra work if someone's spinning the value through multiple
+    //full rotations
+    n = n % (GENTYPE)GENSIZE;
+    
+    //Determine if we're doing a right or left shift on each component
+    //The actual shift algorithm is based on a rotate right
+    //e.g. a rotate of int by 5 bits becomes rotate right by 26 bits
+    //     and a rotate of int by -4 bits becomes rotate right by 4
+    GENTYPE amt = (n > (GENTYPE)0 ? (GENTYPE)GENSIZE - n : (GENTYPE)0 - n );
+    
+    //Calculate the bits that will wrap
+    GENTYPE mask = ( (GENTYPE)1 << amt ) - (GENTYPE)1;
+    GENTYPE wrapped_bits = x & mask;
+    
+    //Shift the input value right and then AND a mask that eliminates
+    //sign-extension interference
+    //if the rotate amount is 0, just use ~0 for a mask
+    GENTYPE se_mask = !amt ? ~((GENTYPE)0) : 
+        ( ( (GENTYPE)1 << ((GENTYPE)GENSIZE - amt) ) - (GENTYPE)1 );
+    GENTYPE unwrapped_bits = x >> amt;
+    unwrapped_bits &= se_mask;
+    
+    //Finally shift the input right after moving the wrapped bits into position
+    return unwrapped_bits | (wrapped_bits << ( (GENTYPE)GENSIZE - amt ) );
+}
\ No newline at end of file
-- 
cgit v1.2.3


From 160ef139d86fe26f383518a1ef84b9a216cf207e Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:21:18 +0000
Subject: Simplify rotate implementation a bit..

Much more understandable/readable as a result, and probably more efficient.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184997 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/integer/gentype.inc | 16 +++++++++++++
 generic/lib/integer/rotate.inc          | 42 ++++++++++++++++-----------------
 2 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/generic/include/clc/integer/gentype.inc b/generic/include/clc/integer/gentype.inc
index 005b9af..dd7d061 100644
--- a/generic/include/clc/integer/gentype.inc
+++ b/generic/include/clc/integer/gentype.inc
@@ -2,7 +2,9 @@
 #define GENTYPE char
 #define UGENTYPE uchar
 #define SGENTYPE char
+#define SCALAR 1
 #include BODY
+#undef SCALAR
 #undef GENTYPE
 #undef UGENTYPE
 #undef SGENTYPE
@@ -50,7 +52,9 @@
 #define GENTYPE uchar
 #define UGENTYPE uchar
 #define SGENTYPE char
+#define SCALAR 1
 #include BODY
+#undef SCALAR
 #undef GENTYPE
 #undef UGENTYPE
 #undef SGENTYPE
@@ -101,7 +105,9 @@
 #define GENTYPE short
 #define UGENTYPE ushort
 #define SGENTYPE short
+#define SCALAR 1
 #include BODY
+#undef SCALAR
 #undef GENTYPE
 #undef UGENTYPE
 #undef SGENTYPE
@@ -149,7 +155,9 @@
 #define GENTYPE ushort
 #define UGENTYPE ushort
 #define SGENTYPE short
+#define SCALAR 1
 #include BODY
+#undef SCALAR
 #undef GENTYPE
 #undef UGENTYPE
 #undef SGENTYPE
@@ -200,7 +208,9 @@
 #define GENTYPE int
 #define UGENTYPE uint
 #define SGENTYPE int
+#define SCALAR 1
 #include BODY
+#undef SCALAR
 #undef GENTYPE
 #undef UGENTYPE
 #undef SGENTYPE
@@ -248,7 +258,9 @@
 #define GENTYPE uint
 #define UGENTYPE uint
 #define SGENTYPE int
+#define SCALAR 1
 #include BODY
+#undef SCALAR
 #undef GENTYPE
 #undef UGENTYPE
 #undef SGENTYPE
@@ -299,7 +311,9 @@
 #define GENTYPE long
 #define UGENTYPE ulong
 #define SGENTYPE long
+#define SCALAR 1
 #include BODY
+#undef SCALAR
 #undef GENTYPE
 #undef UGENTYPE
 #undef SGENTYPE
@@ -347,7 +361,9 @@
 #define GENTYPE ulong
 #define UGENTYPE ulong
 #define SGENTYPE long
+#define SCALAR 1
 #include BODY
+#undef SCALAR
 #undef GENTYPE
 #undef UGENTYPE
 #undef SGENTYPE
diff --git a/generic/lib/integer/rotate.inc b/generic/lib/integer/rotate.inc
index e83dd51..7792a97 100644
--- a/generic/lib/integer/rotate.inc
+++ b/generic/lib/integer/rotate.inc
@@ -11,25 +11,25 @@ _CLC_OVERLOAD _CLC_DEF GENTYPE rotate(GENTYPE x, GENTYPE n){
     //Try to avoid extra work if someone's spinning the value through multiple
     //full rotations
     n = n % (GENTYPE)GENSIZE;
-    
-    //Determine if we're doing a right or left shift on each component
-    //The actual shift algorithm is based on a rotate right
-    //e.g. a rotate of int by 5 bits becomes rotate right by 26 bits
-    //     and a rotate of int by -4 bits becomes rotate right by 4
-    GENTYPE amt = (n > (GENTYPE)0 ? (GENTYPE)GENSIZE - n : (GENTYPE)0 - n );
-    
-    //Calculate the bits that will wrap
-    GENTYPE mask = ( (GENTYPE)1 << amt ) - (GENTYPE)1;
-    GENTYPE wrapped_bits = x & mask;
-    
-    //Shift the input value right and then AND a mask that eliminates
-    //sign-extension interference
-    //if the rotate amount is 0, just use ~0 for a mask
-    GENTYPE se_mask = !amt ? ~((GENTYPE)0) : 
-        ( ( (GENTYPE)1 << ((GENTYPE)GENSIZE - amt) ) - (GENTYPE)1 );
-    GENTYPE unwrapped_bits = x >> amt;
-    unwrapped_bits &= se_mask;
-    
-    //Finally shift the input right after moving the wrapped bits into position
-    return unwrapped_bits | (wrapped_bits << ( (GENTYPE)GENSIZE - amt ) );
+
+#ifdef SCALAR
+    if (n > 0){
+        return (x << n) | (((UGENTYPE)x) >> (GENSIZE - n));
+    } else if (n == 0){
+        return x;
+    } else {
+        return ( (((UGENTYPE)x) >> -n) | (x << (GENSIZE + n)) );
+    }
+#else
+    UGENTYPE x_1 = __builtin_astype(x, UGENTYPE);
+
+    UGENTYPE amt;
+    amt = (n < (GENTYPE)0 ? __builtin_astype((GENTYPE)0-n, UGENTYPE) : (UGENTYPE)0);
+    x_1 = (x_1 >> amt) | (x_1 << ((UGENTYPE)GENSIZE - amt));
+
+    amt = (n < (GENTYPE)0 ? (UGENTYPE)0 : __builtin_astype(n, UGENTYPE));
+    x_1 = (x_1 << amt) | (x_1 >> ((UGENTYPE)GENSIZE - amt));
+
+    return __builtin_astype(x_1, GENTYPE);
+#endif
 }
\ No newline at end of file
-- 
cgit v1.2.3


From a6c4ea315053dc20ced08f9e0f9efb545e2179b7 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:21:22 +0000
Subject: Add a TODO note.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184998 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/lib/integer/rotate.inc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/generic/lib/integer/rotate.inc b/generic/lib/integer/rotate.inc
index 7792a97..b34ffed 100644
--- a/generic/lib/integer/rotate.inc
+++ b/generic/lib/integer/rotate.inc
@@ -21,6 +21,10 @@ _CLC_OVERLOAD _CLC_DEF GENTYPE rotate(GENTYPE x, GENTYPE n){
         return ( (((UGENTYPE)x) >> -n) | (x << (GENSIZE + n)) );
     }
 #else
+    //XXX: There's a lot of __builtin_astype calls to cast everything to
+    //     unsigned ... This should be improved so that if GENTYPE==UGENTYPE, no
+    //     casts are required.
+    
     UGENTYPE x_1 = __builtin_astype(x, UGENTYPE);
 
     UGENTYPE amt;
-- 
cgit v1.2.3


From 7a2f843ed11f41c55b94c57769e9987457979cf0 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:21:25 +0000
Subject: Add another TODO note.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@184999 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/lib/integer/rotate.inc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/generic/lib/integer/rotate.inc b/generic/lib/integer/rotate.inc
index b34ffed..2aa6cc9 100644
--- a/generic/lib/integer/rotate.inc
+++ b/generic/lib/integer/rotate.inc
@@ -27,6 +27,9 @@ _CLC_OVERLOAD _CLC_DEF GENTYPE rotate(GENTYPE x, GENTYPE n){
     
     UGENTYPE x_1 = __builtin_astype(x, UGENTYPE);
 
+    //XXX: Is (UGENTYPE >> SGENTYPE) | (UGENTYPE << SGENTYPE) legal?
+    //     If so, then combine the amt and shifts into a single set of statements
+    
     UGENTYPE amt;
     amt = (n < (GENTYPE)0 ? __builtin_astype((GENTYPE)0-n, UGENTYPE) : (UGENTYPE)0);
     x_1 = (x_1 >> amt) | (x_1 << ((UGENTYPE)GENSIZE - amt));
-- 
cgit v1.2.3


From 547d4512a5515de8dbbe433a4502f2e0ff20fce5 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:21:31 +0000
Subject: libclc: Rename [add|sub]_sat.ll to [add|sub]_sat_if.ll

configure.py allows overloading *.cl with *.ll, but will only ever build
the first file listed in SOURCES of ${file}.cl and ${file}.ll

add_sat, sub_sat, (and the soon to be submitted clz) all define interfaces in
${function_name}.ll which are implemented in ${function_name}_impl.ll.

Renaming the interface files is enough to get them to build again, fixing
CL usage of these functions.

Tested on clover/r600g.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185000 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/lib/SOURCES               |  4 +--
 generic/lib/integer/add_sat.ll    | 55 ---------------------------------------
 generic/lib/integer/add_sat_if.ll | 55 +++++++++++++++++++++++++++++++++++++++
 generic/lib/integer/sub_sat.ll    | 55 ---------------------------------------
 generic/lib/integer/sub_sat_if.ll | 55 +++++++++++++++++++++++++++++++++++++++
 ptx/lib/OVERRIDES                 |  2 ++
 6 files changed, 114 insertions(+), 112 deletions(-)
 delete mode 100644 generic/lib/integer/add_sat.ll
 create mode 100644 generic/lib/integer/add_sat_if.ll
 delete mode 100644 generic/lib/integer/sub_sat.ll
 create mode 100644 generic/lib/integer/sub_sat_if.ll
 create mode 100644 ptx/lib/OVERRIDES

diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 495b3e7..7f27ec4 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -6,11 +6,11 @@ geometric/normalize.cl
 integer/abs.cl
 integer/abs_diff.cl
 integer/add_sat.cl
-integer/add_sat.ll
+integer/add_sat_if.ll
 integer/add_sat_impl.ll
 integer/rotate.cl
 integer/sub_sat.cl
-integer/sub_sat.ll
+integer/sub_sat_if.ll
 integer/sub_sat_impl.ll
 math/fmax.cl
 math/fmin.cl
diff --git a/generic/lib/integer/add_sat.ll b/generic/lib/integer/add_sat.ll
deleted file mode 100644
index bcbe4c0..0000000
--- a/generic/lib/integer/add_sat.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-declare i8 @__clc_add_sat_impl_s8(i8 %x, i8 %y)
-
-define i8 @__clc_add_sat_s8(i8 %x, i8 %y) nounwind readnone alwaysinline {
-  %call = call i8 @__clc_add_sat_impl_s8(i8 %x, i8 %y)
-  ret i8 %call
-}
-
-declare i8 @__clc_add_sat_impl_u8(i8 %x, i8 %y)
-
-define i8 @__clc_add_sat_u8(i8 %x, i8 %y) nounwind readnone alwaysinline {
-  %call = call i8 @__clc_add_sat_impl_u8(i8 %x, i8 %y)
-  ret i8 %call
-}
-
-declare i16 @__clc_add_sat_impl_s16(i16 %x, i16 %y)
-
-define i16 @__clc_add_sat_s16(i16 %x, i16 %y) nounwind readnone alwaysinline {
-  %call = call i16 @__clc_add_sat_impl_s16(i16 %x, i16 %y)
-  ret i16 %call
-}
-
-declare i16 @__clc_add_sat_impl_u16(i16 %x, i16 %y)
-
-define i16 @__clc_add_sat_u16(i16 %x, i16 %y) nounwind readnone alwaysinline {
-  %call = call i16 @__clc_add_sat_impl_u16(i16 %x, i16 %y)
-  ret i16 %call
-}
-
-declare i32 @__clc_add_sat_impl_s32(i32 %x, i32 %y)
-
-define i32 @__clc_add_sat_s32(i32 %x, i32 %y) nounwind readnone alwaysinline {
-  %call = call i32 @__clc_add_sat_impl_s32(i32 %x, i32 %y)
-  ret i32 %call
-}
-
-declare i32 @__clc_add_sat_impl_u32(i32 %x, i32 %y)
-
-define i32 @__clc_add_sat_u32(i32 %x, i32 %y) nounwind readnone alwaysinline {
-  %call = call i32 @__clc_add_sat_impl_u32(i32 %x, i32 %y)
-  ret i32 %call
-}
-
-declare i64 @__clc_add_sat_impl_s64(i64 %x, i64 %y)
-
-define i64 @__clc_add_sat_s64(i64 %x, i64 %y) nounwind readnone alwaysinline {
-  %call = call i64 @__clc_add_sat_impl_s64(i64 %x, i64 %y)
-  ret i64 %call
-}
-
-declare i64 @__clc_add_sat_impl_u64(i64 %x, i64 %y)
-
-define i64 @__clc_add_sat_u64(i64 %x, i64 %y) nounwind readnone alwaysinline {
-  %call = call i64 @__clc_add_sat_impl_u64(i64 %x, i64 %y)
-  ret i64 %call
-}
diff --git a/generic/lib/integer/add_sat_if.ll b/generic/lib/integer/add_sat_if.ll
new file mode 100644
index 0000000..bcbe4c0
--- /dev/null
+++ b/generic/lib/integer/add_sat_if.ll
@@ -0,0 +1,55 @@
+declare i8 @__clc_add_sat_impl_s8(i8 %x, i8 %y)
+
+define i8 @__clc_add_sat_s8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+  %call = call i8 @__clc_add_sat_impl_s8(i8 %x, i8 %y)
+  ret i8 %call
+}
+
+declare i8 @__clc_add_sat_impl_u8(i8 %x, i8 %y)
+
+define i8 @__clc_add_sat_u8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+  %call = call i8 @__clc_add_sat_impl_u8(i8 %x, i8 %y)
+  ret i8 %call
+}
+
+declare i16 @__clc_add_sat_impl_s16(i16 %x, i16 %y)
+
+define i16 @__clc_add_sat_s16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+  %call = call i16 @__clc_add_sat_impl_s16(i16 %x, i16 %y)
+  ret i16 %call
+}
+
+declare i16 @__clc_add_sat_impl_u16(i16 %x, i16 %y)
+
+define i16 @__clc_add_sat_u16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+  %call = call i16 @__clc_add_sat_impl_u16(i16 %x, i16 %y)
+  ret i16 %call
+}
+
+declare i32 @__clc_add_sat_impl_s32(i32 %x, i32 %y)
+
+define i32 @__clc_add_sat_s32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+  %call = call i32 @__clc_add_sat_impl_s32(i32 %x, i32 %y)
+  ret i32 %call
+}
+
+declare i32 @__clc_add_sat_impl_u32(i32 %x, i32 %y)
+
+define i32 @__clc_add_sat_u32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+  %call = call i32 @__clc_add_sat_impl_u32(i32 %x, i32 %y)
+  ret i32 %call
+}
+
+declare i64 @__clc_add_sat_impl_s64(i64 %x, i64 %y)
+
+define i64 @__clc_add_sat_s64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+  %call = call i64 @__clc_add_sat_impl_s64(i64 %x, i64 %y)
+  ret i64 %call
+}
+
+declare i64 @__clc_add_sat_impl_u64(i64 %x, i64 %y)
+
+define i64 @__clc_add_sat_u64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+  %call = call i64 @__clc_add_sat_impl_u64(i64 %x, i64 %y)
+  ret i64 %call
+}
diff --git a/generic/lib/integer/sub_sat.ll b/generic/lib/integer/sub_sat.ll
deleted file mode 100644
index 7252574..0000000
--- a/generic/lib/integer/sub_sat.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-declare i8 @__clc_sub_sat_impl_s8(i8 %x, i8 %y)
-
-define i8 @__clc_sub_sat_s8(i8 %x, i8 %y) nounwind readnone alwaysinline {
-  %call = call i8 @__clc_sub_sat_impl_s8(i8 %x, i8 %y)
-  ret i8 %call
-}
-
-declare i8 @__clc_sub_sat_impl_u8(i8 %x, i8 %y)
-
-define i8 @__clc_sub_sat_u8(i8 %x, i8 %y) nounwind readnone alwaysinline {
-  %call = call i8 @__clc_sub_sat_impl_u8(i8 %x, i8 %y)
-  ret i8 %call
-}
-
-declare i16 @__clc_sub_sat_impl_s16(i16 %x, i16 %y)
-
-define i16 @__clc_sub_sat_s16(i16 %x, i16 %y) nounwind readnone alwaysinline {
-  %call = call i16 @__clc_sub_sat_impl_s16(i16 %x, i16 %y)
-  ret i16 %call
-}
-
-declare i16 @__clc_sub_sat_impl_u16(i16 %x, i16 %y)
-
-define i16 @__clc_sub_sat_u16(i16 %x, i16 %y) nounwind readnone alwaysinline {
-  %call = call i16 @__clc_sub_sat_impl_u16(i16 %x, i16 %y)
-  ret i16 %call
-}
-
-declare i32 @__clc_sub_sat_impl_s32(i32 %x, i32 %y)
-
-define i32 @__clc_sub_sat_s32(i32 %x, i32 %y) nounwind readnone alwaysinline {
-  %call = call i32 @__clc_sub_sat_impl_s32(i32 %x, i32 %y)
-  ret i32 %call
-}
-
-declare i32 @__clc_sub_sat_impl_u32(i32 %x, i32 %y)
-
-define i32 @__clc_sub_sat_u32(i32 %x, i32 %y) nounwind readnone alwaysinline {
-  %call = call i32 @__clc_sub_sat_impl_u32(i32 %x, i32 %y)
-  ret i32 %call
-}
-
-declare i64 @__clc_sub_sat_impl_s64(i64 %x, i64 %y)
-
-define i64 @__clc_sub_sat_s64(i64 %x, i64 %y) nounwind readnone alwaysinline {
-  %call = call i64 @__clc_sub_sat_impl_s64(i64 %x, i64 %y)
-  ret i64 %call
-}
-
-declare i64 @__clc_sub_sat_impl_u64(i64 %x, i64 %y)
-
-define i64 @__clc_sub_sat_u64(i64 %x, i64 %y) nounwind readnone alwaysinline {
-  %call = call i64 @__clc_sub_sat_impl_u64(i64 %x, i64 %y)
-  ret i64 %call
-}
diff --git a/generic/lib/integer/sub_sat_if.ll b/generic/lib/integer/sub_sat_if.ll
new file mode 100644
index 0000000..7252574
--- /dev/null
+++ b/generic/lib/integer/sub_sat_if.ll
@@ -0,0 +1,55 @@
+declare i8 @__clc_sub_sat_impl_s8(i8 %x, i8 %y)
+
+define i8 @__clc_sub_sat_s8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+  %call = call i8 @__clc_sub_sat_impl_s8(i8 %x, i8 %y)
+  ret i8 %call
+}
+
+declare i8 @__clc_sub_sat_impl_u8(i8 %x, i8 %y)
+
+define i8 @__clc_sub_sat_u8(i8 %x, i8 %y) nounwind readnone alwaysinline {
+  %call = call i8 @__clc_sub_sat_impl_u8(i8 %x, i8 %y)
+  ret i8 %call
+}
+
+declare i16 @__clc_sub_sat_impl_s16(i16 %x, i16 %y)
+
+define i16 @__clc_sub_sat_s16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+  %call = call i16 @__clc_sub_sat_impl_s16(i16 %x, i16 %y)
+  ret i16 %call
+}
+
+declare i16 @__clc_sub_sat_impl_u16(i16 %x, i16 %y)
+
+define i16 @__clc_sub_sat_u16(i16 %x, i16 %y) nounwind readnone alwaysinline {
+  %call = call i16 @__clc_sub_sat_impl_u16(i16 %x, i16 %y)
+  ret i16 %call
+}
+
+declare i32 @__clc_sub_sat_impl_s32(i32 %x, i32 %y)
+
+define i32 @__clc_sub_sat_s32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+  %call = call i32 @__clc_sub_sat_impl_s32(i32 %x, i32 %y)
+  ret i32 %call
+}
+
+declare i32 @__clc_sub_sat_impl_u32(i32 %x, i32 %y)
+
+define i32 @__clc_sub_sat_u32(i32 %x, i32 %y) nounwind readnone alwaysinline {
+  %call = call i32 @__clc_sub_sat_impl_u32(i32 %x, i32 %y)
+  ret i32 %call
+}
+
+declare i64 @__clc_sub_sat_impl_s64(i64 %x, i64 %y)
+
+define i64 @__clc_sub_sat_s64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+  %call = call i64 @__clc_sub_sat_impl_s64(i64 %x, i64 %y)
+  ret i64 %call
+}
+
+declare i64 @__clc_sub_sat_impl_u64(i64 %x, i64 %y)
+
+define i64 @__clc_sub_sat_u64(i64 %x, i64 %y) nounwind readnone alwaysinline {
+  %call = call i64 @__clc_sub_sat_impl_u64(i64 %x, i64 %y)
+  ret i64 %call
+}
diff --git a/ptx/lib/OVERRIDES b/ptx/lib/OVERRIDES
new file mode 100644
index 0000000..475162c
--- /dev/null
+++ b/ptx/lib/OVERRIDES
@@ -0,0 +1,2 @@
+integer/add_sat_if.ll
+integer/sub_sat_if.ll
-- 
cgit v1.2.3


From a8b6a9c793b7b3e0fd39fe63445b88f23cee3fa3 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:21:38 +0000
Subject: libclc: implement initial version of min()

This doesn't handle the integer cases for min(vector, scalar).

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185001 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h          |  1 +
 generic/include/clc/shared/min.h   |  5 +++++
 generic/include/clc/shared/min.inc |  1 +
 generic/lib/SOURCES                |  1 +
 generic/lib/shared/min.cl          | 11 +++++++++++
 generic/lib/shared/min.inc         |  3 +++
 6 files changed, 22 insertions(+)
 create mode 100644 generic/include/clc/shared/min.h
 create mode 100644 generic/include/clc/shared/min.inc
 create mode 100644 generic/lib/shared/min.cl
 create mode 100644 generic/lib/shared/min.inc

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 72f518a..74f1126 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -69,6 +69,7 @@
 /* 6.11.2 and 6.11.3 Shared Integer/Math Functions */
 #include <clc/shared/clamp.h>
 #include <clc/shared/max.h>
+#include <clc/shared/min.h>
 
 /* 6.11.5 Geometric Functions */
 #include <clc/geometric/cross.h>
diff --git a/generic/include/clc/shared/min.h b/generic/include/clc/shared/min.h
new file mode 100644
index 0000000..e16b45d
--- /dev/null
+++ b/generic/include/clc/shared/min.h
@@ -0,0 +1,5 @@
+#define BODY <clc/shared/min.inc>
+#include <clc/integer/gentype.inc>
+
+#define BODY <clc/shared/min.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/include/clc/shared/min.inc b/generic/include/clc/shared/min.inc
new file mode 100644
index 0000000..3bc9880
--- /dev/null
+++ b/generic/include/clc/shared/min.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL GENTYPE min(GENTYPE a, GENTYPE b);
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 7f27ec4..eac6c60 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -19,5 +19,6 @@ math/mad.cl
 relational/any.cl
 shared/clamp.cl
 shared/max.cl
+shared/min.cl
 workitem/get_global_id.cl
 workitem/get_global_size.cl
diff --git a/generic/lib/shared/min.cl b/generic/lib/shared/min.cl
new file mode 100644
index 0000000..49481cb
--- /dev/null
+++ b/generic/lib/shared/min.cl
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+#define BODY <min.inc>
+#include <clc/integer/gentype.inc>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define BODY <min.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/lib/shared/min.inc b/generic/lib/shared/min.inc
new file mode 100644
index 0000000..b99bc35
--- /dev/null
+++ b/generic/lib/shared/min.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DEF GENTYPE min(GENTYPE a, GENTYPE b) {
+  return (a < b ? a : b);
+}
-- 
cgit v1.2.3


From b05141061f4f14eb497f0ee1a578fb30fb220c4d Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:21:44 +0000
Subject: libclc: Implement the min(vec, scalar) version of the min builtin.

Checks if the current GENTYPE is scalar, and if not, then defines a separate
implementation of the function which casts the second arg to vector before
proceeding.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185002 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/integer/gentype.inc | 23 +++++++++++++++++++++++
 generic/include/clc/math/gentype.inc    |  8 ++++++++
 generic/include/clc/shared/min.inc      |  4 ++++
 generic/lib/shared/min.inc              |  6 ++++++
 4 files changed, 41 insertions(+)

diff --git a/generic/include/clc/integer/gentype.inc b/generic/include/clc/integer/gentype.inc
index dd7d061..95a37d5 100644
--- a/generic/include/clc/integer/gentype.inc
+++ b/generic/include/clc/integer/gentype.inc
@@ -1,4 +1,8 @@
+//These 2 defines only change when switching between data sizes or base types to
+//keep this file manageable.
 #define GENSIZE 8
+#define SCALAR_GENTYPE char
+
 #define GENTYPE char
 #define UGENTYPE uchar
 #define SGENTYPE char
@@ -49,6 +53,9 @@
 #undef UGENTYPE
 #undef SGENTYPE
 
+#undef SCALAR_GENTYPE
+#define SCALAR_GENTYPE uchar
+
 #define GENTYPE uchar
 #define UGENTYPE uchar
 #define SGENTYPE char
@@ -101,6 +108,8 @@
 
 #undef GENSIZE
 #define GENSIZE 16
+#undef SCALAR_GENTYPE
+#define SCALAR_GENTYPE short
 
 #define GENTYPE short
 #define UGENTYPE ushort
@@ -152,6 +161,9 @@
 #undef UGENTYPE
 #undef SGENTYPE
 
+#undef SCALAR_GENTYPE
+#define SCALAR_GENTYPE ushort
+
 #define GENTYPE ushort
 #define UGENTYPE ushort
 #define SGENTYPE short
@@ -204,6 +216,8 @@
 
 #undef GENSIZE
 #define GENSIZE 32
+#undef SCALAR_GENTYPE
+#define SCALAR_GENTYPE int
 
 #define GENTYPE int
 #define UGENTYPE uint
@@ -255,6 +269,9 @@
 #undef UGENTYPE
 #undef SGENTYPE
 
+#undef SCALAR_GENTYPE
+#define SCALAR_GENTYPE uint
+
 #define GENTYPE uint
 #define UGENTYPE uint
 #define SGENTYPE int
@@ -307,6 +324,8 @@
 
 #undef GENSIZE
 #define GENSIZE 64
+#undef SCALAR_GENTYPE
+#define SCALAR_GENTYPE long
 
 #define GENTYPE long
 #define UGENTYPE ulong
@@ -358,6 +377,9 @@
 #undef UGENTYPE
 #undef SGENTYPE
 
+#undef SCALAR_GENTYPE
+#define SCALAR_GENTYPE ulong
+
 #define GENTYPE ulong
 #define UGENTYPE ulong
 #define SGENTYPE long
@@ -409,4 +431,5 @@
 #undef SGENTYPE
 
 #undef GENSIZE
+#undef SCALAR_GENTYPE
 #undef BODY
diff --git a/generic/include/clc/math/gentype.inc b/generic/include/clc/math/gentype.inc
index b525c4b..4ed2151 100644
--- a/generic/include/clc/math/gentype.inc
+++ b/generic/include/clc/math/gentype.inc
@@ -1,3 +1,5 @@
+#define SCALAR_GENTYPE float
+
 #define GENTYPE float
 #define SCALAR
 #include BODY
@@ -24,7 +26,11 @@
 #include BODY
 #undef GENTYPE
 
+#undef SCALAR_GENTYPE
+
 #ifdef cl_khr_fp64
+#define SCALAR_GENTYPE double
+
 #define SCALAR
 #define GENTYPE double
 #include BODY
@@ -50,6 +56,8 @@
 #define GENTYPE double16
 #include BODY
 #undef GENTYPE
+
+#undef SCALAR_GENTYPE
 #endif
 
 #undef BODY
diff --git a/generic/include/clc/shared/min.inc b/generic/include/clc/shared/min.inc
index 3bc9880..cf3afaf 100644
--- a/generic/include/clc/shared/min.inc
+++ b/generic/include/clc/shared/min.inc
@@ -1 +1,5 @@
 _CLC_OVERLOAD _CLC_DECL GENTYPE min(GENTYPE a, GENTYPE b);
+
+#ifndef SCALAR
+_CLC_OVERLOAD _CLC_DECL GENTYPE min(GENTYPE a, SCALAR_GENTYPE b);
+#endif
\ No newline at end of file
diff --git a/generic/lib/shared/min.inc b/generic/lib/shared/min.inc
index b99bc35..58a22e1 100644
--- a/generic/lib/shared/min.inc
+++ b/generic/lib/shared/min.inc
@@ -1,3 +1,9 @@
 _CLC_OVERLOAD _CLC_DEF GENTYPE min(GENTYPE a, GENTYPE b) {
   return (a < b ? a : b);
 }
+
+#ifndef SCALAR
+_CLC_OVERLOAD _CLC_DEF GENTYPE min(GENTYPE a, SCALAR_GENTYPE b) {
+  return (a < (GENTYPE)b ? a : (GENTYPE)b);
+}
+#endif
-- 
cgit v1.2.3


From 7424a799130628aae0c61e78299e4f5e06b2bd74 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:21:49 +0000
Subject: libclc: Add clamp(vec, scalar, scalar) and max(vec, scalar)

For any GENTYPE that isn't scalar, we need to implement a mixed
vector/scalar version of clamp/max.

This depends on the min() patches I sent to the list a few minutes ago.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185003 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/shared/clamp.inc | 4 ++++
 generic/include/clc/shared/max.inc   | 4 ++++
 generic/lib/shared/clamp.inc         | 6 ++++++
 generic/lib/shared/max.inc           | 6 ++++++
 4 files changed, 20 insertions(+)

diff --git a/generic/include/clc/shared/clamp.inc b/generic/include/clc/shared/clamp.inc
index 3e3a435..67c8142 100644
--- a/generic/include/clc/shared/clamp.inc
+++ b/generic/include/clc/shared/clamp.inc
@@ -1 +1,5 @@
 _CLC_OVERLOAD _CLC_DECL GENTYPE clamp(GENTYPE x, GENTYPE y, GENTYPE z);
+
+#ifndef SCALAR
+_CLC_OVERLOAD _CLC_DECL GENTYPE clamp(GENTYPE x, SCALAR_GENTYPE y, SCALAR_GENTYPE z);
+#endif
diff --git a/generic/include/clc/shared/max.inc b/generic/include/clc/shared/max.inc
index ce6c6d0..9fe73c4 100644
--- a/generic/include/clc/shared/max.inc
+++ b/generic/include/clc/shared/max.inc
@@ -1 +1,5 @@
 _CLC_OVERLOAD _CLC_DECL GENTYPE max(GENTYPE a, GENTYPE b);
+
+#ifndef SCALAR
+_CLC_OVERLOAD _CLC_DECL GENTYPE max(GENTYPE a, SCALAR_GENTYPE b);
+#endif
diff --git a/generic/lib/shared/clamp.inc b/generic/lib/shared/clamp.inc
index ed49b8e..58370d3 100644
--- a/generic/lib/shared/clamp.inc
+++ b/generic/lib/shared/clamp.inc
@@ -1,3 +1,9 @@
 _CLC_OVERLOAD _CLC_DEF GENTYPE clamp(GENTYPE x, GENTYPE y, GENTYPE z) {
   return (x > z ? z : (x < y ? y : x));
 }
+
+#ifndef SCALAR
+_CLC_OVERLOAD _CLC_DEF GENTYPE clamp(GENTYPE x, SCALAR_GENTYPE y, SCALAR_GENTYPE z) {
+  return (x > (GENTYPE)z ? (GENTYPE)z : (x < (GENTYPE)y ? (GENTYPE)y : x));
+}
+#endif
\ No newline at end of file
diff --git a/generic/lib/shared/max.inc b/generic/lib/shared/max.inc
index 37409fc..6a12b6f 100644
--- a/generic/lib/shared/max.inc
+++ b/generic/lib/shared/max.inc
@@ -1,3 +1,9 @@
 _CLC_OVERLOAD _CLC_DEF GENTYPE max(GENTYPE a, GENTYPE b) {
   return (a > b ? a : b);
 }
+
+#ifndef SCALAR
+_CLC_OVERLOAD _CLC_DEF GENTYPE max(GENTYPE a, SCALAR_GENTYPE b) {
+  return (a > (GENTYPE)b ? a : (GENTYPE)b);
+}
+#endif
\ No newline at end of file
-- 
cgit v1.2.3


From 80f66b1b0e84b953e0cd55984beaff616617394f Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:21:55 +0000
Subject: libclc: Implement clz() builtin

Squashed commit of the following:

commit a0df0a0e86c55c1bdc0b9c0f5a739e5adef4b056
Author: Aaron Watry <awatry@gmail.com>
Date:   Mon Apr 15 18:42:04 2013 -0500

    libclc: Rename clz.ll to clz_if.ll to ensure it gets built.

    configure.py treats files that have the same name with the .cl and .ll
    extensions as overriding each other.

    E.g. If you have clz.cl and clz.ll both specified to be built in the same
    SOURCES file, only the first file listed will actually be built.

    Since the contents of clz.ll were an interface that is implemented in
    clz_impl.ll, rename clz.ll to clz_if.ll to make sure that the interface is
    built.

commit 931b62bed05c58f737de625bd415af09571a6a5a
Author: Aaron Watry <awatry@gmail.com>
Date:   Sat Apr 13 12:32:54 2013 -0500

    libclc: llvm assembly implementation of clz

    Untested... currently crashes in the same manner as add_sat.

commit 6ef0b7b0b6d2e5584086b4b9a9243743b2e0538f
Author: Aaron Watry <awatry@gmail.com>
Date:   Sat Mar 23 12:35:27 2013 -0500

    libclc: Add stub clz builtin

    For scalar int/uint, attempt to use the clz llvm builtin.. for all others
    return 0 until an actual implementation is finished.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185004 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h           |  1 +
 generic/include/clc/integer/clz.h   |  2 ++
 generic/include/clc/integer/clz.inc |  1 +
 generic/lib/SOURCES                 |  3 ++
 generic/lib/integer/clz.cl          | 52 +++++++++++++++++++++++++++++++++++
 generic/lib/integer/clz_if.ll       | 55 +++++++++++++++++++++++++++++++++++++
 generic/lib/integer/clz_impl.ll     | 44 +++++++++++++++++++++++++++++
 7 files changed, 158 insertions(+)
 create mode 100644 generic/include/clc/integer/clz.h
 create mode 100644 generic/include/clc/integer/clz.inc
 create mode 100644 generic/lib/integer/clz.cl
 create mode 100644 generic/lib/integer/clz_if.ll
 create mode 100644 generic/lib/integer/clz_impl.ll

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 74f1126..d2858a8 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -63,6 +63,7 @@
 #include <clc/integer/abs.h>
 #include <clc/integer/abs_diff.h>
 #include <clc/integer/add_sat.h>
+#include <clc/integer/clz.h>
 #include <clc/integer/rotate.h>
 #include <clc/integer/sub_sat.h>
 
diff --git a/generic/include/clc/integer/clz.h b/generic/include/clc/integer/clz.h
new file mode 100644
index 0000000..5708eb4
--- /dev/null
+++ b/generic/include/clc/integer/clz.h
@@ -0,0 +1,2 @@
+#define BODY <clc/integer/clz.inc>
+#include <clc/integer/gentype.inc>
diff --git a/generic/include/clc/integer/clz.inc b/generic/include/clc/integer/clz.inc
new file mode 100644
index 0000000..ac73a31
--- /dev/null
+++ b/generic/include/clc/integer/clz.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL GENTYPE clz(GENTYPE x);
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index eac6c60..59eb9bb 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -8,6 +8,9 @@ integer/abs_diff.cl
 integer/add_sat.cl
 integer/add_sat_if.ll
 integer/add_sat_impl.ll
+integer/clz.cl
+integer/clz_if.ll
+integer/clz_impl.ll
 integer/rotate.cl
 integer/sub_sat.cl
 integer/sub_sat_if.ll
diff --git a/generic/lib/integer/clz.cl b/generic/lib/integer/clz.cl
new file mode 100644
index 0000000..83ef2dd
--- /dev/null
+++ b/generic/lib/integer/clz.cl
@@ -0,0 +1,52 @@
+#include <clc/clc.h>
+
+// From clz.ll
+_CLC_DECL char   __clc_clz_s8(char);
+_CLC_DECL uchar  __clc_clz_u8(uchar);
+_CLC_DECL short  __clc_clz_s16(short);
+_CLC_DECL ushort __clc_clz_u16(ushort);
+_CLC_DECL int    __clc_clz_s32(int);
+_CLC_DECL uint   __clc_clz_u32(uint);
+_CLC_DECL long   __clc_clz_s64(long);
+_CLC_DECL ulong  __clc_clz_u64(ulong);
+
+_CLC_OVERLOAD _CLC_DEF char clz(char x) {
+  return __clc_clz_s8(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF uchar clz(uchar x) {
+  return __clc_clz_u8(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF short clz(short x) {
+  return __clc_clz_s16(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF ushort clz(ushort x) {
+  return __clc_clz_u16(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF int clz(int x) {
+  return __clc_clz_s32(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF uint clz(uint x) {
+  return __clc_clz_u32(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF long clz(long x) {
+  return __clc_clz_s64(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF ulong clz(ulong x) {
+  return __clc_clz_u64(x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, clz, char)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, clz, uchar)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, clz, short)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, clz, ushort)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, clz, int)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, clz, uint)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, clz, long)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, clz, ulong)
diff --git a/generic/lib/integer/clz_if.ll b/generic/lib/integer/clz_if.ll
new file mode 100644
index 0000000..23dfc74
--- /dev/null
+++ b/generic/lib/integer/clz_if.ll
@@ -0,0 +1,55 @@
+declare i8 @__clc_clz_impl_s8(i8 %x)
+
+define i8 @__clc_clz_s8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @__clc_clz_impl_s8(i8 %x)
+  ret i8 %call
+}
+
+declare i8 @__clc_clz_impl_u8(i8 %x)
+
+define i8 @__clc_clz_u8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @__clc_clz_impl_u8(i8 %x)
+  ret i8 %call
+}
+
+declare i16 @__clc_clz_impl_s16(i16 %x)
+
+define i16 @__clc_clz_s16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @__clc_clz_impl_s16(i16 %x)
+  ret i16 %call
+}
+
+declare i16 @__clc_clz_impl_u16(i16 %x)
+
+define i16 @__clc_clz_u16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @__clc_clz_impl_u16(i16 %x)
+  ret i16 %call
+}
+
+declare i32 @__clc_clz_impl_s32(i32 %x)
+
+define i32 @__clc_clz_s32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @__clc_clz_impl_s32(i32 %x)
+  ret i32 %call
+}
+
+declare i32 @__clc_clz_impl_u32(i32 %x)
+
+define i32 @__clc_clz_u32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @__clc_clz_impl_u32(i32 %x)
+  ret i32 %call
+}
+
+declare i64 @__clc_clz_impl_s64(i64 %x)
+
+define i64 @__clc_clz_s64(i64 %x) nounwind readnone alwaysinline {
+  %call = call i64 @__clc_clz_impl_s64(i64 %x)
+  ret i64 %call
+}
+
+declare i64 @__clc_clz_impl_u64(i64 %x)
+
+define i64 @__clc_clz_u64(i64 %x) nounwind readnone alwaysinline {
+  %call = call i64 @__clc_clz_impl_u64(i64 %x)
+  ret i64 %call
+}
diff --git a/generic/lib/integer/clz_impl.ll b/generic/lib/integer/clz_impl.ll
new file mode 100644
index 0000000..b5c3d98
--- /dev/null
+++ b/generic/lib/integer/clz_impl.ll
@@ -0,0 +1,44 @@
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @__clc_clz_impl_s8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+  ret i8 %call
+}
+
+define i8 @__clc_clz_impl_u8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+  ret i8 %call
+}
+
+define i16 @__clc_clz_impl_s16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+  ret i16 %call
+}
+
+define i16 @__clc_clz_impl_u16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+  ret i16 %call
+}
+
+define i32 @__clc_clz_impl_s32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 %call
+}
+
+define i32 @__clc_clz_impl_u32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 %call
+}
+
+define i64 @__clc_clz_impl_s64(i64 %x) nounwind readnone alwaysinline {
+  %call = call i64 @llvm.ctlz.i64(i64 %x, i1 0)
+  ret i64 %call
+}
+
+define i64 @__clc_clz_impl_u64(i64 %x) nounwind readnone alwaysinline {
+  %call = call i64 @llvm.ctlz.i64(i64 %x, i1 0)
+  ret i64 %call
+}
-- 
cgit v1.2.3


From 5c45e8a2550704e2df01d4f1bdf0d7f6f0016f17 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:22:00 +0000
Subject: r600: Fix implementations of get_group_id.ll and get_local_size.ll

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185005 91177308-0d34-0410-b5e6-96231b3b80d8
---
 r600/lib/workitem/get_group_id.ll   | 12 ++++++------
 r600/lib/workitem/get_local_size.ll | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/r600/lib/workitem/get_group_id.ll b/r600/lib/workitem/get_group_id.ll
index 5131648..0dc86e5 100644
--- a/r600/lib/workitem/get_group_id.ll
+++ b/r600/lib/workitem/get_group_id.ll
@@ -1,17 +1,17 @@
-declare i32 @llvm.r600.read.local.size.x() nounwind readnone
-declare i32 @llvm.r600.read.local.size.y() nounwind readnone
-declare i32 @llvm.r600.read.local.size.z() nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() nounwind readnone
+declare i32 @llvm.r600.read.tgid.z() nounwind readnone
 
 define i32 @get_group_id(i32 %dim) nounwind readnone alwaysinline {
   switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
 x_dim:
-  %x = call i32 @llvm.r600.read.local.size.x() nounwind readnone
+  %x = call i32 @llvm.r600.read.tgid.x() nounwind readnone
   ret i32 %x
 y_dim:
-  %y = call i32 @llvm.r600.read.local.size.y() nounwind readnone
+  %y = call i32 @llvm.r600.read.tgid.y() nounwind readnone
   ret i32 %y
 z_dim:
-  %z = call i32 @llvm.r600.read.local.size.z() nounwind readnone
+  %z = call i32 @llvm.r600.read.tgid.z() nounwind readnone
   ret i32 %z
 default:
   ret i32 0
diff --git a/r600/lib/workitem/get_local_size.ll b/r600/lib/workitem/get_local_size.ll
index 6a71f75..0a98de6 100644
--- a/r600/lib/workitem/get_local_size.ll
+++ b/r600/lib/workitem/get_local_size.ll
@@ -1,17 +1,17 @@
-declare i32 @llvm.r600.read.tgid.x() nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() nounwind readnone
-declare i32 @llvm.r600.read.tgid.z() nounwind readnone
+declare i32 @llvm.r600.read.local.size.x() nounwind readnone
+declare i32 @llvm.r600.read.local.size.y() nounwind readnone
+declare i32 @llvm.r600.read.local.size.z() nounwind readnone
 
 define i32 @get_local_size(i32 %dim) nounwind readnone alwaysinline {
   switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
 x_dim:
-  %x = call i32 @llvm.r600.read.tgid.x() nounwind readnone
+  %x = call i32 @llvm.r600.read.local.size.x() nounwind readnone
   ret i32 %x
 y_dim:
-  %y = call i32 @llvm.r600.read.tgid.y() nounwind readnone
+  %y = call i32 @llvm.r600.read.local.size.y() nounwind readnone
   ret i32 %y
 z_dim:
-  %z = call i32 @llvm.r600.read.tgid.z() nounwind readnone
+  %z = call i32 @llvm.r600.read.local.size.z() nounwind readnone
   ret i32 %z
 default:
   ret i32 0
-- 
cgit v1.2.3


From 6b73fd445be70e84dc214d88f7802ef800a76b4f Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:22:05 +0000
Subject: libclc: Initial vload implementation

Should work for all targets and data types.  Completely unoptimized.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185006 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h          |  1 +
 generic/include/clc/shared/vload.h | 37 ++++++++++++++++++++++++++++++
 generic/lib/SOURCES                |  1 +
 generic/lib/shared/vload.cl        | 47 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 86 insertions(+)
 create mode 100644 generic/include/clc/shared/vload.h
 create mode 100644 generic/lib/shared/vload.cl

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index d2858a8..7937003 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -71,6 +71,7 @@
 #include <clc/shared/clamp.h>
 #include <clc/shared/max.h>
 #include <clc/shared/min.h>
+#include <clc/shared/vload.h>
 
 /* 6.11.5 Geometric Functions */
 #include <clc/geometric/cross.h>
diff --git a/generic/include/clc/shared/vload.h b/generic/include/clc/shared/vload.h
new file mode 100644
index 0000000..93d0750
--- /dev/null
+++ b/generic/include/clc/shared/vload.h
@@ -0,0 +1,37 @@
+#define _CLC_VLOAD_DECL(PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
+  _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##WIDTH(size_t offset, const ADDR_SPACE PRIM_TYPE *x);
+
+#define _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, ADDR_SPACE) \
+  _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
+  _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
+  _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
+  _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
+  _CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
+
+#define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) \
+  _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __private) \
+  _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __local) \
+  _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __constant) \
+  _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __global) \
+
+#define _CLC_VECTOR_VLOAD_PRIM() \
+    _CLC_VECTOR_VLOAD_PRIM1(char) \
+    _CLC_VECTOR_VLOAD_PRIM1(uchar) \
+    _CLC_VECTOR_VLOAD_PRIM1(short) \
+    _CLC_VECTOR_VLOAD_PRIM1(ushort) \
+    _CLC_VECTOR_VLOAD_PRIM1(int) \
+    _CLC_VECTOR_VLOAD_PRIM1(uint) \
+    _CLC_VECTOR_VLOAD_PRIM1(long) \
+    _CLC_VECTOR_VLOAD_PRIM1(ulong) \
+    _CLC_VECTOR_VLOAD_PRIM1(float) \
+        
+#ifdef cl_khr_fp64
+#define _CLC_VECTOR_VLOAD() \
+  _CLC_VECTOR_VLOAD_PRIM1(double) \
+  _CLC_VECTOR_VLOAD_PRIM()
+#else
+#define _CLC_VECTOR_VLOAD() \
+  _CLC_VECTOR_VLOAD_PRIM()
+#endif
+
+_CLC_VECTOR_VLOAD()
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 59eb9bb..5d9e3fa 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -23,5 +23,6 @@ relational/any.cl
 shared/clamp.cl
 shared/max.cl
 shared/min.cl
+shared/vload.cl
 workitem/get_global_id.cl
 workitem/get_global_size.cl
diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
new file mode 100644
index 0000000..24d8240
--- /dev/null
+++ b/generic/lib/shared/vload.cl
@@ -0,0 +1,47 @@
+#include <clc/clc.h>
+
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return (PRIM_TYPE##2)(x[offset] , x[offset+1]); \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return (PRIM_TYPE##3)(x[offset] , x[offset+1], x[offset+2]); \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return (PRIM_TYPE##4)(x[offset], x[offset+1], x[offset+2], x[offset+3]); \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return (PRIM_TYPE##8)(vload4(offset, x), vload4(offset+4, x)); \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return (PRIM_TYPE##16)(vload8(offset, x), vload8(offset+8, x)); \
+  } \
+
+#define VLOAD_ADDR_SPACES(SCALAR_GENTYPE) \
+    VLOAD_VECTORIZE(SCALAR_GENTYPE, __private) \
+    VLOAD_VECTORIZE(SCALAR_GENTYPE, __local) \
+    VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
+    VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
+
+#define VLOAD_TYPES() \
+    VLOAD_ADDR_SPACES(char) \
+    VLOAD_ADDR_SPACES(uchar) \
+    VLOAD_ADDR_SPACES(short) \
+    VLOAD_ADDR_SPACES(ushort) \
+    VLOAD_ADDR_SPACES(int) \
+    VLOAD_ADDR_SPACES(uint) \
+    VLOAD_ADDR_SPACES(long) \
+    VLOAD_ADDR_SPACES(ulong) \
+    VLOAD_ADDR_SPACES(float) \
+
+VLOAD_TYPES()
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    VLOAD_ADDR_SPACES(double)
+#endif
+
-- 
cgit v1.2.3


From 1a71238de08d32b405073ae2d818b479b4f73a77 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:22:11 +0000
Subject: libclc: Initial vstore implementation

Assumes that the target supports byte-addressable stores.

Completely unoptimized.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185007 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h           |  1 +
 generic/include/clc/shared/vstore.h | 36 ++++++++++++++++++++++++
 generic/lib/SOURCES                 |  1 +
 generic/lib/shared/vstore.cl        | 56 +++++++++++++++++++++++++++++++++++++
 4 files changed, 94 insertions(+)
 create mode 100644 generic/include/clc/shared/vstore.h
 create mode 100644 generic/lib/shared/vstore.cl

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 7937003..10d30e0 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -72,6 +72,7 @@
 #include <clc/shared/max.h>
 #include <clc/shared/min.h>
 #include <clc/shared/vload.h>
+#include <clc/shared/vstore.h>
 
 /* 6.11.5 Geometric Functions */
 #include <clc/geometric/cross.h>
diff --git a/generic/include/clc/shared/vstore.h b/generic/include/clc/shared/vstore.h
new file mode 100644
index 0000000..1f784f8
--- /dev/null
+++ b/generic/include/clc/shared/vstore.h
@@ -0,0 +1,36 @@
+#define _CLC_VSTORE_DECL(PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
+  _CLC_OVERLOAD _CLC_DECL void vstore##WIDTH(VEC_TYPE vec, size_t offset, ADDR_SPACE PRIM_TYPE *out);
+
+#define _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, ADDR_SPACE) \
+  _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
+  _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
+  _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
+  _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
+  _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
+
+#define _CLC_VECTOR_VSTORE_PRIM1(PRIM_TYPE) \
+  _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, __private) \
+  _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, __local) \
+  _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, __global) \
+
+#define _CLC_VECTOR_VSTORE_PRIM() \
+    _CLC_VECTOR_VSTORE_PRIM1(char) \
+    _CLC_VECTOR_VSTORE_PRIM1(uchar) \
+    _CLC_VECTOR_VSTORE_PRIM1(short) \
+    _CLC_VECTOR_VSTORE_PRIM1(ushort) \
+    _CLC_VECTOR_VSTORE_PRIM1(int) \
+    _CLC_VECTOR_VSTORE_PRIM1(uint) \
+    _CLC_VECTOR_VSTORE_PRIM1(long) \
+    _CLC_VECTOR_VSTORE_PRIM1(ulong) \
+    _CLC_VECTOR_VSTORE_PRIM1(float) \
+        
+#ifdef cl_khr_fp64
+#define _CLC_VECTOR_VSTORE() \
+  _CLC_VECTOR_VSTORE_PRIM1(double) \
+  _CLC_VECTOR_VSTORE_PRIM()
+#else
+#define _CLC_VECTOR_VSTORE() \
+  _CLC_VECTOR_VSTORE_PRIM()
+#endif
+
+_CLC_VECTOR_VSTORE()
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 5d9e3fa..50cc9bd 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -24,5 +24,6 @@ shared/clamp.cl
 shared/max.cl
 shared/min.cl
 shared/vload.cl
+shared/vstore.cl
 workitem/get_global_id.cl
 workitem/get_global_size.cl
diff --git a/generic/lib/shared/vstore.cl b/generic/lib/shared/vstore.cl
new file mode 100644
index 0000000..e88ccc5
--- /dev/null
+++ b/generic/lib/shared/vstore.cl
@@ -0,0 +1,56 @@
+#include <clc/clc.h>
+
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+
+#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+    mem[offset] = vec.s0; \
+    mem[offset+1] = vec.s1; \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+    mem[offset] = vec.s0; \
+    mem[offset+1] = vec.s1; \
+    mem[offset+2] = vec.s2; \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+    mem[offset] = vec.s0; \
+    mem[offset+1] = vec.s1; \
+    mem[offset+2] = vec.s2; \
+    mem[offset+3] = vec.s3; \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+    vstore4(vec.lo, offset, mem); \
+    vstore4(vec.hi, offset+4, mem); \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+    vstore8(vec.lo, offset, mem); \
+    vstore8(vec.hi, offset+8, mem); \
+  } \
+
+#define VSTORE_ADDR_SPACES(SCALAR_GENTYPE) \
+    VSTORE_VECTORIZE(SCALAR_GENTYPE, __private) \
+    VSTORE_VECTORIZE(SCALAR_GENTYPE, __local) \
+    VSTORE_VECTORIZE(SCALAR_GENTYPE, __global) \
+
+#define VSTORE_TYPES() \
+    VSTORE_ADDR_SPACES(char) \
+    VSTORE_ADDR_SPACES(uchar) \
+    VSTORE_ADDR_SPACES(short) \
+    VSTORE_ADDR_SPACES(ushort) \
+    VSTORE_ADDR_SPACES(int) \
+    VSTORE_ADDR_SPACES(uint) \
+    VSTORE_ADDR_SPACES(long) \
+    VSTORE_ADDR_SPACES(ulong) \
+    VSTORE_ADDR_SPACES(float) \
+
+VSTORE_TYPES()
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    VSTORE_ADDR_SPACES(double)
+#endif
+
-- 
cgit v1.2.3


From eadd80c3348f3b45daab44ebdbfdb00ab4ce0ff9 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:22:15 +0000
Subject: libclc: Add assembly versions of vload for global int4/8/16

The assembly should be generic, but at least currently R600 only supports
32-bit loads of int1/4, and I believe that only global is well-supported.

R600 lowers the 8/16 component vectors to multiple 4-component loads.

The unoptimized C versions of the other stuff are left in place.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185008 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/lib/SOURCES              |  2 ++
 generic/lib/shared/vload.cl      | 53 +++++++++++++++++++++++++++++++++--
 generic/lib/shared/vload_if.ll   | 60 ++++++++++++++++++++++++++++++++++++++++
 generic/lib/shared/vload_impl.ll | 49 ++++++++++++++++++++++++++++++++
 4 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 generic/lib/shared/vload_if.ll
 create mode 100644 generic/lib/shared/vload_impl.ll

diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 50cc9bd..9f6acf3 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -24,6 +24,8 @@ shared/clamp.cl
 shared/max.cl
 shared/min.cl
 shared/vload.cl
+shared/vload_if.ll
+shared/vload_impl.ll
 shared/vstore.cl
 workitem/get_global_id.cl
 workitem/get_global_size.cl
diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
index 24d8240..f6ebd37 100644
--- a/generic/lib/shared/vload.cl
+++ b/generic/lib/shared/vload.cl
@@ -27,13 +27,12 @@
     VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
     VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
 
+//int/uint are special... see below
 #define VLOAD_TYPES() \
     VLOAD_ADDR_SPACES(char) \
     VLOAD_ADDR_SPACES(uchar) \
     VLOAD_ADDR_SPACES(short) \
     VLOAD_ADDR_SPACES(ushort) \
-    VLOAD_ADDR_SPACES(int) \
-    VLOAD_ADDR_SPACES(uint) \
     VLOAD_ADDR_SPACES(long) \
     VLOAD_ADDR_SPACES(ulong) \
     VLOAD_ADDR_SPACES(float) \
@@ -45,3 +44,53 @@ VLOAD_TYPES()
     VLOAD_ADDR_SPACES(double)
 #endif
 
+VLOAD_VECTORIZE(int, __private)
+VLOAD_VECTORIZE(int, __local)
+VLOAD_VECTORIZE(int, __constant)
+VLOAD_VECTORIZE(uint, __private)
+VLOAD_VECTORIZE(uint, __local)
+VLOAD_VECTORIZE(uint, __constant)
+
+_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) {
+  return (int2)(x[offset] , x[offset+1]);
+}
+_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
+  return (int3)(vload2(offset, x), x[offset+2]);
+}
+_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) {
+  return (uint2)(x[offset] , x[offset+1]);
+}
+_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
+  return (uint3)(vload2(offset, x), x[offset+2]);
+}
+        
+/*Note: It is known that R600 doesn't support load <2 x ?> and <3 x ?>... so
+ * they aren't actually overridden here
+ */
+_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *);
+_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *);
+_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *);
+
+_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
+  return __clc_vload4_int__global(offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) {
+  return __clc_vload8_int__global(offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) {
+  return __clc_vload16_int__global(offset, x);
+}
+
+_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *);
+_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *);
+_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *);
+
+_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) {
+  return __clc_vload4_uint__global(offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
+  return __clc_vload8_uint__global(offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
+  return __clc_vload16_uint__global(offset, x);
+}
\ No newline at end of file
diff --git a/generic/lib/shared/vload_if.ll b/generic/lib/shared/vload_if.ll
new file mode 100644
index 0000000..2634d37
--- /dev/null
+++ b/generic/lib/shared/vload_if.ll
@@ -0,0 +1,60 @@
+;Start int global vload
+
+declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
+declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
+declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
+declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
+declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
+
+define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
+  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
+  ret <2 x i32> %call
+}
+
+define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
+  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
+  ret <3 x i32> %call
+}
+
+define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
+  ret <4 x i32> %call
+}
+
+define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
+  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
+  ret <8 x i32> %call
+}
+
+define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
+  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
+  ret <16 x i32> %call
+}
+
+
+;Start uint global vload
+
+define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
+  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
+  ret <2 x i32> %call
+}
+
+define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
+  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
+  ret <3 x i32> %call
+}
+
+define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
+  ret <4 x i32> %call
+}
+
+define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
+  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
+  ret <8 x i32> %call
+}
+
+define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
+  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
+  ret <16 x i32> %call
+}
diff --git a/generic/lib/shared/vload_impl.ll b/generic/lib/shared/vload_impl.ll
new file mode 100644
index 0000000..ae719e0
--- /dev/null
+++ b/generic/lib/shared/vload_impl.ll
@@ -0,0 +1,49 @@
+; This provides optimized implementations of vload4/8/16 for 32-bit int/uint
+
+define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
+  %1 = ptrtoint i32 addrspace(1)* %addr to i32
+  %2 = add i32 %1, %offset
+  %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
+  %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
+  ret <2 x i32> %4
+}
+
+define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
+  %1 = ptrtoint i32 addrspace(1)* %addr to i32
+  %2 = add i32 %1, %offset
+  %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
+  %4 = load <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
+  ret <3 x i32> %4
+}
+
+define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
+  %1 = ptrtoint i32 addrspace(1)* %addr to i32
+  %2 = add i32 %1, %offset
+  %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
+  %4 = load <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
+  ret <4 x i32> %4
+}
+
+define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
+  %1 = ptrtoint i32 addrspace(1)* %addr to i32
+  %2 = add i32 %1, %offset
+  %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
+  %4 = load <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
+  ret <8 x i32> %4
+}
+
+define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
+  %1 = ptrtoint i32 addrspace(1)* %addr to i32
+  %2 = add i32 %1, %offset
+  %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
+  %4 = load <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
+  ret <16 x i32> %4
+}
+
+!1 = metadata !{metadata !"char", metadata !5}
+!2 = metadata !{metadata !"short", metadata !5}
+!3 = metadata !{metadata !"int", metadata !5}
+!4 = metadata !{metadata !"long", metadata !5}
+!5 = metadata !{metadata !"omnipotent char", metadata !6}
+!6 = metadata !{metadata !"Simple C/C++ TBAA"}
+
-- 
cgit v1.2.3


From 4ab90369cbe2ef489df88726d9be223fb1847c6f Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 26 Jun 2013 18:22:20 +0000
Subject: libclc: Add assembly versions of vstore for global [u]int4/8/16

The assembly should be generic, but at least currently R600 only supports
32-bit stores of [u]int1/4, and I believe that only global is well-supported.

R600 lowers the 8/16 component stores to multiple 4-component stores.

The unoptimized C versions of the other stuff are left in place.

Patch by: Aaron Watry

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185009 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/lib/SOURCES               |  2 ++
 generic/lib/shared/vstore.cl      | 63 +++++++++++++++++++++++++++++++++++----
 generic/lib/shared/vstore_if.ll   | 59 ++++++++++++++++++++++++++++++++++++
 generic/lib/shared/vstore_impl.ll | 50 +++++++++++++++++++++++++++++++
 4 files changed, 168 insertions(+), 6 deletions(-)
 create mode 100644 generic/lib/shared/vstore_if.ll
 create mode 100644 generic/lib/shared/vstore_impl.ll

diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 9f6acf3..8cda14a 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -27,5 +27,7 @@ shared/vload.cl
 shared/vload_if.ll
 shared/vload_impl.ll
 shared/vstore.cl
+shared/vstore_if.ll
+shared/vstore_impl.ll
 workitem/get_global_id.cl
 workitem/get_global_size.cl
diff --git a/generic/lib/shared/vstore.cl b/generic/lib/shared/vstore.cl
index e88ccc5..5b84f47 100644
--- a/generic/lib/shared/vstore.cl
+++ b/generic/lib/shared/vstore.cl
@@ -15,10 +15,8 @@
   } \
 \
   _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    mem[offset] = vec.s0; \
-    mem[offset+1] = vec.s1; \
-    mem[offset+2] = vec.s2; \
-    mem[offset+3] = vec.s3; \
+    vstore2(vec.lo, offset, mem); \
+    vstore2(vec.hi, offset+2, mem); \
   } \
 \
   _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
@@ -36,13 +34,12 @@
     VSTORE_VECTORIZE(SCALAR_GENTYPE, __local) \
     VSTORE_VECTORIZE(SCALAR_GENTYPE, __global) \
 
+//int/uint are special... see below
 #define VSTORE_TYPES() \
     VSTORE_ADDR_SPACES(char) \
     VSTORE_ADDR_SPACES(uchar) \
     VSTORE_ADDR_SPACES(short) \
     VSTORE_ADDR_SPACES(ushort) \
-    VSTORE_ADDR_SPACES(int) \
-    VSTORE_ADDR_SPACES(uint) \
     VSTORE_ADDR_SPACES(long) \
     VSTORE_ADDR_SPACES(ulong) \
     VSTORE_ADDR_SPACES(float) \
@@ -54,3 +51,57 @@ VSTORE_TYPES()
     VSTORE_ADDR_SPACES(double)
 #endif
 
+VSTORE_VECTORIZE(int, __private)
+VSTORE_VECTORIZE(int, __local)
+VSTORE_VECTORIZE(uint, __private)
+VSTORE_VECTORIZE(uint, __local)
+
+_CLC_OVERLOAD _CLC_DEF void vstore2(int2 vec, size_t offset, global int *mem) {
+    mem[offset] = vec.s0;
+    mem[offset+1] = vec.s1;
+}
+_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) {
+    mem[offset] = vec.s0;
+    mem[offset+1] = vec.s1;
+    mem[offset+2] = vec.s2;
+}
+_CLC_OVERLOAD _CLC_DEF void vstore2(uint2 vec, size_t offset, global uint *mem) {
+    mem[offset] = vec.s0;
+    mem[offset+1] = vec.s1;
+}
+_CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem) {
+    mem[offset] = vec.s0;
+    mem[offset+1] = vec.s1;
+    mem[offset+2] = vec.s2;
+}
+
+/*Note: R600 probably doesn't support store <2 x ?> and <3 x ?>... so
+ * they aren't actually overridden here... lowest-common-denominator
+ */
+_CLC_DECL void __clc_vstore4_int__global(int4 vec, size_t offset, __global int *);
+_CLC_DECL void __clc_vstore8_int__global(int8 vec, size_t offset, __global int *);
+_CLC_DECL void __clc_vstore16_int__global(int16 vec, size_t offset, __global int *);
+
+_CLC_OVERLOAD _CLC_DEF void vstore4(int4 vec, size_t offset, global int *x) {
+    __clc_vstore4_int__global(vec, offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF void vstore8(int8 vec, size_t offset, global int *x) {
+    __clc_vstore8_int__global(vec, offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF void vstore16(int16 vec, size_t offset, global int *x) {
+    __clc_vstore16_int__global(vec, offset, x);
+}
+
+_CLC_DECL void __clc_vstore4_uint__global(uint4 vec, size_t offset, __global uint *);
+_CLC_DECL void __clc_vstore8_uint__global(uint8 vec, size_t offset, __global uint *);
+_CLC_DECL void __clc_vstore16_uint__global(uint16 vec, size_t offset, __global uint *);
+
+_CLC_OVERLOAD _CLC_DEF void vstore4(uint4 vec, size_t offset, global uint *x) {
+    __clc_vstore4_uint__global(vec, offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF void vstore8(uint8 vec, size_t offset, global uint *x) {
+    __clc_vstore8_uint__global(vec, offset, x);
+}
+_CLC_OVERLOAD _CLC_DEF void vstore16(uint16 vec, size_t offset, global uint *x) {
+    __clc_vstore16_uint__global(vec, offset, x);
+}
diff --git a/generic/lib/shared/vstore_if.ll b/generic/lib/shared/vstore_if.ll
new file mode 100644
index 0000000..30eb552
--- /dev/null
+++ b/generic/lib/shared/vstore_if.ll
@@ -0,0 +1,59 @@
+;Start int global vstore
+
+declare void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
+declare void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
+declare void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
+declare void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
+declare void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
+
+define void @__clc_vstore2_int__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
+  call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
+  ret void
+}
+
+define void @__clc_vstore3_int__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
+  call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
+  ret void
+}
+
+define void @__clc_vstore4_int__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
+  call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
+  ret void
+}
+
+define void @__clc_vstore8_int__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
+  call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
+  ret void
+}
+
+define void @__clc_vstore16_int__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
+  call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
+  ret void
+}
+
+
+;Start uint global vstore
+define void @__clc_vstore2_uint__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
+  call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
+  ret void
+}
+
+define void @__clc_vstore3_uint__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
+  call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
+  ret void
+}
+
+define void @__clc_vstore4_uint__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
+  call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
+  ret void
+}
+
+define void @__clc_vstore8_uint__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
+  call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
+  ret void
+}
+
+define void @__clc_vstore16_uint__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
+  call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
+  ret void
+}
\ No newline at end of file
diff --git a/generic/lib/shared/vstore_impl.ll b/generic/lib/shared/vstore_impl.ll
new file mode 100644
index 0000000..3baab5e
--- /dev/null
+++ b/generic/lib/shared/vstore_impl.ll
@@ -0,0 +1,50 @@
+; This provides optimized implementations of vstore4/8/16 for 32-bit int/uint
+
+define void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
+  %1 = ptrtoint i32 addrspace(1)* %addr to i32
+  %2 = add i32 %1, %offset
+  %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
+  store <2 x i32> %vec, <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
+  ret void
+}
+
+define void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
+  %1 = ptrtoint i32 addrspace(1)* %addr to i32
+  %2 = add i32 %1, %offset
+  %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
+  store <3 x i32> %vec, <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
+  ret void
+}
+
+define void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
+  %1 = ptrtoint i32 addrspace(1)* %addr to i32
+  %2 = add i32 %1, %offset
+  %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
+  store <4 x i32> %vec, <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
+  ret void
+}
+
+define void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
+  %1 = ptrtoint i32 addrspace(1)* %addr to i32
+  %2 = add i32 %1, %offset
+  %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
+  store <8 x i32> %vec, <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
+  ret void
+}
+
+define void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
+  %1 = ptrtoint i32 addrspace(1)* %addr to i32
+  %2 = add i32 %1, %offset
+  %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
+  store <16 x i32> %vec, <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
+  ret void
+}
+
+
+!1 = metadata !{metadata !"char", metadata !5}
+!2 = metadata !{metadata !"short", metadata !5}
+!3 = metadata !{metadata !"int", metadata !5}
+!4 = metadata !{metadata !"long", metadata !5}
+!5 = metadata !{metadata !"omnipotent char", metadata !6}
+!6 = metadata !{metadata !"Simple C/C++ TBAA"}
+
-- 
cgit v1.2.3


From a3c59c98a5ae7d4af0c2ac0194422a466a8891cc Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Mon, 8 Jul 2013 17:26:33 +0000
Subject: Add bitselect() builtin

Reviewed-By: Aaron Watry <awatry@gmail.com>

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185836 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h                  | 1 +
 generic/include/clc/relational/bitselect.h | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 generic/include/clc/relational/bitselect.h

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 10d30e0..1ce97ad 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -82,6 +82,7 @@
 
 /* 6.11.6 Relational Functions */
 #include <clc/relational/any.h>
+#include <clc/relational/bitselect.h>
 #include <clc/relational/select.h>
 
 /* 6.11.8 Synchronization Functions */
diff --git a/generic/include/clc/relational/bitselect.h b/generic/include/clc/relational/bitselect.h
new file mode 100644
index 0000000..e91cbfd
--- /dev/null
+++ b/generic/include/clc/relational/bitselect.h
@@ -0,0 +1 @@
+#define bitselect(x, y, z) ((x) ^ ((z) & ((y) ^ (x))))
-- 
cgit v1.2.3


From efd6599fc741c849dfce8e1e312e28fb33857315 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Mon, 8 Jul 2013 17:26:39 +0000
Subject: Implement barrier() builtin

Reviewed and Tested-by: Aaron Watry <awatry@gmail.com>

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185837 91177308-0d34-0410-b5e6-96231b3b80d8
---
 r600/lib/SOURCES                         |  2 ++
 r600/lib/synchronization/barrier.cl      | 15 +++++++++++++++
 r600/lib/synchronization/barrier_impl.ll | 12 ++++++++++++
 3 files changed, 29 insertions(+)
 create mode 100644 r600/lib/synchronization/barrier.cl
 create mode 100644 r600/lib/synchronization/barrier_impl.ll

diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
index af8c8c8..16ef3ac 100644
--- a/r600/lib/SOURCES
+++ b/r600/lib/SOURCES
@@ -2,3 +2,5 @@ workitem/get_group_id.ll
 workitem/get_local_size.ll
 workitem/get_local_id.ll
 workitem/get_global_size.ll
+synchronization/barrier.cl
+synchronization/barrier_impl.ll
diff --git a/r600/lib/synchronization/barrier.cl b/r600/lib/synchronization/barrier.cl
new file mode 100644
index 0000000..ac0b4b3
--- /dev/null
+++ b/r600/lib/synchronization/barrier.cl
@@ -0,0 +1,15 @@
+
+#include <clc/clc.h>
+
+void barrier_local(void);
+void barrier_global(void);
+
+void barrier(cl_mem_fence_flags flags) {
+  if (flags & CLK_LOCAL_MEM_FENCE) {
+    barrier_local();
+  }
+
+  if (flags & CLK_GLOBAL_MEM_FENCE) {
+    barrier_global();
+  }
+}
diff --git a/r600/lib/synchronization/barrier_impl.ll b/r600/lib/synchronization/barrier_impl.ll
new file mode 100644
index 0000000..99ac018
--- /dev/null
+++ b/r600/lib/synchronization/barrier_impl.ll
@@ -0,0 +1,12 @@
+declare void @llvm.AMDGPU.barrier.local() nounwind
+declare void @llvm.AMDGPU.barrier.global() nounwind
+
+define void @barrier_local() nounwind alwaysinline {
+  call void @llvm.AMDGPU.barrier.local()
+  ret void
+}
+
+define void @barrier_global() nounwind alwaysinline {
+  call void @llvm.AMDGPU.barrier.global()
+  ret void
+}
-- 
cgit v1.2.3


From 5217211f119b1ddef4d541431c73a7ab7315636e Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Mon, 8 Jul 2013 17:27:02 +0000
Subject: Add __CLC_ prefix to all macro definitions in headers

libclc was defining and undefining GENTYPE and several other macros with
common names in its header files.  This was preventing applications from
defining macros with identical names as command line arguments to the
compiler, because the definitions in the header files were masking the
macros defined as compiler arguments.

Reviewed-by: Aaron Watry <awatry@gmail.com>

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185838 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/gentype.inc             |  48 +-
 generic/include/clc/geometric/distance.h    |   2 +-
 generic/include/clc/geometric/dot.h         |   2 +-
 generic/include/clc/geometric/dot.inc       |   2 +-
 generic/include/clc/geometric/floatn.inc    |  58 +-
 generic/include/clc/geometric/length.h      |   2 +-
 generic/include/clc/geometric/length.inc    |   2 +-
 generic/include/clc/geometric/normalize.h   |   2 +-
 generic/include/clc/geometric/normalize.inc |   2 +-
 generic/include/clc/integer/abs.h           |   2 +-
 generic/include/clc/integer/abs.inc         |   2 +-
 generic/include/clc/integer/abs_diff.h      |   2 +-
 generic/include/clc/integer/abs_diff.inc    |   2 +-
 generic/include/clc/integer/add_sat.h       |   2 +-
 generic/include/clc/integer/add_sat.inc     |   2 +-
 generic/include/clc/integer/clz.h           |   2 +-
 generic/include/clc/integer/clz.inc         |   2 +-
 generic/include/clc/integer/gentype.inc     | 866 ++++++++++++++--------------
 generic/include/clc/integer/rotate.h        |   2 +-
 generic/include/clc/integer/rotate.inc      |   2 +-
 generic/include/clc/integer/sub_sat.h       |   2 +-
 generic/include/clc/integer/sub_sat.inc     |   2 +-
 generic/include/clc/math/binary_decl.inc    |   6 +-
 generic/include/clc/math/binary_intrin.inc  |  28 +-
 generic/include/clc/math/ceil.h             |   4 +-
 generic/include/clc/math/cos.h              |   4 +-
 generic/include/clc/math/exp2.h             |   4 +-
 generic/include/clc/math/fabs.h             |   4 +-
 generic/include/clc/math/floor.h            |   4 +-
 generic/include/clc/math/fma.h              |   4 +-
 generic/include/clc/math/fmax.h             |   8 +-
 generic/include/clc/math/fmin.h             |   8 +-
 generic/include/clc/math/gentype.inc        |  90 +--
 generic/include/clc/math/hypot.h            |   2 +-
 generic/include/clc/math/hypot.inc          |   2 +-
 generic/include/clc/math/log2.h             |   4 +-
 generic/include/clc/math/mad.h              |   2 +-
 generic/include/clc/math/mad.inc            |   2 +-
 generic/include/clc/math/pow.h              |   4 +-
 generic/include/clc/math/sin.h              |   4 +-
 generic/include/clc/math/sqrt.h             |   4 +-
 generic/include/clc/math/ternary_intrin.inc |  28 +-
 generic/include/clc/math/unary_decl.inc     |   2 +-
 generic/include/clc/math/unary_intrin.inc   |  28 +-
 generic/include/clc/shared/clamp.h          |   4 +-
 generic/include/clc/shared/clamp.inc        |   6 +-
 generic/include/clc/shared/max.h            |   4 +-
 generic/include/clc/shared/max.inc          |   6 +-
 generic/include/clc/shared/min.h            |   4 +-
 generic/include/clc/shared/min.inc          |   8 +-
 generic/lib/geometric/length.cl             |   2 +-
 generic/lib/geometric/length.inc            |   2 +-
 generic/lib/geometric/normalize.cl          |   2 +-
 generic/lib/geometric/normalize.inc         |   2 +-
 generic/lib/integer/abs.cl                  |   2 +-
 generic/lib/integer/abs.inc                 |   4 +-
 generic/lib/integer/abs_diff.cl             |   2 +-
 generic/lib/integer/abs_diff.inc            |   4 +-
 generic/lib/integer/rotate.cl               |   2 +-
 generic/lib/integer/rotate.inc              |  30 +-
 generic/lib/math/binary_impl.inc            |  12 +-
 generic/lib/math/fmax.cl                    |   2 +-
 generic/lib/math/fmin.cl                    |   2 +-
 generic/lib/math/hypot.cl                   |   2 +-
 generic/lib/math/hypot.inc                  |   2 +-
 generic/lib/math/mad.cl                     |   2 +-
 generic/lib/math/mad.inc                    |   2 +-
 generic/lib/shared/clamp.cl                 |   4 +-
 generic/lib/shared/clamp.inc                |  10 +-
 generic/lib/shared/max.cl                   |   4 +-
 generic/lib/shared/max.inc                  |  10 +-
 generic/lib/shared/min.cl                   |   4 +-
 generic/lib/shared/min.inc                  |   8 +-
 generic/lib/shared/vload.cl                 |  12 +-
 generic/lib/shared/vstore.cl                |   8 +-
 75 files changed, 712 insertions(+), 712 deletions(-)

diff --git a/generic/include/clc/gentype.inc b/generic/include/clc/gentype.inc
index 4506920..1ff064a 100644
--- a/generic/include/clc/gentype.inc
+++ b/generic/include/clc/gentype.inc
@@ -1,51 +1,51 @@
-#define GENTYPE float
+#define __CLC_GENTYPE float
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 
-#define GENTYPE float2
+#define __CLC_GENTYPE float2
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 
-#define GENTYPE float3
+#define __CLC_GENTYPE float3
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 
-#define GENTYPE float4
+#define __CLC_GENTYPE float4
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 
-#define GENTYPE float8
+#define __CLC_GENTYPE float8
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 
-#define GENTYPE float16
+#define __CLC_GENTYPE float16
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 
 #ifdef cl_khr_fp64
-#define GENTYPE double
+#define __CLC_GENTYPE double
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 
-#define GENTYPE double2
+#define __CLC_GENTYPE double2
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 
-#define GENTYPE double3
+#define __CLC_GENTYPE double3
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 
-#define GENTYPE double4
+#define __CLC_GENTYPE double4
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 
-#define GENTYPE double8
+#define __CLC_GENTYPE double8
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 
-#define GENTYPE double16
+#define __CLC_GENTYPE double16
 #include BODY
-#undef GENTYPE
+#undef __CLC_GENTYPE
 #endif
 
 #undef BODY
diff --git a/generic/include/clc/geometric/distance.h b/generic/include/clc/geometric/distance.h
index 1660dcd..3e91332 100644
--- a/generic/include/clc/geometric/distance.h
+++ b/generic/include/clc/geometric/distance.h
@@ -1,2 +1,2 @@
-#define BODY <clc/geometric/distance.inc>
+#define __CLC_BODY <clc/geometric/distance.inc>
 #include <clc/geometric/floatn.inc>
diff --git a/generic/include/clc/geometric/dot.h b/generic/include/clc/geometric/dot.h
index 5f0464f..7f65fed 100644
--- a/generic/include/clc/geometric/dot.h
+++ b/generic/include/clc/geometric/dot.h
@@ -1,2 +1,2 @@
-#define BODY <clc/geometric/dot.inc>
+#define __CLC_BODY <clc/geometric/dot.inc>
 #include <clc/geometric/floatn.inc>
diff --git a/generic/include/clc/geometric/dot.inc b/generic/include/clc/geometric/dot.inc
index 69c53a9..34245e2 100644
--- a/generic/include/clc/geometric/dot.inc
+++ b/generic/include/clc/geometric/dot.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL FLOAT dot(FLOATN p0, FLOATN p1);
+_CLC_OVERLOAD _CLC_DECL __CLC_FLOAT dot(__CLC_FLOATN p0, __CLC_FLOATN p1);
diff --git a/generic/include/clc/geometric/floatn.inc b/generic/include/clc/geometric/floatn.inc
index e84545a..fb7a9ae 100644
--- a/generic/include/clc/geometric/floatn.inc
+++ b/generic/include/clc/geometric/floatn.inc
@@ -1,45 +1,45 @@
-#define FLOAT float
+#define __CLC_FLOAT float
 
-#define FLOATN float
-#include BODY
-#undef FLOATN
+#define __CLC_FLOATN float
+#include __CLC_BODY
+#undef __CLC_FLOATN
 
-#define FLOATN float2
-#include BODY
-#undef FLOATN
+#define __CLC_FLOATN float2
+#include __CLC_BODY
+#undef __CLC_FLOATN
 
-#define FLOATN float3
-#include BODY
-#undef FLOATN
+#define __CLC_FLOATN float3
+#include __CLC_BODY
+#undef __CLC_FLOATN
 
-#define FLOATN float4
-#include BODY
-#undef FLOATN
+#define __CLC_FLOATN float4
+#include __CLC_BODY
+#undef __CLC_FLOATN
 
-#undef FLOAT
+#undef __CLC_FLOAT
 
 #ifdef cl_khr_fp64
 
-#define FLOAT double
+#define __CLC_FLOAT double
 
-#define FLOATN double
-#include BODY
-#undef FLOATN
+#define __CLC_FLOATN double
+#include __CLC_BODY
+#undef __CLC_FLOATN
 
-#define FLOATN double2
-#include BODY
-#undef FLOATN
+#define __CLC_FLOATN double2
+#include __CLC_BODY
+#undef __CLC_FLOATN
 
-#define FLOATN double3
-#include BODY
-#undef FLOATN
+#define __CLC_FLOATN double3
+#include __CLC_BODY
+#undef __CLC_FLOATN
 
-#define FLOATN double4
-#include BODY
-#undef FLOATN
+#define __CLC_FLOATN double4
+#include __CLC_BODY
+#undef __CLC_FLOATN
 
-#undef FLOAT
+#undef __CLC_FLOAT
 
 #endif
 
-#undef BODY
+#undef __CLC_BODY
diff --git a/generic/include/clc/geometric/length.h b/generic/include/clc/geometric/length.h
index fbba634..cb992b9 100644
--- a/generic/include/clc/geometric/length.h
+++ b/generic/include/clc/geometric/length.h
@@ -1,2 +1,2 @@
-#define BODY <clc/geometric/length.inc>
+#define __CLC_BODY <clc/geometric/length.inc>
 #include <clc/geometric/floatn.inc>
diff --git a/generic/include/clc/geometric/length.inc b/generic/include/clc/geometric/length.inc
index a9fa2d5..c2d95e8 100644
--- a/generic/include/clc/geometric/length.inc
+++ b/generic/include/clc/geometric/length.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL FLOAT length(FLOATN p0);
+_CLC_OVERLOAD _CLC_DECL __CLC_FLOAT length(__CLC_FLOATN p0);
diff --git a/generic/include/clc/geometric/normalize.h b/generic/include/clc/geometric/normalize.h
index 3aaf61c..dccff9b 100644
--- a/generic/include/clc/geometric/normalize.h
+++ b/generic/include/clc/geometric/normalize.h
@@ -1,2 +1,2 @@
-#define BODY <clc/geometric/normalize.inc>
+#define __CLC_BODY <clc/geometric/normalize.inc>
 #include <clc/geometric/floatn.inc>
diff --git a/generic/include/clc/geometric/normalize.inc b/generic/include/clc/geometric/normalize.inc
index 7b4f69d..6eb1315 100644
--- a/generic/include/clc/geometric/normalize.inc
+++ b/generic/include/clc/geometric/normalize.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL FLOATN normalize(FLOATN p);
+_CLC_OVERLOAD _CLC_DECL __CLC_FLOATN normalize(__CLC_FLOATN p);
diff --git a/generic/include/clc/integer/abs.h b/generic/include/clc/integer/abs.h
index 7592e4b..77a4cbe 100644
--- a/generic/include/clc/integer/abs.h
+++ b/generic/include/clc/integer/abs.h
@@ -1,2 +1,2 @@
-#define BODY <clc/integer/abs.inc>
+#define __CLC_BODY <clc/integer/abs.inc>
 #include <clc/integer/gentype.inc>
diff --git a/generic/include/clc/integer/abs.inc b/generic/include/clc/integer/abs.inc
index bfbec20..952bce7 100644
--- a/generic/include/clc/integer/abs.inc
+++ b/generic/include/clc/integer/abs.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL UGENTYPE abs(GENTYPE x);
+_CLC_OVERLOAD _CLC_DECL __CLC_U_GENTYPE abs(__CLC_GENTYPE x);
diff --git a/generic/include/clc/integer/abs_diff.h b/generic/include/clc/integer/abs_diff.h
index 16fb465..3f3b4b4 100644
--- a/generic/include/clc/integer/abs_diff.h
+++ b/generic/include/clc/integer/abs_diff.h
@@ -1,2 +1,2 @@
-#define BODY <clc/integer/abs_diff.inc>
+#define __CLC_BODY <clc/integer/abs_diff.inc>
 #include <clc/integer/gentype.inc>
diff --git a/generic/include/clc/integer/abs_diff.inc b/generic/include/clc/integer/abs_diff.inc
index 8cfdb9b..e844d46 100644
--- a/generic/include/clc/integer/abs_diff.inc
+++ b/generic/include/clc/integer/abs_diff.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL UGENTYPE abs_diff(GENTYPE x, GENTYPE y);
+_CLC_OVERLOAD _CLC_DECL __CLC_U_GENTYPE abs_diff(__CLC_GENTYPE x, __CLC_GENTYPE y);
diff --git a/generic/include/clc/integer/add_sat.h b/generic/include/clc/integer/add_sat.h
index 9dbe12a..2e5e698 100644
--- a/generic/include/clc/integer/add_sat.h
+++ b/generic/include/clc/integer/add_sat.h
@@ -1,2 +1,2 @@
-#define BODY <clc/integer/add_sat.inc>
+#define __CLC_BODY <clc/integer/add_sat.inc>
 #include <clc/integer/gentype.inc>
diff --git a/generic/include/clc/integer/add_sat.inc b/generic/include/clc/integer/add_sat.inc
index 2ea8a83..913841a 100644
--- a/generic/include/clc/integer/add_sat.inc
+++ b/generic/include/clc/integer/add_sat.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE add_sat(GENTYPE x, GENTYPE y);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE add_sat(__CLC_GENTYPE x, __CLC_GENTYPE y);
diff --git a/generic/include/clc/integer/clz.h b/generic/include/clc/integer/clz.h
index 5708eb4..f7cdbf7 100644
--- a/generic/include/clc/integer/clz.h
+++ b/generic/include/clc/integer/clz.h
@@ -1,2 +1,2 @@
-#define BODY <clc/integer/clz.inc>
+#define __CLC_BODY <clc/integer/clz.inc>
 #include <clc/integer/gentype.inc>
diff --git a/generic/include/clc/integer/clz.inc b/generic/include/clc/integer/clz.inc
index ac73a31..45826d1 100644
--- a/generic/include/clc/integer/clz.inc
+++ b/generic/include/clc/integer/clz.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE clz(GENTYPE x);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE clz(__CLC_GENTYPE x);
diff --git a/generic/include/clc/integer/gentype.inc b/generic/include/clc/integer/gentype.inc
index 95a37d5..6f4d699 100644
--- a/generic/include/clc/integer/gentype.inc
+++ b/generic/include/clc/integer/gentype.inc
@@ -1,435 +1,435 @@
 //These 2 defines only change when switching between data sizes or base types to
 //keep this file manageable.
-#define GENSIZE 8
-#define SCALAR_GENTYPE char
-
-#define GENTYPE char
-#define UGENTYPE uchar
-#define SGENTYPE char
-#define SCALAR 1
-#include BODY
-#undef SCALAR
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE char2
-#define UGENTYPE uchar2
-#define SGENTYPE char2
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE char3
-#define UGENTYPE uchar3
-#define SGENTYPE char3
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE char4
-#define UGENTYPE uchar4
-#define SGENTYPE char4
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE char8
-#define UGENTYPE uchar8
-#define SGENTYPE char8
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE char16
-#define UGENTYPE uchar16
-#define SGENTYPE char16
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#undef SCALAR_GENTYPE
-#define SCALAR_GENTYPE uchar
-
-#define GENTYPE uchar
-#define UGENTYPE uchar
-#define SGENTYPE char
-#define SCALAR 1
-#include BODY
-#undef SCALAR
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE uchar2
-#define UGENTYPE uchar2
-#define SGENTYPE char2
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE uchar3
-#define UGENTYPE uchar3
-#define SGENTYPE char3
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE uchar4
-#define UGENTYPE uchar4
-#define SGENTYPE char4
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE uchar8
-#define UGENTYPE uchar8
-#define SGENTYPE char8
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE uchar16
-#define UGENTYPE uchar16
-#define SGENTYPE char16
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#undef GENSIZE
-#define GENSIZE 16
-#undef SCALAR_GENTYPE
-#define SCALAR_GENTYPE short
-
-#define GENTYPE short
-#define UGENTYPE ushort
-#define SGENTYPE short
-#define SCALAR 1
-#include BODY
-#undef SCALAR
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE short2
-#define UGENTYPE ushort2
-#define SGENTYPE short2
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE short3
-#define UGENTYPE ushort3
-#define SGENTYPE short3
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE short4
-#define UGENTYPE ushort4
-#define SGENTYPE short4
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE short8
-#define UGENTYPE ushort8
-#define SGENTYPE short8
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE short16
-#define UGENTYPE ushort16
-#define SGENTYPE short16
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#undef SCALAR_GENTYPE
-#define SCALAR_GENTYPE ushort
-
-#define GENTYPE ushort
-#define UGENTYPE ushort
-#define SGENTYPE short
-#define SCALAR 1
-#include BODY
-#undef SCALAR
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE ushort2
-#define UGENTYPE ushort2
-#define SGENTYPE short2
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE ushort3
-#define UGENTYPE ushort3
-#define SGENTYPE short3
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE ushort4
-#define UGENTYPE ushort4
-#define SGENTYPE short4
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE ushort8
-#define UGENTYPE ushort8
-#define SGENTYPE short8
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE ushort16
-#define UGENTYPE ushort16
-#define SGENTYPE short16
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#undef GENSIZE
-#define GENSIZE 32
-#undef SCALAR_GENTYPE
-#define SCALAR_GENTYPE int
-
-#define GENTYPE int
-#define UGENTYPE uint
-#define SGENTYPE int
-#define SCALAR 1
-#include BODY
-#undef SCALAR
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE int2
-#define UGENTYPE uint2
-#define SGENTYPE int2
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE int3
-#define UGENTYPE uint3
-#define SGENTYPE int3
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE int4
-#define UGENTYPE uint4
-#define SGENTYPE int4
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE int8
-#define UGENTYPE uint8
-#define SGENTYPE int8
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE int16
-#define UGENTYPE uint16
-#define SGENTYPE int16
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#undef SCALAR_GENTYPE
-#define SCALAR_GENTYPE uint
-
-#define GENTYPE uint
-#define UGENTYPE uint
-#define SGENTYPE int
-#define SCALAR 1
-#include BODY
-#undef SCALAR
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE uint2
-#define UGENTYPE uint2
-#define SGENTYPE int2
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE uint3
-#define UGENTYPE uint3
-#define SGENTYPE int3
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE uint4
-#define UGENTYPE uint4
-#define SGENTYPE int4
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE uint8
-#define UGENTYPE uint8
-#define SGENTYPE int8
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE uint16
-#define UGENTYPE uint16
-#define SGENTYPE int16
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#undef GENSIZE
-#define GENSIZE 64
-#undef SCALAR_GENTYPE
-#define SCALAR_GENTYPE long
-
-#define GENTYPE long
-#define UGENTYPE ulong
-#define SGENTYPE long
-#define SCALAR 1
-#include BODY
-#undef SCALAR
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE long2
-#define UGENTYPE ulong2
-#define SGENTYPE long2
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE long3
-#define UGENTYPE ulong3
-#define SGENTYPE long3
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE long4
-#define UGENTYPE ulong4
-#define SGENTYPE long4
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE long8
-#define UGENTYPE ulong8
-#define SGENTYPE long8
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE long16
-#define UGENTYPE ulong16
-#define SGENTYPE long16
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#undef SCALAR_GENTYPE
-#define SCALAR_GENTYPE ulong
-
-#define GENTYPE ulong
-#define UGENTYPE ulong
-#define SGENTYPE long
-#define SCALAR 1
-#include BODY
-#undef SCALAR
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE ulong2
-#define UGENTYPE ulong2
-#define SGENTYPE long2
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE ulong3
-#define UGENTYPE ulong3
-#define SGENTYPE long3
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE ulong4
-#define UGENTYPE ulong4
-#define SGENTYPE long4
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE ulong8
-#define UGENTYPE ulong8
-#define SGENTYPE long8
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#define GENTYPE ulong16
-#define UGENTYPE ulong16
-#define SGENTYPE long16
-#include BODY
-#undef GENTYPE
-#undef UGENTYPE
-#undef SGENTYPE
-
-#undef GENSIZE
-#undef SCALAR_GENTYPE
-#undef BODY
+#define __CLC_GENSIZE 8
+#define __CLC_SCALAR_GENTYPE char
+
+#define __CLC_GENTYPE char
+#define __CLC_U_GENTYPE uchar
+#define __CLC_S_GENTYPE char
+#define __CLC_SCALAR 1
+#include __CLC_BODY
+#undef __CLC_SCALAR
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE char2
+#define __CLC_U_GENTYPE uchar2
+#define __CLC_S_GENTYPE char2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE char3
+#define __CLC_U_GENTYPE uchar3
+#define __CLC_S_GENTYPE char3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE char4
+#define __CLC_U_GENTYPE uchar4
+#define __CLC_S_GENTYPE char4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE char8
+#define __CLC_U_GENTYPE uchar8
+#define __CLC_S_GENTYPE char8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE char16
+#define __CLC_U_GENTYPE uchar16
+#define __CLC_S_GENTYPE char16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#undef __CLC_SCALAR_GENTYPE
+#define __CLC_SCALAR_GENTYPE uchar
+
+#define __CLC_GENTYPE uchar
+#define __CLC_U_GENTYPE uchar
+#define __CLC_S_GENTYPE char
+#define __CLC_SCALAR 1
+#include __CLC_BODY
+#undef __CLC_SCALAR
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uchar2
+#define __CLC_U_GENTYPE uchar2
+#define __CLC_S_GENTYPE char2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uchar3
+#define __CLC_U_GENTYPE uchar3
+#define __CLC_S_GENTYPE char3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uchar4
+#define __CLC_U_GENTYPE uchar4
+#define __CLC_S_GENTYPE char4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uchar8
+#define __CLC_U_GENTYPE uchar8
+#define __CLC_S_GENTYPE char8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uchar16
+#define __CLC_U_GENTYPE uchar16
+#define __CLC_S_GENTYPE char16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#undef __CLC_GENSIZE
+#define __CLC_GENSIZE 16
+#undef __CLC_SCALAR_GENTYPE
+#define __CLC_SCALAR_GENTYPE short
+
+#define __CLC_GENTYPE short
+#define __CLC_U_GENTYPE ushort
+#define __CLC_S_GENTYPE short
+#define __CLC_SCALAR 1
+#include __CLC_BODY
+#undef __CLC_SCALAR
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE short2
+#define __CLC_U_GENTYPE ushort2
+#define __CLC_S_GENTYPE short2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE short3
+#define __CLC_U_GENTYPE ushort3
+#define __CLC_S_GENTYPE short3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE short4
+#define __CLC_U_GENTYPE ushort4
+#define __CLC_S_GENTYPE short4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE short8
+#define __CLC_U_GENTYPE ushort8
+#define __CLC_S_GENTYPE short8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE short16
+#define __CLC_U_GENTYPE ushort16
+#define __CLC_S_GENTYPE short16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#undef __CLC_SCALAR_GENTYPE
+#define __CLC_SCALAR_GENTYPE ushort
+
+#define __CLC_GENTYPE ushort
+#define __CLC_U_GENTYPE ushort
+#define __CLC_S_GENTYPE short
+#define __CLC_SCALAR 1
+#include __CLC_BODY
+#undef __CLC_SCALAR
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE ushort2
+#define __CLC_U_GENTYPE ushort2
+#define __CLC_S_GENTYPE short2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE ushort3
+#define __CLC_U_GENTYPE ushort3
+#define __CLC_S_GENTYPE short3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE ushort4
+#define __CLC_U_GENTYPE ushort4
+#define __CLC_S_GENTYPE short4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE ushort8
+#define __CLC_U_GENTYPE ushort8
+#define __CLC_S_GENTYPE short8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE ushort16
+#define __CLC_U_GENTYPE ushort16
+#define __CLC_S_GENTYPE short16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#undef __CLC_GENSIZE
+#define __CLC_GENSIZE 32
+#undef __CLC_SCALAR_GENTYPE
+#define __CLC_SCALAR_GENTYPE int
+
+#define __CLC_GENTYPE int
+#define __CLC_U_GENTYPE uint
+#define __CLC_S_GENTYPE int
+#define __CLC_SCALAR 1
+#include __CLC_BODY
+#undef __CLC_SCALAR
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE int2
+#define __CLC_U_GENTYPE uint2
+#define __CLC_S_GENTYPE int2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE int3
+#define __CLC_U_GENTYPE uint3
+#define __CLC_S_GENTYPE int3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE int4
+#define __CLC_U_GENTYPE uint4
+#define __CLC_S_GENTYPE int4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE int8
+#define __CLC_U_GENTYPE uint8
+#define __CLC_S_GENTYPE int8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE int16
+#define __CLC_U_GENTYPE uint16
+#define __CLC_S_GENTYPE int16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#undef __CLC_SCALAR_GENTYPE
+#define __CLC_SCALAR_GENTYPE uint
+
+#define __CLC_GENTYPE uint
+#define __CLC_U_GENTYPE uint
+#define __CLC_S_GENTYPE int
+#define __CLC_SCALAR 1
+#include __CLC_BODY
+#undef __CLC_SCALAR
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uint2
+#define __CLC_U_GENTYPE uint2
+#define __CLC_S_GENTYPE int2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uint3
+#define __CLC_U_GENTYPE uint3
+#define __CLC_S_GENTYPE int3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uint4
+#define __CLC_U_GENTYPE uint4
+#define __CLC_S_GENTYPE int4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uint8
+#define __CLC_U_GENTYPE uint8
+#define __CLC_S_GENTYPE int8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uint16
+#define __CLC_U_GENTYPE uint16
+#define __CLC_S_GENTYPE int16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#undef __CLC_GENSIZE
+#define __CLC_GENSIZE 64
+#undef __CLC_SCALAR_GENTYPE
+#define __CLC_SCALAR_GENTYPE long
+
+#define __CLC_GENTYPE long
+#define __CLC_U_GENTYPE ulong
+#define __CLC_S_GENTYPE long
+#define __CLC_SCALAR 1
+#include __CLC_BODY
+#undef __CLC_SCALAR
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE long2
+#define __CLC_U_GENTYPE ulong2
+#define __CLC_S_GENTYPE long2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE long3
+#define __CLC_U_GENTYPE ulong3
+#define __CLC_S_GENTYPE long3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE long4
+#define __CLC_U_GENTYPE ulong4
+#define __CLC_S_GENTYPE long4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE long8
+#define __CLC_U_GENTYPE ulong8
+#define __CLC_S_GENTYPE long8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE long16
+#define __CLC_U_GENTYPE ulong16
+#define __CLC_S_GENTYPE long16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#undef __CLC_SCALAR_GENTYPE
+#define __CLC_SCALAR_GENTYPE ulong
+
+#define __CLC_GENTYPE ulong
+#define __CLC_U_GENTYPE ulong
+#define __CLC_S_GENTYPE long
+#define __CLC_SCALAR 1
+#include __CLC_BODY
+#undef __CLC_SCALAR
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE ulong2
+#define __CLC_U_GENTYPE ulong2
+#define __CLC_S_GENTYPE long2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE ulong3
+#define __CLC_U_GENTYPE ulong3
+#define __CLC_S_GENTYPE long3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE ulong4
+#define __CLC_U_GENTYPE ulong4
+#define __CLC_S_GENTYPE long4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE ulong8
+#define __CLC_U_GENTYPE ulong8
+#define __CLC_S_GENTYPE long8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE ulong16
+#define __CLC_U_GENTYPE ulong16
+#define __CLC_S_GENTYPE long16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#undef __CLC_GENSIZE
+#undef __CLC_SCALAR_GENTYPE
+#undef __CLC_BODY
diff --git a/generic/include/clc/integer/rotate.h b/generic/include/clc/integer/rotate.h
index e163bc8..6320223 100644
--- a/generic/include/clc/integer/rotate.h
+++ b/generic/include/clc/integer/rotate.h
@@ -1,2 +1,2 @@
-#define BODY <clc/integer/rotate.inc>
+#define __CLC_BODY <clc/integer/rotate.inc>
 #include <clc/integer/gentype.inc>
diff --git a/generic/include/clc/integer/rotate.inc b/generic/include/clc/integer/rotate.inc
index 5720e1c..c97711e 100644
--- a/generic/include/clc/integer/rotate.inc
+++ b/generic/include/clc/integer/rotate.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE rotate(GENTYPE x, GENTYPE y);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE rotate(__CLC_GENTYPE x, __CLC_GENTYPE y);
diff --git a/generic/include/clc/integer/sub_sat.h b/generic/include/clc/integer/sub_sat.h
index 942274d..f841529 100644
--- a/generic/include/clc/integer/sub_sat.h
+++ b/generic/include/clc/integer/sub_sat.h
@@ -1,2 +1,2 @@
-#define BODY <clc/integer/sub_sat.inc>
+#define __CLC_BODY <clc/integer/sub_sat.inc>
 #include <clc/integer/gentype.inc>
diff --git a/generic/include/clc/integer/sub_sat.inc b/generic/include/clc/integer/sub_sat.inc
index 3e0f8f9..425df2e 100644
--- a/generic/include/clc/integer/sub_sat.inc
+++ b/generic/include/clc/integer/sub_sat.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE sub_sat(GENTYPE x, GENTYPE y);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE sub_sat(__CLC_GENTYPE x, __CLC_GENTYPE y);
diff --git a/generic/include/clc/math/binary_decl.inc b/generic/include/clc/math/binary_decl.inc
index 1a49e26..70a7114 100644
--- a/generic/include/clc/math/binary_decl.inc
+++ b/generic/include/clc/math/binary_decl.inc
@@ -1,6 +1,6 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE FUNCTION(GENTYPE a, GENTYPE b);
-_CLC_OVERLOAD _CLC_DECL GENTYPE FUNCTION(GENTYPE a, float b);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, float b);
 
 #ifdef cl_khr_fp64
-_CLC_OVERLOAD _CLC_DECL GENTYPE FUNCTION(GENTYPE a, double b);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, double b);
 #endif
diff --git a/generic/include/clc/math/binary_intrin.inc b/generic/include/clc/math/binary_intrin.inc
index 30dec7b..cfbe741 100644
--- a/generic/include/clc/math/binary_intrin.inc
+++ b/generic/include/clc/math/binary_intrin.inc
@@ -1,18 +1,18 @@
-_CLC_OVERLOAD float FUNCTION(float, float) __asm(INTRINSIC ".f32");
-_CLC_OVERLOAD float2 FUNCTION(float2, float2) __asm(INTRINSIC ".v2f32");
-_CLC_OVERLOAD float3 FUNCTION(float3, float3) __asm(INTRINSIC ".v3f32");
-_CLC_OVERLOAD float4 FUNCTION(float4, float4) __asm(INTRINSIC ".v4f32");
-_CLC_OVERLOAD float8 FUNCTION(float8, float8) __asm(INTRINSIC ".v8f32");
-_CLC_OVERLOAD float16 FUNCTION(float16, float16) __asm(INTRINSIC ".v16f32");
+_CLC_OVERLOAD float __CLC_FUNCTION(float, float) __asm(__CLC_INTRINSIC ".f32");
+_CLC_OVERLOAD float2 __CLC_FUNCTION(float2, float2) __asm(__CLC_INTRINSIC ".v2f32");
+_CLC_OVERLOAD float3 __CLC_FUNCTION(float3, float3) __asm(__CLC_INTRINSIC ".v3f32");
+_CLC_OVERLOAD float4 __CLC_FUNCTION(float4, float4) __asm(__CLC_INTRINSIC ".v4f32");
+_CLC_OVERLOAD float8 __CLC_FUNCTION(float8, float8) __asm(__CLC_INTRINSIC ".v8f32");
+_CLC_OVERLOAD float16 __CLC_FUNCTION(float16, float16) __asm(__CLC_INTRINSIC ".v16f32");
 
 #ifdef cl_khr_fp64
-_CLC_OVERLOAD double FUNCTION(double, double) __asm(INTRINSIC ".f64");
-_CLC_OVERLOAD double2 FUNCTION(double2, double2) __asm(INTRINSIC ".v2f64");
-_CLC_OVERLOAD double3 FUNCTION(double3, double3) __asm(INTRINSIC ".v3f64");
-_CLC_OVERLOAD double4 FUNCTION(double4, double4) __asm(INTRINSIC ".v4f64");
-_CLC_OVERLOAD double8 FUNCTION(double8, double8) __asm(INTRINSIC ".v8f64");
-_CLC_OVERLOAD double16 FUNCTION(double16, double16) __asm(INTRINSIC ".v16f64");
+_CLC_OVERLOAD double __CLC_FUNCTION(double, double) __asm(__CLC_INTRINSIC ".f64");
+_CLC_OVERLOAD double2 __CLC_FUNCTION(double2, double2) __asm(__CLC_INTRINSIC ".v2f64");
+_CLC_OVERLOAD double3 __CLC_FUNCTION(double3, double3) __asm(__CLC_INTRINSIC ".v3f64");
+_CLC_OVERLOAD double4 __CLC_FUNCTION(double4, double4) __asm(__CLC_INTRINSIC ".v4f64");
+_CLC_OVERLOAD double8 __CLC_FUNCTION(double8, double8) __asm(__CLC_INTRINSIC ".v8f64");
+_CLC_OVERLOAD double16 __CLC_FUNCTION(double16, double16) __asm(__CLC_INTRINSIC ".v16f64");
 #endif
 
-#undef FUNCTION
-#undef INTRINSIC
+#undef __CLC_FUNCTION
+#undef __CLC_INTRINSIC
diff --git a/generic/include/clc/math/ceil.h b/generic/include/clc/math/ceil.h
index b8e4b08..5b40abf 100644
--- a/generic/include/clc/math/ceil.h
+++ b/generic/include/clc/math/ceil.h
@@ -1,6 +1,6 @@
 #undef ceil
 #define ceil __clc_ceil
 
-#define FUNCTION __clc_ceil
-#define INTRINSIC "llvm.ceil"
+#define __CLC_FUNCTION __clc_ceil
+#define __CLC_INTRINSIC "llvm.ceil"
 #include <clc/math/unary_intrin.inc>
diff --git a/generic/include/clc/math/cos.h b/generic/include/clc/math/cos.h
index e876c1a..974f9d1 100644
--- a/generic/include/clc/math/cos.h
+++ b/generic/include/clc/math/cos.h
@@ -1,6 +1,6 @@
 #undef cos
 #define cos __clc_cos
 
-#define FUNCTION __clc_cos
-#define INTRINSIC "llvm.cos"
+#define __CLC_FUNCTION __clc_cos
+#define __CLC_INTRINSIC "llvm.cos"
 #include <clc/math/unary_intrin.inc>
diff --git a/generic/include/clc/math/exp2.h b/generic/include/clc/math/exp2.h
index fe91633..ec0dad2 100644
--- a/generic/include/clc/math/exp2.h
+++ b/generic/include/clc/math/exp2.h
@@ -1,6 +1,6 @@
 #undef exp2
 #define exp2 __clc_exp2
 
-#define FUNCTION __clc_exp2
-#define INTRINSIC "llvm.exp2"
+#define __CLC_FUNCTION __clc_exp2
+#define __CLC_INTRINSIC "llvm.exp2"
 #include <clc/math/unary_intrin.inc>
diff --git a/generic/include/clc/math/fabs.h b/generic/include/clc/math/fabs.h
index 4da013c..ee2f893 100644
--- a/generic/include/clc/math/fabs.h
+++ b/generic/include/clc/math/fabs.h
@@ -1,6 +1,6 @@
 #undef fabs
 #define fabs __clc_fabs
 
-#define FUNCTION __clc_fabs
-#define INTRINSIC "llvm.fabs"
+#define __CLC_FUNCTION __clc_fabs
+#define __CLC_INTRINSIC "llvm.fabs"
 #include <clc/math/unary_intrin.inc>
diff --git a/generic/include/clc/math/floor.h b/generic/include/clc/math/floor.h
index abb7c2a..2337d35 100644
--- a/generic/include/clc/math/floor.h
+++ b/generic/include/clc/math/floor.h
@@ -1,6 +1,6 @@
 #undef floor
 #define floor __clc_floor
 
-#define FUNCTION __clc_floor
-#define INTRINSIC "llvm.floor"
+#define __CLC_FUNCTION __clc_floor
+#define __CLC_INTRINSIC "llvm.floor"
 #include <clc/math/unary_intrin.inc>
diff --git a/generic/include/clc/math/fma.h b/generic/include/clc/math/fma.h
index 8d862fa..02d39f6 100644
--- a/generic/include/clc/math/fma.h
+++ b/generic/include/clc/math/fma.h
@@ -1,6 +1,6 @@
 #undef fma
 #define fma __clc_fma
 
-#define FUNCTION __clc_fma
-#define INTRINSIC "llvm.fma"
+#define __CLC_FUNCTION __clc_fma
+#define __CLC_INTRINSIC "llvm.fma"
 #include <clc/math/ternary_intrin.inc>
diff --git a/generic/include/clc/math/fmax.h b/generic/include/clc/math/fmax.h
index d26e5d6..d6956af 100644
--- a/generic/include/clc/math/fmax.h
+++ b/generic/include/clc/math/fmax.h
@@ -1,11 +1,11 @@
 #undef fmax
 #define fmax __clc_fmax
 
-#define BODY <clc/math/binary_decl.inc>
-#define FUNCTION __clc_fmax
+#define __CLC_BODY <clc/math/binary_decl.inc>
+#define __CLC_FUNCTION __clc_fmax
 
 #include <clc/math/gentype.inc>
 
-#undef BODY
-#undef FUNCTION
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
 
diff --git a/generic/include/clc/math/fmin.h b/generic/include/clc/math/fmin.h
index 3506aef..5588ba9 100644
--- a/generic/include/clc/math/fmin.h
+++ b/generic/include/clc/math/fmin.h
@@ -1,11 +1,11 @@
 #undef fmin
 #define fmin __clc_fmin
 
-#define BODY <clc/math/binary_decl.inc>
-#define FUNCTION __clc_fmin
+#define __CLC_BODY <clc/math/binary_decl.inc>
+#define __CLC_FUNCTION __clc_fmin
 
 #include <clc/math/gentype.inc>
 
-#undef BODY
-#undef FUNCTION
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
 
diff --git a/generic/include/clc/math/gentype.inc b/generic/include/clc/math/gentype.inc
index 4ed2151..bff4f56 100644
--- a/generic/include/clc/math/gentype.inc
+++ b/generic/include/clc/math/gentype.inc
@@ -1,63 +1,63 @@
-#define SCALAR_GENTYPE float
+#define __CLC_SCALAR_GENTYPE float
 
-#define GENTYPE float
-#define SCALAR
-#include BODY
-#undef GENTYPE
-#undef SCALAR
+#define __CLC_GENTYPE float
+#define __CLC_SCALAR
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_SCALAR
 
-#define GENTYPE float2
-#include BODY
-#undef GENTYPE
+#define __CLC_GENTYPE float2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
 
-#define GENTYPE float3
-#include BODY
-#undef GENTYPE
+#define __CLC_GENTYPE float3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
 
-#define GENTYPE float4
-#include BODY
-#undef GENTYPE
+#define __CLC_GENTYPE float4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
 
-#define GENTYPE float8
-#include BODY
-#undef GENTYPE
+#define __CLC_GENTYPE float8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
 
-#define GENTYPE float16
-#include BODY
-#undef GENTYPE
+#define __CLC_GENTYPE float16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
 
-#undef SCALAR_GENTYPE
+#undef __CLC_SCALAR_GENTYPE
 
 #ifdef cl_khr_fp64
-#define SCALAR_GENTYPE double
+#define __CLC_SCALAR_GENTYPE double
 
-#define SCALAR
-#define GENTYPE double
-#include BODY
-#undef GENTYPE
-#undef SCALAR
+#define __CLC_SCALAR
+#define __CLC_GENTYPE double
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+#undef __CLC_SCALAR
 
-#define GENTYPE double2
-#include BODY
-#undef GENTYPE
+#define __CLC_GENTYPE double2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
 
-#define GENTYPE double3
-#include BODY
-#undef GENTYPE
+#define __CLC_GENTYPE double3
+#include __CLC_BODY
+#undef __CLC_GENTYPE
 
-#define GENTYPE double4
-#include BODY
-#undef GENTYPE
+#define __CLC_GENTYPE double4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
 
-#define GENTYPE double8
-#include BODY
-#undef GENTYPE
+#define __CLC_GENTYPE double8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
 
-#define GENTYPE double16
-#include BODY
-#undef GENTYPE
+#define __CLC_GENTYPE double16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
 
-#undef SCALAR_GENTYPE
+#undef __CLC_SCALAR_GENTYPE
 #endif
 
-#undef BODY
+#undef __CLC_BODY
diff --git a/generic/include/clc/math/hypot.h b/generic/include/clc/math/hypot.h
index 9ffda48..c00eb45 100644
--- a/generic/include/clc/math/hypot.h
+++ b/generic/include/clc/math/hypot.h
@@ -1,2 +1,2 @@
-#define BODY <clc/math/hypot.inc>
+#define __CLC_BODY <clc/math/hypot.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/include/clc/math/hypot.inc b/generic/include/clc/math/hypot.inc
index 2f97ee5..08b4605 100644
--- a/generic/include/clc/math/hypot.inc
+++ b/generic/include/clc/math/hypot.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE hypot(GENTYPE x, GENTYPE y);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE hypot(__CLC_GENTYPE x, __CLC_GENTYPE y);
diff --git a/generic/include/clc/math/log2.h b/generic/include/clc/math/log2.h
index d8a8842..8801240 100644
--- a/generic/include/clc/math/log2.h
+++ b/generic/include/clc/math/log2.h
@@ -1,6 +1,6 @@
 #undef log2
 #define log2 __clc_log2
 
-#define FUNCTION __clc_log2
-#define INTRINSIC "llvm.log2"
+#define __CLC_FUNCTION __clc_log2
+#define __CLC_INTRINSIC "llvm.log2"
 #include <clc/math/unary_intrin.inc>
diff --git a/generic/include/clc/math/mad.h b/generic/include/clc/math/mad.h
index 58f7674..c4e5084 100644
--- a/generic/include/clc/math/mad.h
+++ b/generic/include/clc/math/mad.h
@@ -1,2 +1,2 @@
-#define BODY <clc/math/mad.inc>
+#define __CLC_BODY <clc/math/mad.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/include/clc/math/mad.inc b/generic/include/clc/math/mad.inc
index 5200d67..61194b6 100644
--- a/generic/include/clc/math/mad.inc
+++ b/generic/include/clc/math/mad.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE mad(GENTYPE a, GENTYPE b, GENTYPE c);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mad(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c);
diff --git a/generic/include/clc/math/pow.h b/generic/include/clc/math/pow.h
index 208d06d..320d341 100644
--- a/generic/include/clc/math/pow.h
+++ b/generic/include/clc/math/pow.h
@@ -1,6 +1,6 @@
 #undef pow
 #define pow __clc_pow
 
-#define FUNCTION __clc_pow
-#define INTRINSIC "llvm.pow"
+#define __CLC_FUNCTION __clc_pow
+#define __CLC_INTRINSIC "llvm.pow"
 #include <clc/math/binary_intrin.inc>
diff --git a/generic/include/clc/math/sin.h b/generic/include/clc/math/sin.h
index 2216804..e50b71a 100644
--- a/generic/include/clc/math/sin.h
+++ b/generic/include/clc/math/sin.h
@@ -1,6 +1,6 @@
 #undef sin
 #define sin __clc_sin
 
-#define FUNCTION __clc_sin
-#define INTRINSIC "llvm.sin"
+#define __CLC_FUNCTION __clc_sin
+#define __CLC_INTRINSIC "llvm.sin"
 #include <clc/math/unary_intrin.inc>
diff --git a/generic/include/clc/math/sqrt.h b/generic/include/clc/math/sqrt.h
index a000e24..f69de84 100644
--- a/generic/include/clc/math/sqrt.h
+++ b/generic/include/clc/math/sqrt.h
@@ -1,6 +1,6 @@
 #undef sqrt
 #define sqrt __clc_sqrt
 
-#define FUNCTION __clc_sqrt
-#define INTRINSIC "llvm.sqrt"
+#define __CLC_FUNCTION __clc_sqrt
+#define __CLC_INTRINSIC "llvm.sqrt"
 #include <clc/math/unary_intrin.inc>
diff --git a/generic/include/clc/math/ternary_intrin.inc b/generic/include/clc/math/ternary_intrin.inc
index 7d451e9..9633696 100644
--- a/generic/include/clc/math/ternary_intrin.inc
+++ b/generic/include/clc/math/ternary_intrin.inc
@@ -1,18 +1,18 @@
-_CLC_OVERLOAD float FUNCTION(float, float, float) __asm(INTRINSIC ".f32");
-_CLC_OVERLOAD float2 FUNCTION(float2, float2, float2) __asm(INTRINSIC ".v2f32");
-_CLC_OVERLOAD float3 FUNCTION(float3, float3, float3) __asm(INTRINSIC ".v3f32");
-_CLC_OVERLOAD float4 FUNCTION(float4, float4, float4) __asm(INTRINSIC ".v4f32");
-_CLC_OVERLOAD float8 FUNCTION(float8, float8, float8) __asm(INTRINSIC ".v8f32");
-_CLC_OVERLOAD float16 FUNCTION(float16, float16, float16) __asm(INTRINSIC ".v16f32");
+_CLC_OVERLOAD float __CLC_FUNCTION(float, float, float) __asm(__CLC_INTRINSIC ".f32");
+_CLC_OVERLOAD float2 __CLC_FUNCTION(float2, float2, float2) __asm(__CLC_INTRINSIC ".v2f32");
+_CLC_OVERLOAD float3 __CLC_FUNCTION(float3, float3, float3) __asm(__CLC_INTRINSIC ".v3f32");
+_CLC_OVERLOAD float4 __CLC_FUNCTION(float4, float4, float4) __asm(__CLC_INTRINSIC ".v4f32");
+_CLC_OVERLOAD float8 __CLC_FUNCTION(float8, float8, float8) __asm(__CLC_INTRINSIC ".v8f32");
+_CLC_OVERLOAD float16 __CLC_FUNCTION(float16, float16, float16) __asm(__CLC_INTRINSIC ".v16f32");
 
 #ifdef cl_khr_fp64
-_CLC_OVERLOAD double FUNCTION(double, double, double) __asm(INTRINSIC ".f64");
-_CLC_OVERLOAD double2 FUNCTION(double2, double2, double2) __asm(INTRINSIC ".v2f64");
-_CLC_OVERLOAD double3 FUNCTION(double3, double3, double3) __asm(INTRINSIC ".v3f64");
-_CLC_OVERLOAD double4 FUNCTION(double4, double4, double4) __asm(INTRINSIC ".v4f64");
-_CLC_OVERLOAD double8 FUNCTION(double8, double8, double8) __asm(INTRINSIC ".v8f64");
-_CLC_OVERLOAD double16 FUNCTION(double16, double16, double16) __asm(INTRINSIC ".v16f64");
+_CLC_OVERLOAD double __CLC_FUNCTION(double, double, double) __asm(__CLC_INTRINSIC ".f64");
+_CLC_OVERLOAD double2 __CLC_FUNCTION(double2, double2, double2) __asm(__CLC_INTRINSIC ".v2f64");
+_CLC_OVERLOAD double3 __CLC_FUNCTION(double3, double3, double3) __asm(__CLC_INTRINSIC ".v3f64");
+_CLC_OVERLOAD double4 __CLC_FUNCTION(double4, double4, double4) __asm(__CLC_INTRINSIC ".v4f64");
+_CLC_OVERLOAD double8 __CLC_FUNCTION(double8, double8, double8) __asm(__CLC_INTRINSIC ".v8f64");
+_CLC_OVERLOAD double16 __CLC_FUNCTION(double16, double16, double16) __asm(__CLC_INTRINSIC ".v16f64");
 #endif
 
-#undef FUNCTION
-#undef INTRINSIC
+#undef __CLC_FUNCTION
+#undef __CLC_INTRINSIC
diff --git a/generic/include/clc/math/unary_decl.inc b/generic/include/clc/math/unary_decl.inc
index 392c4d6..9858d90 100644
--- a/generic/include/clc/math/unary_decl.inc
+++ b/generic/include/clc/math/unary_decl.inc
@@ -1 +1 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE FUNCTION(GENTYPE x);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x);
diff --git a/generic/include/clc/math/unary_intrin.inc b/generic/include/clc/math/unary_intrin.inc
index 2da5a9c..8c62d88 100644
--- a/generic/include/clc/math/unary_intrin.inc
+++ b/generic/include/clc/math/unary_intrin.inc
@@ -1,18 +1,18 @@
-_CLC_OVERLOAD float FUNCTION(float f) __asm(INTRINSIC ".f32");
-_CLC_OVERLOAD float2 FUNCTION(float2 f) __asm(INTRINSIC ".v2f32");
-_CLC_OVERLOAD float3 FUNCTION(float3 f) __asm(INTRINSIC ".v3f32");
-_CLC_OVERLOAD float4 FUNCTION(float4 f) __asm(INTRINSIC ".v4f32");
-_CLC_OVERLOAD float8 FUNCTION(float8 f) __asm(INTRINSIC ".v8f32");
-_CLC_OVERLOAD float16 FUNCTION(float16 f) __asm(INTRINSIC ".v16f32");
+_CLC_OVERLOAD float __CLC_FUNCTION(float f) __asm(__CLC_INTRINSIC ".f32");
+_CLC_OVERLOAD float2 __CLC_FUNCTION(float2 f) __asm(__CLC_INTRINSIC ".v2f32");
+_CLC_OVERLOAD float3 __CLC_FUNCTION(float3 f) __asm(__CLC_INTRINSIC ".v3f32");
+_CLC_OVERLOAD float4 __CLC_FUNCTION(float4 f) __asm(__CLC_INTRINSIC ".v4f32");
+_CLC_OVERLOAD float8 __CLC_FUNCTION(float8 f) __asm(__CLC_INTRINSIC ".v8f32");
+_CLC_OVERLOAD float16 __CLC_FUNCTION(float16 f) __asm(__CLC_INTRINSIC ".v16f32");
 
 #ifdef cl_khr_fp64
-_CLC_OVERLOAD double FUNCTION(double d) __asm(INTRINSIC ".f64");
-_CLC_OVERLOAD double2 FUNCTION(double2 d) __asm(INTRINSIC ".v2f64");
-_CLC_OVERLOAD double3 FUNCTION(double3 d) __asm(INTRINSIC ".v3f64");
-_CLC_OVERLOAD double4 FUNCTION(double4 d) __asm(INTRINSIC ".v4f64");
-_CLC_OVERLOAD double8 FUNCTION(double8 d) __asm(INTRINSIC ".v8f64");
-_CLC_OVERLOAD double16 FUNCTION(double16 d) __asm(INTRINSIC ".v16f64");
+_CLC_OVERLOAD double __CLC_FUNCTION(double d) __asm(__CLC_INTRINSIC ".f64");
+_CLC_OVERLOAD double2 __CLC_FUNCTION(double2 d) __asm(__CLC_INTRINSIC ".v2f64");
+_CLC_OVERLOAD double3 __CLC_FUNCTION(double3 d) __asm(__CLC_INTRINSIC ".v3f64");
+_CLC_OVERLOAD double4 __CLC_FUNCTION(double4 d) __asm(__CLC_INTRINSIC ".v4f64");
+_CLC_OVERLOAD double8 __CLC_FUNCTION(double8 d) __asm(__CLC_INTRINSIC ".v8f64");
+_CLC_OVERLOAD double16 __CLC_FUNCTION(double16 d) __asm(__CLC_INTRINSIC ".v16f64");
 #endif
 
-#undef FUNCTION
-#undef INTRINSIC
+#undef __CLC_FUNCTION
+#undef __CLC_INTRINSIC
diff --git a/generic/include/clc/shared/clamp.h b/generic/include/clc/shared/clamp.h
index 5c2ebd0..a389b85 100644
--- a/generic/include/clc/shared/clamp.h
+++ b/generic/include/clc/shared/clamp.h
@@ -1,5 +1,5 @@
-#define BODY <clc/shared/clamp.inc>
+#define __CLC_BODY <clc/shared/clamp.inc>
 #include <clc/integer/gentype.inc>
 
-#define BODY <clc/shared/clamp.inc>
+#define __CLC_BODY <clc/shared/clamp.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/include/clc/shared/clamp.inc b/generic/include/clc/shared/clamp.inc
index 67c8142..aaff9d0 100644
--- a/generic/include/clc/shared/clamp.inc
+++ b/generic/include/clc/shared/clamp.inc
@@ -1,5 +1,5 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE clamp(GENTYPE x, GENTYPE y, GENTYPE z);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z);
 
-#ifndef SCALAR
-_CLC_OVERLOAD _CLC_DECL GENTYPE clamp(GENTYPE x, SCALAR_GENTYPE y, SCALAR_GENTYPE z);
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_SCALAR_GENTYPE y, __CLC_SCALAR_GENTYPE z);
 #endif
diff --git a/generic/include/clc/shared/max.h b/generic/include/clc/shared/max.h
index 7967d4a..ee20b9e 100644
--- a/generic/include/clc/shared/max.h
+++ b/generic/include/clc/shared/max.h
@@ -1,5 +1,5 @@
-#define BODY <clc/shared/max.inc>
+#define __CLC_BODY <clc/shared/max.inc>
 #include <clc/integer/gentype.inc>
 
-#define BODY <clc/shared/max.inc>
+#define __CLC_BODY <clc/shared/max.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/include/clc/shared/max.inc b/generic/include/clc/shared/max.inc
index 9fe73c4..5901074 100644
--- a/generic/include/clc/shared/max.inc
+++ b/generic/include/clc/shared/max.inc
@@ -1,5 +1,5 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE max(GENTYPE a, GENTYPE b);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_GENTYPE b);
 
-#ifndef SCALAR
-_CLC_OVERLOAD _CLC_DECL GENTYPE max(GENTYPE a, SCALAR_GENTYPE b);
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b);
 #endif
diff --git a/generic/include/clc/shared/min.h b/generic/include/clc/shared/min.h
index e16b45d..e11d9f9 100644
--- a/generic/include/clc/shared/min.h
+++ b/generic/include/clc/shared/min.h
@@ -1,5 +1,5 @@
-#define BODY <clc/shared/min.inc>
+#define __CLC_BODY <clc/shared/min.inc>
 #include <clc/integer/gentype.inc>
 
-#define BODY <clc/shared/min.inc>
+#define __CLC_BODY <clc/shared/min.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/include/clc/shared/min.inc b/generic/include/clc/shared/min.inc
index cf3afaf..d8c1568 100644
--- a/generic/include/clc/shared/min.inc
+++ b/generic/include/clc/shared/min.inc
@@ -1,5 +1,5 @@
-_CLC_OVERLOAD _CLC_DECL GENTYPE min(GENTYPE a, GENTYPE b);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_GENTYPE b);
 
-#ifndef SCALAR
-_CLC_OVERLOAD _CLC_DECL GENTYPE min(GENTYPE a, SCALAR_GENTYPE b);
-#endif
\ No newline at end of file
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b);
+#endif
diff --git a/generic/lib/geometric/length.cl b/generic/lib/geometric/length.cl
index e26f2b8..ef087c7 100644
--- a/generic/lib/geometric/length.cl
+++ b/generic/lib/geometric/length.cl
@@ -4,5 +4,5 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif
 
-#define BODY <length.inc>
+#define __CLC_BODY <length.inc>
 #include <clc/geometric/floatn.inc>
diff --git a/generic/lib/geometric/length.inc b/generic/lib/geometric/length.inc
index 494789c..5faaaff 100644
--- a/generic/lib/geometric/length.inc
+++ b/generic/lib/geometric/length.inc
@@ -1,3 +1,3 @@
-_CLC_OVERLOAD _CLC_DEF FLOAT length(FLOATN p) {
+_CLC_OVERLOAD _CLC_DEF __CLC_FLOAT length(__CLC_FLOATN p) {
   return native_sqrt(dot(p, p));
 }
diff --git a/generic/lib/geometric/normalize.cl b/generic/lib/geometric/normalize.cl
index 70d255d..b06b2fe 100644
--- a/generic/lib/geometric/normalize.cl
+++ b/generic/lib/geometric/normalize.cl
@@ -4,5 +4,5 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif
 
-#define BODY <normalize.inc>
+#define __CLC_BODY <normalize.inc>
 #include <clc/geometric/floatn.inc>
diff --git a/generic/lib/geometric/normalize.inc b/generic/lib/geometric/normalize.inc
index a23908b..423ff79 100644
--- a/generic/lib/geometric/normalize.inc
+++ b/generic/lib/geometric/normalize.inc
@@ -1,3 +1,3 @@
-_CLC_OVERLOAD _CLC_DEF FLOATN normalize(FLOATN p) {
+_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN normalize(__CLC_FLOATN p) {
   return p/length(p);
 }
diff --git a/generic/lib/integer/abs.cl b/generic/lib/integer/abs.cl
index 86f1a34..faff8d0 100644
--- a/generic/lib/integer/abs.cl
+++ b/generic/lib/integer/abs.cl
@@ -1,4 +1,4 @@
 #include <clc/clc.h>
 
-#define BODY <abs.inc>
+#define __CLC_BODY <abs.inc>
 #include <clc/integer/gentype.inc>
diff --git a/generic/lib/integer/abs.inc b/generic/lib/integer/abs.inc
index fff6691..cfe7bfe 100644
--- a/generic/lib/integer/abs.inc
+++ b/generic/lib/integer/abs.inc
@@ -1,3 +1,3 @@
-_CLC_OVERLOAD _CLC_DEF UGENTYPE abs(GENTYPE x) {
-  return __builtin_astype((GENTYPE)(x > (GENTYPE)(0) ? x : -x), UGENTYPE);
+_CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE abs(__CLC_GENTYPE x) {
+  return __builtin_astype((__CLC_GENTYPE)(x > (__CLC_GENTYPE)(0) ? x : -x), __CLC_U_GENTYPE);
 }
diff --git a/generic/lib/integer/abs_diff.cl b/generic/lib/integer/abs_diff.cl
index c9ca821..3d75105 100644
--- a/generic/lib/integer/abs_diff.cl
+++ b/generic/lib/integer/abs_diff.cl
@@ -1,4 +1,4 @@
 #include <clc/clc.h>
 
-#define BODY <abs_diff.inc>
+#define __CLC_BODY <abs_diff.inc>
 #include <clc/integer/gentype.inc>
diff --git a/generic/lib/integer/abs_diff.inc b/generic/lib/integer/abs_diff.inc
index 6ad57ee..f39c3ff 100644
--- a/generic/lib/integer/abs_diff.inc
+++ b/generic/lib/integer/abs_diff.inc
@@ -1,3 +1,3 @@
-_CLC_OVERLOAD _CLC_DEF UGENTYPE abs_diff(GENTYPE x, GENTYPE y) {
-  return __builtin_astype((GENTYPE)(x > y ? x-y : y-x), UGENTYPE);
+_CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE abs_diff(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+  return __builtin_astype((__CLC_GENTYPE)(x > y ? x-y : y-x), __CLC_U_GENTYPE);
 }
diff --git a/generic/lib/integer/rotate.cl b/generic/lib/integer/rotate.cl
index d7eff2b..27ce515 100644
--- a/generic/lib/integer/rotate.cl
+++ b/generic/lib/integer/rotate.cl
@@ -1,4 +1,4 @@
 #include <clc/clc.h>
 
-#define BODY <rotate.inc>
+#define __CLC_BODY <rotate.inc>
 #include <clc/integer/gentype.inc>
diff --git a/generic/lib/integer/rotate.inc b/generic/lib/integer/rotate.inc
index 2aa6cc9..33bb0a8 100644
--- a/generic/lib/integer/rotate.inc
+++ b/generic/lib/integer/rotate.inc
@@ -7,36 +7,36 @@
  * Eventually, someone should feel free to implement an llvm-specific version
  */
 
-_CLC_OVERLOAD _CLC_DEF GENTYPE rotate(GENTYPE x, GENTYPE n){
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE rotate(__CLC_GENTYPE x, __CLC_GENTYPE n){
     //Try to avoid extra work if someone's spinning the value through multiple
     //full rotations
-    n = n % (GENTYPE)GENSIZE;
+    n = n % (__CLC_GENTYPE)__CLC_GENSIZE;
 
-#ifdef SCALAR
+#ifdef __CLC_SCALAR
     if (n > 0){
-        return (x << n) | (((UGENTYPE)x) >> (GENSIZE - n));
+        return (x << n) | (((__CLC_U_GENTYPE)x) >> (__CLC_GENSIZE - n));
     } else if (n == 0){
         return x;
     } else {
-        return ( (((UGENTYPE)x) >> -n) | (x << (GENSIZE + n)) );
+        return ( (((__CLC_U_GENTYPE)x) >> -n) | (x << (__CLC_GENSIZE + n)) );
     }
 #else
     //XXX: There's a lot of __builtin_astype calls to cast everything to
-    //     unsigned ... This should be improved so that if GENTYPE==UGENTYPE, no
+    //     unsigned ... This should be improved so that if __CLC_GENTYPE==__CLC_U_GENTYPE, no
     //     casts are required.
     
-    UGENTYPE x_1 = __builtin_astype(x, UGENTYPE);
+    __CLC_U_GENTYPE x_1 = __builtin_astype(x, __CLC_U_GENTYPE);
 
-    //XXX: Is (UGENTYPE >> SGENTYPE) | (UGENTYPE << SGENTYPE) legal?
+    //XXX: Is (__CLC_U_GENTYPE >> S__CLC_GENTYPE) | (__CLC_U_GENTYPE << S__CLC_GENTYPE) legal?
     //     If so, then combine the amt and shifts into a single set of statements
     
-    UGENTYPE amt;
-    amt = (n < (GENTYPE)0 ? __builtin_astype((GENTYPE)0-n, UGENTYPE) : (UGENTYPE)0);
-    x_1 = (x_1 >> amt) | (x_1 << ((UGENTYPE)GENSIZE - amt));
+    __CLC_U_GENTYPE amt;
+    amt = (n < (__CLC_GENTYPE)0 ? __builtin_astype((__CLC_GENTYPE)0-n, __CLC_U_GENTYPE) : (__CLC_U_GENTYPE)0);
+    x_1 = (x_1 >> amt) | (x_1 << ((__CLC_U_GENTYPE)__CLC_GENSIZE - amt));
 
-    amt = (n < (GENTYPE)0 ? (UGENTYPE)0 : __builtin_astype(n, UGENTYPE));
-    x_1 = (x_1 << amt) | (x_1 >> ((UGENTYPE)GENSIZE - amt));
+    amt = (n < (__CLC_GENTYPE)0 ? (__CLC_U_GENTYPE)0 : __builtin_astype(n, __CLC_U_GENTYPE));
+    x_1 = (x_1 << amt) | (x_1 >> ((__CLC_U_GENTYPE)__CLC_GENSIZE - amt));
 
-    return __builtin_astype(x_1, GENTYPE);
+    return __builtin_astype(x_1, __CLC_GENTYPE);
 #endif
-}
\ No newline at end of file
+}
diff --git a/generic/lib/math/binary_impl.inc b/generic/lib/math/binary_impl.inc
index e4b1e5f..83872d2 100644
--- a/generic/lib/math/binary_impl.inc
+++ b/generic/lib/math/binary_impl.inc
@@ -1,18 +1,18 @@
 
-#ifndef SCALAR
+#ifndef __CLC_SCALAR
 
-_CLC_OVERLOAD _CLC_DEF GENTYPE FUNCTION(GENTYPE x, GENTYPE y) {
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, __CLC_GENTYPE y) {
   return FUNCTION_IMPL(x, y);
 }
 
 #endif
 
-_CLC_OVERLOAD _CLC_DEF GENTYPE FUNCTION(GENTYPE x, double y) {
-  GENTYPE vec_y = (GENTYPE) (y);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, double y) {
+  __CLC_GENTYPE vec_y = (__CLC_GENTYPE) (y);
   return FUNCTION_IMPL(x, vec_y);
 }
 
-_CLC_OVERLOAD _CLC_DEF GENTYPE FUNCTION(GENTYPE x, float y) {
-  GENTYPE vec_y = (GENTYPE) (y);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, float y) {
+  __CLC_GENTYPE vec_y = (__CLC_GENTYPE) (y);
   return FUNCTION_IMPL(x, vec_y);
 }
diff --git a/generic/lib/math/fmax.cl b/generic/lib/math/fmax.cl
index 68a67ac..58583d6 100644
--- a/generic/lib/math/fmax.cl
+++ b/generic/lib/math/fmax.cl
@@ -7,5 +7,5 @@
 #define FUNCTION __clc_fmax
 #define FUNCTION_IMPL(x, y) ((x) < (y) ? (y) : (x))
 
-#define BODY <binary_impl.inc>
+#define __CLC_BODY <binary_impl.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/lib/math/fmin.cl b/generic/lib/math/fmin.cl
index cac188e..a61ad47 100644
--- a/generic/lib/math/fmin.cl
+++ b/generic/lib/math/fmin.cl
@@ -7,5 +7,5 @@
 #define FUNCTION __clc_fmin
 #define FUNCTION_IMPL(x, y) ((y) < (x) ? (y) : (x))
 
-#define BODY <binary_impl.inc>
+#define __CLC_BODY <binary_impl.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/lib/math/hypot.cl b/generic/lib/math/hypot.cl
index dcdc1ed..eca042c 100644
--- a/generic/lib/math/hypot.cl
+++ b/generic/lib/math/hypot.cl
@@ -4,5 +4,5 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif
 
-#define BODY <hypot.inc>
+#define __CLC_BODY <hypot.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/lib/math/hypot.inc b/generic/lib/math/hypot.inc
index 3f529c8..036cee7 100644
--- a/generic/lib/math/hypot.inc
+++ b/generic/lib/math/hypot.inc
@@ -1,3 +1,3 @@
-_CLC_OVERLOAD _CLC_DEF GENTYPE hypot(GENTYPE x, GENTYPE y) {
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE hypot(__CLC_GENTYPE x, __CLC_GENTYPE y) {
   return sqrt(x*x + y*y);
 }
diff --git a/generic/lib/math/mad.cl b/generic/lib/math/mad.cl
index e66e204..6c7b90d 100644
--- a/generic/lib/math/mad.cl
+++ b/generic/lib/math/mad.cl
@@ -4,5 +4,5 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif
 
-#define BODY <mad.inc>
+#define __CLC_BODY <mad.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/lib/math/mad.inc b/generic/lib/math/mad.inc
index aec9c06..d32c783 100644
--- a/generic/lib/math/mad.inc
+++ b/generic/lib/math/mad.inc
@@ -1,3 +1,3 @@
-_CLC_OVERLOAD _CLC_DEF GENTYPE mad(GENTYPE a, GENTYPE b, GENTYPE c) {
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) {
   return a * b + c;
 }
diff --git a/generic/lib/shared/clamp.cl b/generic/lib/shared/clamp.cl
index 0e8d223..c79a358 100644
--- a/generic/lib/shared/clamp.cl
+++ b/generic/lib/shared/clamp.cl
@@ -1,11 +1,11 @@
 #include <clc/clc.h>
 
-#define BODY <clamp.inc>
+#define __CLC_BODY <clamp.inc>
 #include <clc/integer/gentype.inc>
 
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif
 
-#define BODY <clamp.inc>
+#define __CLC_BODY <clamp.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/lib/shared/clamp.inc b/generic/lib/shared/clamp.inc
index 58370d3..c918f9c 100644
--- a/generic/lib/shared/clamp.inc
+++ b/generic/lib/shared/clamp.inc
@@ -1,9 +1,9 @@
-_CLC_OVERLOAD _CLC_DEF GENTYPE clamp(GENTYPE x, GENTYPE y, GENTYPE z) {
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z) {
   return (x > z ? z : (x < y ? y : x));
 }
 
-#ifndef SCALAR
-_CLC_OVERLOAD _CLC_DEF GENTYPE clamp(GENTYPE x, SCALAR_GENTYPE y, SCALAR_GENTYPE z) {
-  return (x > (GENTYPE)z ? (GENTYPE)z : (x < (GENTYPE)y ? (GENTYPE)y : x));
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_SCALAR_GENTYPE y, __CLC_SCALAR_GENTYPE z) {
+  return (x > (__CLC_GENTYPE)z ? (__CLC_GENTYPE)z : (x < (__CLC_GENTYPE)y ? (__CLC_GENTYPE)y : x));
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/generic/lib/shared/max.cl b/generic/lib/shared/max.cl
index 5a48537..1c4457c 100644
--- a/generic/lib/shared/max.cl
+++ b/generic/lib/shared/max.cl
@@ -1,11 +1,11 @@
 #include <clc/clc.h>
 
-#define BODY <max.inc>
+#define __CLC_BODY <max.inc>
 #include <clc/integer/gentype.inc>
 
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif
 
-#define BODY <max.inc>
+#define __CLC_BODY <max.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/lib/shared/max.inc b/generic/lib/shared/max.inc
index 6a12b6f..75a24c0 100644
--- a/generic/lib/shared/max.inc
+++ b/generic/lib/shared/max.inc
@@ -1,9 +1,9 @@
-_CLC_OVERLOAD _CLC_DEF GENTYPE max(GENTYPE a, GENTYPE b) {
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_GENTYPE b) {
   return (a > b ? a : b);
 }
 
-#ifndef SCALAR
-_CLC_OVERLOAD _CLC_DEF GENTYPE max(GENTYPE a, SCALAR_GENTYPE b) {
-  return (a > (GENTYPE)b ? a : (GENTYPE)b);
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) {
+  return (a > (__CLC_GENTYPE)b ? a : (__CLC_GENTYPE)b);
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/generic/lib/shared/min.cl b/generic/lib/shared/min.cl
index 49481cb..433087a 100644
--- a/generic/lib/shared/min.cl
+++ b/generic/lib/shared/min.cl
@@ -1,11 +1,11 @@
 #include <clc/clc.h>
 
-#define BODY <min.inc>
+#define __CLC_BODY <min.inc>
 #include <clc/integer/gentype.inc>
 
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif
 
-#define BODY <min.inc>
+#define __CLC_BODY <min.inc>
 #include <clc/math/gentype.inc>
diff --git a/generic/lib/shared/min.inc b/generic/lib/shared/min.inc
index 58a22e1..fe42864 100644
--- a/generic/lib/shared/min.inc
+++ b/generic/lib/shared/min.inc
@@ -1,9 +1,9 @@
-_CLC_OVERLOAD _CLC_DEF GENTYPE min(GENTYPE a, GENTYPE b) {
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_GENTYPE b) {
   return (a < b ? a : b);
 }
 
-#ifndef SCALAR
-_CLC_OVERLOAD _CLC_DEF GENTYPE min(GENTYPE a, SCALAR_GENTYPE b) {
-  return (a < (GENTYPE)b ? a : (GENTYPE)b);
+#ifndef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) {
+  return (a < (__CLC_GENTYPE)b ? a : (__CLC_GENTYPE)b);
 }
 #endif
diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
index f6ebd37..4dd7918 100644
--- a/generic/lib/shared/vload.cl
+++ b/generic/lib/shared/vload.cl
@@ -21,11 +21,11 @@
     return (PRIM_TYPE##16)(vload8(offset, x), vload8(offset+8, x)); \
   } \
 
-#define VLOAD_ADDR_SPACES(SCALAR_GENTYPE) \
-    VLOAD_VECTORIZE(SCALAR_GENTYPE, __private) \
-    VLOAD_VECTORIZE(SCALAR_GENTYPE, __local) \
-    VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
-    VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
+#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
+    VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
+    VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
+    VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
+    VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \
 
 //int/uint are special... see below
 #define VLOAD_TYPES() \
@@ -93,4 +93,4 @@ _CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
 }
 _CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
   return __clc_vload16_uint__global(offset, x);
-}
\ No newline at end of file
+}
diff --git a/generic/lib/shared/vstore.cl b/generic/lib/shared/vstore.cl
index 5b84f47..17c2c4c 100644
--- a/generic/lib/shared/vstore.cl
+++ b/generic/lib/shared/vstore.cl
@@ -29,10 +29,10 @@
     vstore8(vec.hi, offset+8, mem); \
   } \
 
-#define VSTORE_ADDR_SPACES(SCALAR_GENTYPE) \
-    VSTORE_VECTORIZE(SCALAR_GENTYPE, __private) \
-    VSTORE_VECTORIZE(SCALAR_GENTYPE, __local) \
-    VSTORE_VECTORIZE(SCALAR_GENTYPE, __global) \
+#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
+    VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
+    VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
+    VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \
 
 //int/uint are special... see below
 #define VSTORE_TYPES() \
-- 
cgit v1.2.3


From 045f1a8fe7dbd917fe12c9415dac47047f3a5a0b Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Mon, 8 Jul 2013 17:27:13 +0000
Subject: Implement mad24() and mul24() builtins

Reviewed-by: Aaron Watry <awatry@gmail.com>

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@185839 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h             |  2 ++
 generic/include/clc/integer/mad24.h   |  3 +++
 generic/include/clc/integer/mad24.inc |  1 +
 generic/include/clc/integer/mul24.h   |  3 +++
 generic/include/clc/integer/mul24.inc |  1 +
 generic/lib/SOURCES                   |  2 ++
 generic/lib/integer/mad24.cl          |  4 ++++
 generic/lib/integer/mad24.inc         |  3 +++
 generic/lib/integer/mul24.cl          |  4 ++++
 generic/lib/integer/mul24.inc         | 11 +++++++++++
 10 files changed, 34 insertions(+)
 create mode 100644 generic/include/clc/integer/mad24.h
 create mode 100644 generic/include/clc/integer/mad24.inc
 create mode 100644 generic/include/clc/integer/mul24.h
 create mode 100644 generic/include/clc/integer/mul24.inc
 create mode 100644 generic/lib/integer/mad24.cl
 create mode 100644 generic/lib/integer/mad24.inc
 create mode 100644 generic/lib/integer/mul24.cl
 create mode 100644 generic/lib/integer/mul24.inc

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 1ce97ad..dfdf747 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -64,6 +64,8 @@
 #include <clc/integer/abs_diff.h>
 #include <clc/integer/add_sat.h>
 #include <clc/integer/clz.h>
+#include <clc/integer/mad24.h>
+#include <clc/integer/mul24.h>
 #include <clc/integer/rotate.h>
 #include <clc/integer/sub_sat.h>
 
diff --git a/generic/include/clc/integer/mad24.h b/generic/include/clc/integer/mad24.h
new file mode 100644
index 0000000..0c120fa
--- /dev/null
+++ b/generic/include/clc/integer/mad24.h
@@ -0,0 +1,3 @@
+#define __CLC_BODY <clc/integer/mad24.inc>
+#include <clc/integer/integer-gentype.inc>
+#undef __CLC_BODY
diff --git a/generic/include/clc/integer/mad24.inc b/generic/include/clc/integer/mad24.inc
new file mode 100644
index 0000000..81fe0c2
--- /dev/null
+++ b/generic/include/clc/integer/mad24.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mad24(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z);
diff --git a/generic/include/clc/integer/mul24.h b/generic/include/clc/integer/mul24.h
new file mode 100644
index 0000000..4f97098
--- /dev/null
+++ b/generic/include/clc/integer/mul24.h
@@ -0,0 +1,3 @@
+#define __CLC_BODY <clc/integer/mul24.inc>
+#include <clc/integer/integer-gentype.inc>
+#undef __CLC_BODY
diff --git a/generic/include/clc/integer/mul24.inc b/generic/include/clc/integer/mul24.inc
new file mode 100644
index 0000000..8cbf7c1
--- /dev/null
+++ b/generic/include/clc/integer/mul24.inc
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mul24(__CLC_GENTYPE x, __CLC_GENTYPE y);
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 8cda14a..c2da3d7 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -11,6 +11,8 @@ integer/add_sat_impl.ll
 integer/clz.cl
 integer/clz_if.ll
 integer/clz_impl.ll
+integer/mad24.cl
+integer/mul24.cl
 integer/rotate.cl
 integer/sub_sat.cl
 integer/sub_sat_if.ll
diff --git a/generic/lib/integer/mad24.cl b/generic/lib/integer/mad24.cl
new file mode 100644
index 0000000..e29e99f
--- /dev/null
+++ b/generic/lib/integer/mad24.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+
+#define __CLC_BODY <mad24.inc>
+#include <clc/integer/integer-gentype.inc>
diff --git a/generic/lib/integer/mad24.inc b/generic/lib/integer/mad24.inc
new file mode 100644
index 0000000..902b0aa
--- /dev/null
+++ b/generic/lib/integer/mad24.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad24(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z){
+  return mul24(x, y) + z;
+}
diff --git a/generic/lib/integer/mul24.cl b/generic/lib/integer/mul24.cl
new file mode 100644
index 0000000..8aedca6
--- /dev/null
+++ b/generic/lib/integer/mul24.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+
+#define __CLC_BODY <mul24.inc>
+#include <clc/integer/integer-gentype.inc>
diff --git a/generic/lib/integer/mul24.inc b/generic/lib/integer/mul24.inc
new file mode 100644
index 0000000..95a2f1d
--- /dev/null
+++ b/generic/lib/integer/mul24.inc
@@ -0,0 +1,11 @@
+
+// We need to use shifts here in order to mantain the sign bit for signed
+// integers.  The compiler should optimize this to (x & 0x00FFFFFF) for
+// unsigned integers.
+#define CONVERT_TO_24BIT(x) (((x) << 8) >> 8)
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mul24(__CLC_GENTYPE x, __CLC_GENTYPE y){
+  return CONVERT_TO_24BIT(x) * CONVERT_TO_24BIT(y);
+}
+
+#undef CONVERT_TO_24BIT
-- 
cgit v1.2.3


From ce72a50d6f71a6273e60a8fce36fa41dea728cb2 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Mon, 15 Jul 2013 15:20:05 +0000
Subject: Add integer-gentype.inc: Missing file from r185839

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@186326 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/integer/integer-gentype.inc | 39 +++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 generic/include/clc/integer/integer-gentype.inc

diff --git a/generic/include/clc/integer/integer-gentype.inc b/generic/include/clc/integer/integer-gentype.inc
new file mode 100644
index 0000000..6470eb3
--- /dev/null
+++ b/generic/include/clc/integer/integer-gentype.inc
@@ -0,0 +1,39 @@
+#define __CLC_GENTYPE int
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+
+#define __CLC_GENTYPE int2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+
+#define __CLC_GENTYPE int4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+
+#define __CLC_GENTYPE int8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+
+#define __CLC_GENTYPE int16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+
+#define __CLC_GENTYPE uint
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+
+#define __CLC_GENTYPE uint2
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+
+#define __CLC_GENTYPE uint4
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+
+#define __CLC_GENTYPE uint8
+#include __CLC_BODY
+#undef __CLC_GENTYPE
+
+#define __CLC_GENTYPE uint16
+#include __CLC_BODY
+#undef __CLC_GENTYPE
-- 
cgit v1.2.3


From cfdac80e2cb66d091cf0b70cd0a0c1f258d14005 Mon Sep 17 00:00:00 2001
From: Aaron Watry <awatry@gmail.com>
Date: Tue, 16 Jul 2013 14:28:58 +0000
Subject: libclc: vload/vstore disable assembly and fix offset calculation

This commit gets us back to pure CLC and fixes offset calculations.

The next commit will re-enable the assembly implementation for R600,
fix bugs related to 64-bit address spaces, and also fix the
incorrect assumption that address space identifiers are the same in
all architectures.

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@186415 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/lib/SOURCES             |  2 --
 generic/lib/shared/vload.cl     | 64 ++++-----------------------------
 generic/lib/shared/vload_if.ll  | 60 -------------------------------
 generic/lib/shared/vstore.cl    | 80 +++++++----------------------------------
 generic/lib/shared/vstore_if.ll | 59 ------------------------------
 5 files changed, 20 insertions(+), 245 deletions(-)
 delete mode 100644 generic/lib/shared/vload_if.ll
 delete mode 100644 generic/lib/shared/vstore_if.ll

diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index c2da3d7..21a7eaa 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -26,10 +26,8 @@ shared/clamp.cl
 shared/max.cl
 shared/min.cl
 shared/vload.cl
-shared/vload_if.ll
 shared/vload_impl.ll
 shared/vstore.cl
-shared/vstore_if.ll
 shared/vstore_impl.ll
 workitem/get_global_id.cl
 workitem/get_global_size.cl
diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
index 4dd7918..6793072 100644
--- a/generic/lib/shared/vload.cl
+++ b/generic/lib/shared/vload.cl
@@ -2,23 +2,23 @@
 
 #define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
   _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return (PRIM_TYPE##2)(x[offset] , x[offset+1]); \
+    return (PRIM_TYPE##2)(x[2*offset] , x[2*offset+1]); \
   } \
 \
   _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return (PRIM_TYPE##3)(x[offset] , x[offset+1], x[offset+2]); \
+    return (PRIM_TYPE##3)(x[3*offset] , x[3*offset+1], x[3*offset+2]); \
   } \
 \
   _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return (PRIM_TYPE##4)(x[offset], x[offset+1], x[offset+2], x[offset+3]); \
+    return (PRIM_TYPE##4)(x[4*offset], x[4*offset+1], x[4*offset+2], x[4*offset+3]); \
   } \
 \
   _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return (PRIM_TYPE##8)(vload4(offset, x), vload4(offset+4, x)); \
+    return (PRIM_TYPE##8)(vload4(0, &x[8*offset]), vload4(1, &x[8*offset])); \
   } \
 \
   _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return (PRIM_TYPE##16)(vload8(offset, x), vload8(offset+8, x)); \
+    return (PRIM_TYPE##16)(vload8(0, &x[16*offset]), vload8(1, &x[16*offset])); \
   } \
 
 #define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
@@ -27,12 +27,13 @@
     VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
     VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \
 
-//int/uint are special... see below
 #define VLOAD_TYPES() \
     VLOAD_ADDR_SPACES(char) \
     VLOAD_ADDR_SPACES(uchar) \
     VLOAD_ADDR_SPACES(short) \
     VLOAD_ADDR_SPACES(ushort) \
+    VLOAD_ADDR_SPACES(int) \
+    VLOAD_ADDR_SPACES(uint) \
     VLOAD_ADDR_SPACES(long) \
     VLOAD_ADDR_SPACES(ulong) \
     VLOAD_ADDR_SPACES(float) \
@@ -43,54 +44,3 @@ VLOAD_TYPES()
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
     VLOAD_ADDR_SPACES(double)
 #endif
-
-VLOAD_VECTORIZE(int, __private)
-VLOAD_VECTORIZE(int, __local)
-VLOAD_VECTORIZE(int, __constant)
-VLOAD_VECTORIZE(uint, __private)
-VLOAD_VECTORIZE(uint, __local)
-VLOAD_VECTORIZE(uint, __constant)
-
-_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) {
-  return (int2)(x[offset] , x[offset+1]);
-}
-_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
-  return (int3)(vload2(offset, x), x[offset+2]);
-}
-_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) {
-  return (uint2)(x[offset] , x[offset+1]);
-}
-_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
-  return (uint3)(vload2(offset, x), x[offset+2]);
-}
-        
-/*Note: It is known that R600 doesn't support load <2 x ?> and <3 x ?>... so
- * they aren't actually overridden here
- */
-_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *);
-_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *);
-_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *);
-
-_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
-  return __clc_vload4_int__global(offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) {
-  return __clc_vload8_int__global(offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) {
-  return __clc_vload16_int__global(offset, x);
-}
-
-_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *);
-_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *);
-_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *);
-
-_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) {
-  return __clc_vload4_uint__global(offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
-  return __clc_vload8_uint__global(offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
-  return __clc_vload16_uint__global(offset, x);
-}
diff --git a/generic/lib/shared/vload_if.ll b/generic/lib/shared/vload_if.ll
deleted file mode 100644
index 2634d37..0000000
--- a/generic/lib/shared/vload_if.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-;Start int global vload
-
-declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
-declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
-declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
-declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
-declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
-
-define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
-  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
-  ret <2 x i32> %call
-}
-
-define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
-  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
-  ret <3 x i32> %call
-}
-
-define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
-  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
-  ret <4 x i32> %call
-}
-
-define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
-  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
-  ret <8 x i32> %call
-}
-
-define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
-  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
-  ret <16 x i32> %call
-}
-
-
-;Start uint global vload
-
-define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
-  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
-  ret <2 x i32> %call
-}
-
-define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
-  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
-  ret <3 x i32> %call
-}
-
-define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
-  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
-  ret <4 x i32> %call
-}
-
-define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
-  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
-  ret <8 x i32> %call
-}
-
-define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
-  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
-  ret <16 x i32> %call
-}
diff --git a/generic/lib/shared/vstore.cl b/generic/lib/shared/vstore.cl
index 17c2c4c..f6d360e 100644
--- a/generic/lib/shared/vstore.cl
+++ b/generic/lib/shared/vstore.cl
@@ -4,29 +4,29 @@
 
 #define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
   _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    mem[offset] = vec.s0; \
-    mem[offset+1] = vec.s1; \
+    mem[2*offset] = vec.s0; \
+    mem[2*offset+1] = vec.s1; \
   } \
 \
   _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    mem[offset] = vec.s0; \
-    mem[offset+1] = vec.s1; \
-    mem[offset+2] = vec.s2; \
+    mem[3*offset] = vec.s0; \
+    mem[3*offset+1] = vec.s1; \
+    mem[3*offset+2] = vec.s2; \
   } \
 \
   _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    vstore2(vec.lo, offset, mem); \
-    vstore2(vec.hi, offset+2, mem); \
+    vstore2(vec.lo, 0, &mem[offset*4]); \
+    vstore2(vec.hi, 1, &mem[offset*4]); \
   } \
 \
   _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    vstore4(vec.lo, offset, mem); \
-    vstore4(vec.hi, offset+4, mem); \
+    vstore4(vec.lo, 0, &mem[offset*8]); \
+    vstore4(vec.hi, 1, &mem[offset*8]); \
   } \
 \
   _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    vstore8(vec.lo, offset, mem); \
-    vstore8(vec.hi, offset+8, mem); \
+    vstore8(vec.lo, 0, &mem[offset*16]); \
+    vstore8(vec.hi, 1, &mem[offset*16]); \
   } \
 
 #define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
@@ -34,12 +34,13 @@
     VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
     VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \
 
-//int/uint are special... see below
 #define VSTORE_TYPES() \
     VSTORE_ADDR_SPACES(char) \
     VSTORE_ADDR_SPACES(uchar) \
     VSTORE_ADDR_SPACES(short) \
     VSTORE_ADDR_SPACES(ushort) \
+    VSTORE_ADDR_SPACES(int) \
+    VSTORE_ADDR_SPACES(uint) \
     VSTORE_ADDR_SPACES(long) \
     VSTORE_ADDR_SPACES(ulong) \
     VSTORE_ADDR_SPACES(float) \
@@ -50,58 +51,3 @@ VSTORE_TYPES()
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
     VSTORE_ADDR_SPACES(double)
 #endif
-
-VSTORE_VECTORIZE(int, __private)
-VSTORE_VECTORIZE(int, __local)
-VSTORE_VECTORIZE(uint, __private)
-VSTORE_VECTORIZE(uint, __local)
-
-_CLC_OVERLOAD _CLC_DEF void vstore2(int2 vec, size_t offset, global int *mem) {
-    mem[offset] = vec.s0;
-    mem[offset+1] = vec.s1;
-}
-_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) {
-    mem[offset] = vec.s0;
-    mem[offset+1] = vec.s1;
-    mem[offset+2] = vec.s2;
-}
-_CLC_OVERLOAD _CLC_DEF void vstore2(uint2 vec, size_t offset, global uint *mem) {
-    mem[offset] = vec.s0;
-    mem[offset+1] = vec.s1;
-}
-_CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem) {
-    mem[offset] = vec.s0;
-    mem[offset+1] = vec.s1;
-    mem[offset+2] = vec.s2;
-}
-
-/*Note: R600 probably doesn't support store <2 x ?> and <3 x ?>... so
- * they aren't actually overridden here... lowest-common-denominator
- */
-_CLC_DECL void __clc_vstore4_int__global(int4 vec, size_t offset, __global int *);
-_CLC_DECL void __clc_vstore8_int__global(int8 vec, size_t offset, __global int *);
-_CLC_DECL void __clc_vstore16_int__global(int16 vec, size_t offset, __global int *);
-
-_CLC_OVERLOAD _CLC_DEF void vstore4(int4 vec, size_t offset, global int *x) {
-    __clc_vstore4_int__global(vec, offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF void vstore8(int8 vec, size_t offset, global int *x) {
-    __clc_vstore8_int__global(vec, offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF void vstore16(int16 vec, size_t offset, global int *x) {
-    __clc_vstore16_int__global(vec, offset, x);
-}
-
-_CLC_DECL void __clc_vstore4_uint__global(uint4 vec, size_t offset, __global uint *);
-_CLC_DECL void __clc_vstore8_uint__global(uint8 vec, size_t offset, __global uint *);
-_CLC_DECL void __clc_vstore16_uint__global(uint16 vec, size_t offset, __global uint *);
-
-_CLC_OVERLOAD _CLC_DEF void vstore4(uint4 vec, size_t offset, global uint *x) {
-    __clc_vstore4_uint__global(vec, offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF void vstore8(uint8 vec, size_t offset, global uint *x) {
-    __clc_vstore8_uint__global(vec, offset, x);
-}
-_CLC_OVERLOAD _CLC_DEF void vstore16(uint16 vec, size_t offset, global uint *x) {
-    __clc_vstore16_uint__global(vec, offset, x);
-}
diff --git a/generic/lib/shared/vstore_if.ll b/generic/lib/shared/vstore_if.ll
deleted file mode 100644
index 30eb552..0000000
--- a/generic/lib/shared/vstore_if.ll
+++ /dev/null
@@ -1,59 +0,0 @@
-;Start int global vstore
-
-declare void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
-declare void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
-declare void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
-declare void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
-declare void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
-
-define void @__clc_vstore2_int__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
-  call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
-  ret void
-}
-
-define void @__clc_vstore3_int__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
-  call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
-  ret void
-}
-
-define void @__clc_vstore4_int__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
-  call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
-  ret void
-}
-
-define void @__clc_vstore8_int__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
-  call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
-  ret void
-}
-
-define void @__clc_vstore16_int__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
-  call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
-  ret void
-}
-
-
-;Start uint global vstore
-define void @__clc_vstore2_uint__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
-  call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
-  ret void
-}
-
-define void @__clc_vstore3_uint__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
-  call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
-  ret void
-}
-
-define void @__clc_vstore4_uint__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
-  call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
-  ret void
-}
-
-define void @__clc_vstore8_uint__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
-  call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
-  ret void
-}
-
-define void @__clc_vstore16_uint__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
-  call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
-  ret void
-}
\ No newline at end of file
-- 
cgit v1.2.3


From 2b80a46a5b9b0836e7a8cf4e6fbd85c332302398 Mon Sep 17 00:00:00 2001
From: Aaron Watry <awatry@gmail.com>
Date: Tue, 16 Jul 2013 14:29:01 +0000
Subject: Fix and re-enable R600 vload/vstore assembly

The assembly optimizations were making unsafe assumptions about which address
spaces had which identifiers.

Also, fix vload/vstore with 64-bit pointers. This was broken previously on
Radeon SI.

This version still only has assembly versions of int/uint 2/4/8/16 for global
loads and stores on R600, but it does it in a way that would be very easily
extended to private/local/constant and could also be handled easily on other
architectures.

v2: 1) Leave v[load|store]_impl.ll in generic/lib
    2) Remove vload_if.ll and vstore_if.ll interfaces
    3) Fix address+offset calculations
    4) Remove offset from assembly arg list

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@186416 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/lib/shared/vload_impl.ll  |  50 ++++++++----------
 generic/lib/shared/vstore_impl.ll |  41 ++++++---------
 r600/lib/SOURCES                  |   2 +
 r600/lib/shared/vload.cl          |  92 +++++++++++++++++++++++++++++++++
 r600/lib/shared/vstore.cl         | 104 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 233 insertions(+), 56 deletions(-)
 create mode 100644 r600/lib/shared/vload.cl
 create mode 100644 r600/lib/shared/vstore.cl

diff --git a/generic/lib/shared/vload_impl.ll b/generic/lib/shared/vload_impl.ll
index ae719e0..2e70e5f 100644
--- a/generic/lib/shared/vload_impl.ll
+++ b/generic/lib/shared/vload_impl.ll
@@ -1,43 +1,33 @@
 ; This provides optimized implementations of vload4/8/16 for 32-bit int/uint
 
-define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
-  %1 = ptrtoint i32 addrspace(1)* %addr to i32
-  %2 = add i32 %1, %offset
-  %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
-  %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
-  ret <2 x i32> %4
+define <2 x i32> @__clc_vload2_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
+  %1 = bitcast i32 addrspace(1)* %addr to <2 x i32> addrspace(1)*
+  %2 = load <2 x i32> addrspace(1)* %1, align 4, !tbaa !3
+  ret <2 x i32> %2
 }
 
-define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
-  %1 = ptrtoint i32 addrspace(1)* %addr to i32
-  %2 = add i32 %1, %offset
-  %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
-  %4 = load <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
-  ret <3 x i32> %4
+define <3 x i32> @__clc_vload3_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
+  %1 = bitcast i32 addrspace(1)* %addr to <3 x i32> addrspace(1)*
+  %2 = load <3 x i32> addrspace(1)* %1, align 4, !tbaa !3
+  ret <3 x i32> %2
 }
 
-define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
-  %1 = ptrtoint i32 addrspace(1)* %addr to i32
-  %2 = add i32 %1, %offset
-  %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
-  %4 = load <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
-  ret <4 x i32> %4
+define <4 x i32> @__clc_vload4_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
+  %1 = bitcast i32 addrspace(1)* %addr to <4 x i32> addrspace(1)*
+  %2 = load <4 x i32> addrspace(1)* %1, align 4, !tbaa !3
+  ret <4 x i32> %2
 }
 
-define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
-  %1 = ptrtoint i32 addrspace(1)* %addr to i32
-  %2 = add i32 %1, %offset
-  %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
-  %4 = load <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
-  ret <8 x i32> %4
+define <8 x i32> @__clc_vload8_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
+  %1 = bitcast i32 addrspace(1)* %addr to <8 x i32> addrspace(1)*
+  %2 = load <8 x i32> addrspace(1)* %1, align 4, !tbaa !3
+  ret <8 x i32> %2
 }
 
-define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
-  %1 = ptrtoint i32 addrspace(1)* %addr to i32
-  %2 = add i32 %1, %offset
-  %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
-  %4 = load <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
-  ret <16 x i32> %4
+define <16 x i32> @__clc_vload16_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
+  %1 = bitcast i32 addrspace(1)* %addr to <16 x i32> addrspace(1)*
+  %2 = load <16 x i32> addrspace(1)* %1, align 4, !tbaa !3
+  ret <16 x i32> %2
 }
 
 !1 = metadata !{metadata !"char", metadata !5}
diff --git a/generic/lib/shared/vstore_impl.ll b/generic/lib/shared/vstore_impl.ll
index 3baab5e..388bce2 100644
--- a/generic/lib/shared/vstore_impl.ll
+++ b/generic/lib/shared/vstore_impl.ll
@@ -1,46 +1,35 @@
 ; This provides optimized implementations of vstore4/8/16 for 32-bit int/uint
 
-define void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
-  %1 = ptrtoint i32 addrspace(1)* %addr to i32
-  %2 = add i32 %1, %offset
-  %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
-  store <2 x i32> %vec, <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
+define void @__clc_vstore2_i32__addr1(<2 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
+  %1 = bitcast i32 addrspace(1)* %addr to <2 x i32> addrspace(1)*
+  store <2 x i32> %vec, <2 x i32> addrspace(1)* %1, align 4, !tbaa !3
   ret void
 }
 
-define void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
-  %1 = ptrtoint i32 addrspace(1)* %addr to i32
-  %2 = add i32 %1, %offset
-  %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
-  store <3 x i32> %vec, <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
+define void @__clc_vstore3_i32__addr1(<3 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
+  %1 = bitcast i32 addrspace(1)* %addr to <3 x i32> addrspace(1)*
+  store <3 x i32> %vec, <3 x i32> addrspace(1)* %1, align 4, !tbaa !3
   ret void
 }
 
-define void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
-  %1 = ptrtoint i32 addrspace(1)* %addr to i32
-  %2 = add i32 %1, %offset
-  %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
-  store <4 x i32> %vec, <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
+define void @__clc_vstore4_i32__addr1(<4 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
+  %1 = bitcast i32 addrspace(1)* %addr to <4 x i32> addrspace(1)*
+  store <4 x i32> %vec, <4 x i32> addrspace(1)* %1, align 4, !tbaa !3
   ret void
 }
 
-define void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
-  %1 = ptrtoint i32 addrspace(1)* %addr to i32
-  %2 = add i32 %1, %offset
-  %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
-  store <8 x i32> %vec, <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
+define void @__clc_vstore8_i32__addr1(<8 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
+  %1 = bitcast i32 addrspace(1)* %addr to <8 x i32> addrspace(1)*
+  store <8 x i32> %vec, <8 x i32> addrspace(1)* %1, align 4, !tbaa !3
   ret void
 }
 
-define void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
-  %1 = ptrtoint i32 addrspace(1)* %addr to i32
-  %2 = add i32 %1, %offset
-  %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
-  store <16 x i32> %vec, <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
+define void @__clc_vstore16_i32__addr1(<16 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
+  %1 = bitcast i32 addrspace(1)* %addr to <16 x i32> addrspace(1)*
+  store <16 x i32> %vec, <16 x i32> addrspace(1)* %1, align 4, !tbaa !3
   ret void
 }
 
-
 !1 = metadata !{metadata !"char", metadata !5}
 !2 = metadata !{metadata !"short", metadata !5}
 !3 = metadata !{metadata !"int", metadata !5}
diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
index 16ef3ac..87df0b7 100644
--- a/r600/lib/SOURCES
+++ b/r600/lib/SOURCES
@@ -4,3 +4,5 @@ workitem/get_local_id.ll
 workitem/get_global_size.ll
 synchronization/barrier.cl
 synchronization/barrier_impl.ll
+shared/vload.cl
+shared/vstore.cl
\ No newline at end of file
diff --git a/r600/lib/shared/vload.cl b/r600/lib/shared/vload.cl
new file mode 100644
index 0000000..6144dde
--- /dev/null
+++ b/r600/lib/shared/vload.cl
@@ -0,0 +1,92 @@
+#include <clc/clc.h>
+
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return (PRIM_TYPE##2)(x[2*offset] , x[2*offset+1]); \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return (PRIM_TYPE##3)(x[3*offset] , x[3*offset+1], x[3*offset+2]); \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return (PRIM_TYPE##4)(x[4*offset], x[4*offset+1], x[4*offset+2], x[4*offset+3]); \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return (PRIM_TYPE##8)(vload4(0, &x[8*offset]), vload4(1, &x[8*offset])); \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return (PRIM_TYPE##16)(vload8(0, &x[16*offset]), vload8(1, &x[16*offset])); \
+  } \
+
+#define VLOAD_ADDR_SPACES(SCALAR_GENTYPE) \
+    VLOAD_VECTORIZE(SCALAR_GENTYPE, __private) \
+    VLOAD_VECTORIZE(SCALAR_GENTYPE, __local) \
+    VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
+    VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
+
+//int/uint are special... see below
+#define VLOAD_TYPES() \
+    VLOAD_ADDR_SPACES(char) \
+    VLOAD_ADDR_SPACES(uchar) \
+    VLOAD_ADDR_SPACES(short) \
+    VLOAD_ADDR_SPACES(ushort) \
+    VLOAD_ADDR_SPACES(long) \
+    VLOAD_ADDR_SPACES(ulong) \
+    VLOAD_ADDR_SPACES(float) \
+
+VLOAD_TYPES()
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    VLOAD_ADDR_SPACES(double)
+#endif
+
+//Assembly overrides start here
+
+VLOAD_VECTORIZE(int, __private)
+VLOAD_VECTORIZE(int, __local)
+VLOAD_VECTORIZE(int, __constant)
+VLOAD_VECTORIZE(uint, __private)
+VLOAD_VECTORIZE(uint, __local)
+VLOAD_VECTORIZE(uint, __constant)
+
+_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
+  return (int3)(vload2(0, &x[3*offset]), x[3*offset+2]);
+}
+_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
+  return (uint3)(vload2(0, &x[3*offset]), x[3*offset+2]);
+}
+
+//We only define functions for typeN vloadN(), and then just bitcast the result for unsigned types
+#define _CLC_VLOAD_ASM_DECL(PRIM_TYPE,LLVM_SCALAR_TYPE,ADDR_SPACE,ADDR_SPACE_ID) \
+_CLC_DECL PRIM_TYPE##2 __clc_vload2_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (const ADDR_SPACE PRIM_TYPE *); \
+_CLC_DECL PRIM_TYPE##4 __clc_vload4_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (const ADDR_SPACE PRIM_TYPE *); \
+_CLC_DECL PRIM_TYPE##8 __clc_vload8_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (const ADDR_SPACE PRIM_TYPE *); \
+_CLC_DECL PRIM_TYPE##16 __clc_vload16_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (const ADDR_SPACE PRIM_TYPE *); \
+
+#define _CLC_VLOAD_ASM_DEFINE(PRIM_TYPE,S_PRIM_TYPE, LLVM_SCALAR_TYPE,VEC_WIDTH,ADDR_SPACE,ADDR_SPACE_ID) \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##VEC_WIDTH vload##VEC_WIDTH (size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return __builtin_astype(__clc_vload##VEC_WIDTH##_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID ((const ADDR_SPACE S_PRIM_TYPE *)&x[VEC_WIDTH * offset]), PRIM_TYPE##VEC_WIDTH); \
+  } \
+
+/*Note: R600 back-end doesn't support load <3 x ?>... so
+ * those functions aren't actually overridden here
+ */
+#define _CLC_VLOAD_ASM_OVERLOAD_SIZES(PRIM_TYPE,S_PRIM_TYPE,LLVM_TYPE,ADDR_SPACE,ADDR_SPACE_ID) \
+  _CLC_VLOAD_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 2, ADDR_SPACE, ADDR_SPACE_ID) \
+  _CLC_VLOAD_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 4, ADDR_SPACE, ADDR_SPACE_ID) \
+  _CLC_VLOAD_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 8, ADDR_SPACE, ADDR_SPACE_ID) \
+  _CLC_VLOAD_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 16, ADDR_SPACE, ADDR_SPACE_ID) \
+
+#define _CLC_VLOAD_ASM_OVERLOAD_ADDR_SPACES(PRIM_TYPE,S_PRIM_TYPE,LLVM_TYPE) \
+  _CLC_VLOAD_ASM_OVERLOAD_SIZES(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, global, 1) \
+
+#define _CLC_VLOAD_ASM_OVERLOADS() \
+  _CLC_VLOAD_ASM_DECL(int,i32,__global,1) \
+  _CLC_VLOAD_ASM_OVERLOAD_ADDR_SPACES(int,int,i32) \
+  _CLC_VLOAD_ASM_OVERLOAD_ADDR_SPACES(uint,int,i32) \
+
+_CLC_VLOAD_ASM_OVERLOADS()
\ No newline at end of file
diff --git a/r600/lib/shared/vstore.cl b/r600/lib/shared/vstore.cl
new file mode 100644
index 0000000..a150849
--- /dev/null
+++ b/r600/lib/shared/vstore.cl
@@ -0,0 +1,104 @@
+#include <clc/clc.h>
+
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+
+#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+    mem[2*offset] = vec.s0; \
+    mem[2*offset+1] = vec.s1; \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+    mem[3*offset] = vec.s0; \
+    mem[3*offset+1] = vec.s1; \
+    mem[3*offset+2] = vec.s2; \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+    vstore2(vec.lo, 0, &mem[offset*4]); \
+    vstore2(vec.hi, 1, &mem[offset*4]); \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+    vstore4(vec.lo, 0, &mem[offset*8]); \
+    vstore4(vec.hi, 1, &mem[offset*8]); \
+  } \
+\
+  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+    vstore8(vec.lo, 0, &mem[offset*16]); \
+    vstore8(vec.hi, 1, &mem[offset*16]); \
+  } \
+
+#define VSTORE_ADDR_SPACES(SCALAR_GENTYPE) \
+    VSTORE_VECTORIZE(SCALAR_GENTYPE, __private) \
+    VSTORE_VECTORIZE(SCALAR_GENTYPE, __local) \
+    VSTORE_VECTORIZE(SCALAR_GENTYPE, __global) \
+
+//int/uint are special... see below
+#define VSTORE_TYPES() \
+    VSTORE_ADDR_SPACES(char) \
+    VSTORE_ADDR_SPACES(uchar) \
+    VSTORE_ADDR_SPACES(short) \
+    VSTORE_ADDR_SPACES(ushort) \
+    VSTORE_ADDR_SPACES(long) \
+    VSTORE_ADDR_SPACES(ulong) \
+    VSTORE_ADDR_SPACES(float) \
+
+VSTORE_TYPES()
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    VSTORE_ADDR_SPACES(double)
+#endif
+
+VSTORE_VECTORIZE(int, __private)
+VSTORE_VECTORIZE(int, __local)
+VSTORE_VECTORIZE(uint, __private)
+VSTORE_VECTORIZE(uint, __local)
+
+_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) {
+    mem[3*offset] = vec.s0;
+    mem[3*offset+1] = vec.s1;
+    mem[3*offset+2] = vec.s2;
+}
+_CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem) {
+    mem[3*offset] = vec.s0;
+    mem[3*offset+1] = vec.s1;
+    mem[3*offset+2] = vec.s2;
+}
+
+/*Note: R600 doesn't support store <3 x ?>... so
+ * those functions aren't actually overridden here... lowest-common-denominator
+ */
+
+//We only define functions for signed_type vstoreN(), and then just cast the pointers/vectors for unsigned types
+#define _CLC_VSTORE_ASM_DECL(PRIM_TYPE,LLVM_SCALAR_TYPE,ADDR_SPACE,ADDR_SPACE_ID) \
+_CLC_DECL void __clc_vstore2_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (PRIM_TYPE##2, ADDR_SPACE PRIM_TYPE *); \
+_CLC_DECL void __clc_vstore4_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (PRIM_TYPE##4, ADDR_SPACE PRIM_TYPE *); \
+_CLC_DECL void __clc_vstore8_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (PRIM_TYPE##8, ADDR_SPACE PRIM_TYPE *); \
+_CLC_DECL void __clc_vstore16_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (PRIM_TYPE##16, ADDR_SPACE PRIM_TYPE *); \
+
+#define _CLC_VSTORE_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_SCALAR_TYPE, VEC_WIDTH, ADDR_SPACE, ADDR_SPACE_ID) \
+  _CLC_OVERLOAD _CLC_DEF void vstore##VEC_WIDTH(PRIM_TYPE##VEC_WIDTH vec, size_t offset, ADDR_SPACE PRIM_TYPE *x) { \
+    __clc_vstore##VEC_WIDTH##_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (__builtin_astype(vec, S_PRIM_TYPE##VEC_WIDTH), (ADDR_SPACE S_PRIM_TYPE *)&x[ VEC_WIDTH * offset]); \
+  } \
+
+/*Note: R600 back-end doesn't support store <3 x ?>... so
+ * those functions aren't actually overridden here... When the back-end supports
+ * that, then add them here, and remove the vstore3 definitions from above.
+ */
+#define _CLC_VSTORE_ASM_OVERLOAD_SIZES(PRIM_TYPE,S_PRIM_TYPE,LLVM_TYPE,ADDR_SPACE,ADDR_SPACE_ID) \
+  _CLC_VSTORE_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 2, ADDR_SPACE, ADDR_SPACE_ID) \
+  _CLC_VSTORE_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 4, ADDR_SPACE, ADDR_SPACE_ID) \
+  _CLC_VSTORE_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 8, ADDR_SPACE, ADDR_SPACE_ID) \
+  _CLC_VSTORE_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 16, ADDR_SPACE, ADDR_SPACE_ID) \
+
+#define _CLC_VSTORE_ASM_OVERLOAD_ADDR_SPACES(PRIM_TYPE,S_PRIM_TYPE,LLVM_TYPE) \
+  _CLC_VSTORE_ASM_OVERLOAD_SIZES(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, global, 1) \
+
+#define _CLC_VSTORE_ASM_OVERLOADS() \
+  _CLC_VSTORE_ASM_DECL(int,i32,__global,1) \
+  _CLC_VSTORE_ASM_OVERLOAD_ADDR_SPACES(int,int,i32) \
+  _CLC_VSTORE_ASM_OVERLOAD_ADDR_SPACES(uint,int,i32) \
+
+_CLC_VSTORE_ASM_OVERLOADS()
\ No newline at end of file
-- 
cgit v1.2.3


From 2e8fa9fcabb2af14f720cfbdc93b54c050b84e66 Mon Sep 17 00:00:00 2001
From: Aaron Watry <awatry@gmail.com>
Date: Thu, 18 Jul 2013 21:24:35 +0000
Subject: Fix build with LLVM 3.4

F_Binary and friends were moved to include/Support/FileSystem.h

v2: Maintain compatibility with LLVM 3.3

Signed-off-by: Aaron Watry <awatry@gmail.com>

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@186610 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/prepare-builtins.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/utils/prepare-builtins.cpp b/utils/prepare-builtins.cpp
index be1624b..4ad21e8 100644
--- a/utils/prepare-builtins.cpp
+++ b/utils/prepare-builtins.cpp
@@ -10,6 +10,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/system_error.h"
 #include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Config/config.h"
 
 using namespace llvm;
 
@@ -66,7 +67,11 @@ int main(int argc, char **argv) {
   std::string ErrorInfo;
   OwningPtr<tool_output_file> Out
   (new tool_output_file(OutputFilename.c_str(), ErrorInfo,
+#if LLVM_VERSION_MAJOR > 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR > 3)
+                        sys::fs::F_Binary));
+#else
                         raw_fd_ostream::F_Binary));
+#endif
   if (!ErrorInfo.empty()) {
     errs() << ErrorInfo << '\n';
     exit(1);
-- 
cgit v1.2.3


From 1489907d7e02ecba7a9b57e3dd6236c4246a921c Mon Sep 17 00:00:00 2001
From: Aaron Watry <awatry@gmail.com>
Date: Fri, 19 Jul 2013 16:44:37 +0000
Subject: Implement generic upsample()

Reduces all vector upsamples down to their scalar components, so probably
not the most efficient thing in the world, but it does what the
spec says it needs to do.

Another possible implementation would be to convert/cast everything as
unsigned if necessary, upsample the input vectors, create the upsampled
value, and then cast back to signed if required.

Signed-off-by: Aaron Watry <awatry@gmail.com>
Reviewed-by: Tom Stellard <thomas.stellard at amd.com>

git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@186691 91177308-0d34-0410-b5e6-96231b3b80d8
---
 generic/include/clc/clc.h              |  1 +
 generic/include/clc/integer/upsample.h | 25 +++++++++++++++++++++++++
 generic/lib/SOURCES                    |  1 +
 generic/lib/integer/upsample.cl        | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 61 insertions(+)
 create mode 100644 generic/include/clc/integer/upsample.h
 create mode 100644 generic/lib/integer/upsample.cl

diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index dfdf747..9a2f443 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -68,6 +68,7 @@
 #include <clc/integer/mul24.h>
 #include <clc/integer/rotate.h>
 #include <clc/integer/sub_sat.h>
+#include <clc/integer/upsample.h>
 
 /* 6.11.2 and 6.11.3 Shared Integer/Math Functions */
 #include <clc/shared/clamp.h>
diff --git a/generic/include/clc/integer/upsample.h b/generic/include/clc/integer/upsample.h
new file mode 100644
index 0000000..127debf
--- /dev/null
+++ b/generic/include/clc/integer/upsample.h
@@ -0,0 +1,25 @@
+#define __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE) \
+    _CLC_OVERLOAD _CLC_DECL BGENTYPE upsample(GENTYPE hi, UGENTYPE lo);
+
+#define __CLC_UPSAMPLE_VEC(BGENTYPE, GENTYPE, UGENTYPE) \
+    __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE); \
+    __CLC_UPSAMPLE_DECL(BGENTYPE##2, GENTYPE##2, UGENTYPE##2); \
+    __CLC_UPSAMPLE_DECL(BGENTYPE##3, GENTYPE##3, UGENTYPE##3); \
+    __CLC_UPSAMPLE_DECL(BGENTYPE##4, GENTYPE##4, UGENTYPE##4); \
+    __CLC_UPSAMPLE_DECL(BGENTYPE##8, GENTYPE##8, UGENTYPE##8); \
+    __CLC_UPSAMPLE_DECL(BGENTYPE##16, GENTYPE##16, UGENTYPE##16); \
+
+#define __CLC_UPSAMPLE_TYPES() \
+    __CLC_UPSAMPLE_VEC(short, char, uchar) \
+    __CLC_UPSAMPLE_VEC(ushort, uchar, uchar) \
+    __CLC_UPSAMPLE_VEC(int, short, ushort) \
+    __CLC_UPSAMPLE_VEC(uint, ushort, ushort) \
+    __CLC_UPSAMPLE_VEC(long, int, uint) \
+    __CLC_UPSAMPLE_VEC(ulong, uint, uint) \
+
+__CLC_UPSAMPLE_TYPES()
+
+#undef __CLC_UPSAMPLE_TYPES
+#undef __CLC_UPSAMPLE_DECL
+#undef __CLC_UPSAMPLE_VEC
+
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 21a7eaa..9ac08bd 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -17,6 +17,7 @@ integer/rotate.cl
 integer/sub_sat.cl
 integer/sub_sat_if.ll
 integer/sub_sat_impl.ll
+integer/upsample.cl
 math/fmax.cl
 math/fmin.cl
 math/hypot.cl
diff --git a/generic/lib/integer/upsample.cl b/generic/lib/integer/upsample.cl
new file mode 100644
index 0000000..7301cc3
--- /dev/null
+++ b/generic/lib/integer/upsample.cl
@@ -0,0 +1,34 @@
+#include <clc/clc.h>
+
+#define __CLC_UPSAMPLE_IMPL(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE) /* defines upsample() for one type triple, scalar and vector widths 2/3/4/8/16 */ \
+    _CLC_OVERLOAD _CLC_DEF BGENTYPE upsample(GENTYPE hi, UGENTYPE lo){ /* scalar: widen hi, shift it up GENSIZE bits, OR in lo */ \
+        return ((BGENTYPE)hi << GENSIZE) | lo; \
+    } \
+    _CLC_OVERLOAD _CLC_DEF BGENTYPE##2 upsample(GENTYPE##2 hi, UGENTYPE##2 lo){ /* per-component via the scalar overload */ \
+        return (BGENTYPE##2){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1)}; \
+    } \
+    _CLC_OVERLOAD _CLC_DEF BGENTYPE##3 upsample(GENTYPE##3 hi, UGENTYPE##3 lo){ /* per-component via the scalar overload */ \
+        return (BGENTYPE##3){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1), upsample(hi.s2, lo.s2)}; \
+    } \
+    _CLC_OVERLOAD _CLC_DEF BGENTYPE##4 upsample(GENTYPE##4 hi, UGENTYPE##4 lo){ /* split into halves, reuse the 2-wide overload */ \
+        return (BGENTYPE##4){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \
+    } \
+    _CLC_OVERLOAD _CLC_DEF BGENTYPE##8 upsample(GENTYPE##8 hi, UGENTYPE##8 lo){ /* split into halves, reuse the 4-wide overload */ \
+        return (BGENTYPE##8){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \
+    } \
+    _CLC_OVERLOAD _CLC_DEF BGENTYPE##16 upsample(GENTYPE##16 hi, UGENTYPE##16 lo){ /* split into halves, reuse the 8-wide overload */ \
+        return (BGENTYPE##16){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \
+    } \
+
+#define __CLC_UPSAMPLE_TYPES() \
+    __CLC_UPSAMPLE_IMPL(short, char, uchar, 8) \
+    __CLC_UPSAMPLE_IMPL(ushort, uchar, uchar, 8) \
+    __CLC_UPSAMPLE_IMPL(int, short, ushort, 16) \
+    __CLC_UPSAMPLE_IMPL(uint, ushort, ushort, 16) \
+    __CLC_UPSAMPLE_IMPL(long, int, uint, 32) \
+    __CLC_UPSAMPLE_IMPL(ulong, uint, uint, 32) \
+
+__CLC_UPSAMPLE_TYPES()
+
+#undef __CLC_UPSAMPLE_TYPES
+#undef __CLC_UPSAMPLE_IMPL
-- 
cgit v1.2.3