summaryrefslogtreecommitdiff
path: root/utests/vload_bench.cpp
blob: 44c1dba7471af0b221fc951985889c8273dc2b2b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#include "utest_helper.hpp"
#include <sys/time.h>

// Multiplier applied to the per-launch byte count when vload_bench() computes
// bandwidth in benchmark mode.
// NOTE(review): presumably mirrors the iteration count baked into the
// "vload_bench_10000*" kernel names — confirm against vload_bench.cl.
#define N_ITERATIONS 10000

/**
 * Builds the kernel named `<kernelFunc><N>` from vload_bench.cl, launches it
 * once over a freshly initialised input buffer, and then either times the
 * launch (benchmark mode) or checks the output against the input (test mode).
 *
 * @tparam T         Host-side element type of the input buffer.
 * @param kernelFunc Kernel name prefix; the decimal value of N is appended.
 * @param N          Vector size. Selects the kernel and sizes the global
 *                   work range.
 * @param offset     Element offset, passed to the kernel as argument 2.
 * @param benchMode  true  - 512K input elements, time the launch;
 *                   false - 8K input elements, verify the output.
 * @return The measured bandwidth (the value printed with the "GB/S" suffix)
 *         in benchmark mode, 0 in test mode.
 */
template <typename T>
static double vload_bench(const char *kernelFunc, uint32_t N, uint32_t offset, bool benchMode)
{
  const size_t n = benchMode ? (512 * 1024) : (8 * 1024);
  struct timeval start, end;

  // Setup kernel and buffers
  std::string kernelName = kernelFunc + std::to_string((long long unsigned int)N);
  OCL_CALL (cl_kernel_init, "vload_bench.cl", kernelName.c_str(), SOURCE, NULL);

  // Stage the input pattern in a temporary host allocation; it is only needed
  // until it has been copied into buf[0] via CL_MEM_COPY_HOST_PTR.
  buf_data[0] = (T*) malloc(sizeof(T) * n);
  OCL_ASSERT(buf_data[0] != NULL);
  // Element i holds i converted to T (wraps / rounds for narrow types).
  for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[0])[i] = i; //rand() & ((1LL << N) - 1);
  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]);
  // The output buffer always holds uint32_t values, whatever T is.
  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
  free(buf_data[0]);
  buf_data[0] = NULL;

  // Run the kernel
  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
  OCL_SET_ARG(2, sizeof(uint32_t), &offset);
  // (N + 1) & ~0x1 rounds N up to the next even value (3 -> 4), so odd vector
  // sizes get the same work-item count as the next even size.
  globals[0] = n / ((N + 1) & ~0x1);
  locals[0] = 256;
  if (benchMode)
    gettimeofday(&start, NULL);
  OCL_NDRANGE(1);
  if (benchMode) {
    OCL_FINISH();
    gettimeofday(&end, NULL);
    // Elapsed wall-clock time in microseconds.
    double elapsed = (end.tv_sec - start.tv_sec) * 1e6 + (end.tv_usec - start.tv_usec);
    // bytes / (microseconds * 1000) == bytes per nanosecond == GB/s.
    double bandwidth = (globals[0] * (N_ITERATIONS) * sizeof(T) * N) / (elapsed * 1000.);
    printf("\t%2.1fGB/S\n", bandwidth);
    return bandwidth;
  } else {
    // Check result: output element i must equal input element i + offset,
    // converted to uint32_t.
    OCL_MAP_BUFFER(0);
    OCL_MAP_BUFFER(1);
    for (uint32_t i = 0; i < globals[0]; ++i) {
      OCL_ASSERT((uint32_t)(((T*)buf_data[0])[i + offset]) == ((uint32_t*)buf_data[1])[i]);
    }
    return 0;
  }
}

// VLOAD_TEST(T, kT) defines the function `vload_test_<kT>` and hands it to
// MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM.
//   T  - host element type used to instantiate vload_bench<T>.
//   kT - type-name fragment pasted into the function name and stringized into
//        the kernel name prefix "vload_bench_1<kT>".
// The generated function calls vload_bench in test mode (benchMode == false)
// for every vector size in {2, 3, 4, 8, 16} and every offset in
// [0, vector size); the returned value is ignored.
#define VLOAD_TEST(T, kT) \
static void vload_test_ ##kT(void) \
{ \
  const uint8_t vectorSize[] = {2, 3, 4, 8, 16}; \
  const uint32_t vectorSizeCount = sizeof(vectorSize) / sizeof(vectorSize[0]); \
  for(uint32_t i = 0; i < vectorSizeCount; i++) { \
    for(uint32_t offset = 0; offset < vectorSize[i]; offset++) {\
      (void)vload_bench<T>("vload_bench_1" #kT, vectorSize[i], offset, false); \
    }\
  } \
}\
MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(vload_test_ ##kT, true)

// Correctness tests, compiled only when BUILD_BENCHMARK is not defined.
// Each line instantiates VLOAD_TEST for one host element type; the second
// argument is the type-name fragment used in the test and kernel names.
#ifndef BUILD_BENCHMARK
VLOAD_TEST(uint8_t, uchar)
VLOAD_TEST(int8_t, char)
VLOAD_TEST(uint16_t, ushort)
VLOAD_TEST(int16_t, short)
VLOAD_TEST(uint32_t, uint)
VLOAD_TEST(int32_t, int)
VLOAD_TEST(float, float)
#endif

// VLOAD_BENCH(T, kT) defines the function `vload_bench_<kT>` and hands it to
// MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM with the unit string "GB/S".
//   T  - host element type used to instantiate vload_bench<T>.
//   kT - type-name fragment pasted into the function name and stringized into
//        the kernel name prefix "vload_bench_10000<kT>".
// The generated function calls vload_bench in benchmark mode for every vector
// size in {2, 3, 4, 8, 16} and every offset in [0, vector size). It averages
// the returned bandwidths over the offsets of each size, then returns the mean
// of those per-size averages.
#define VLOAD_BENCH(T, kT) \
static double vload_bench_ ##kT(void) \
{ \
  const uint8_t vectorSize[] = {2, 3, 4, 8, 16}; \
  const uint32_t vectorSizeCount = sizeof(vectorSize) / sizeof(vectorSize[0]); \
  double totBandwidth = 0; \
  printf("\n");\
  for(uint32_t i = 0; i < vectorSizeCount; i++) { \
    printf("  Vector size %d:\n", vectorSize[i]); \
    double bandwidthForOneSize = 0;\
    for(uint32_t offset = 0; offset < vectorSize[i]; offset++) {\
      printf("\tOffset %u :", offset); \
      bandwidthForOneSize += vload_bench<T>("vload_bench_10000"  #kT, vectorSize[i], offset, true); \
    }\
    totBandwidth += bandwidthForOneSize / vectorSize[i];\
  } \
  return totBandwidth / vectorSizeCount;\
}\
MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(vload_bench_ ##kT, true, "GB/S")

// Benchmarks, compiled only when BUILD_BENCHMARK is defined.
// Only the unsigned 8-, 16- and 32-bit element types are instantiated here.
#ifdef BUILD_BENCHMARK
VLOAD_BENCH(uint8_t, uchar)
VLOAD_BENCH(uint16_t, ushort)
VLOAD_BENCH(uint32_t, uint)
#endif