/*
 * Copyright © 2012 Intel Corporation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
 *
 * Author: Benjamin Segovia
 */

#include "cl_command_queue.h"
#include "cl_context.h"
#include "cl_program.h"
#include "cl_kernel.h"
#include "cl_device_id.h"
#include "cl_mem.h"
#include "cl_utils.h"
#include "cl_alloc.h"

#include <assert.h>
#include <stdio.h>
#include <string.h>

static INLINE size_t
cl_kernel_compute_batch_sz(cl_kernel k)
{
  return 256 + 32;
}

/* The "varying" payload is the part of the curbe that changes across threads in
 * the same work group. Right now, it consists of the local IDs and the block IPs.
 */
static cl_int
cl_set_varying_payload(const cl_kernel ker,
                       char *data,
                       const size_t *local_wk_sz,
                       size_t simd_sz,
                       size_t cst_sz,
                       size_t thread_n)
{
  uint32_t *ids[3] = {NULL, NULL, NULL};
  uint16_t *block_ips = NULL;
  size_t i, j, k, curr = 0;
  int32_t id_offset[3], ip_offset;
  cl_int err = CL_SUCCESS;

  id_offset[0] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 0);
  id_offset[1] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Y, 0);
  id_offset[2] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Z, 0);
  ip_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_BLOCK_IP, 0);
  assert(id_offset[0] >= 0 &&
         id_offset[1] >= 0 &&
         id_offset[2] >= 0 &&
         ip_offset >= 0);

  TRY_ALLOC(ids[0], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
  TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
  TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
  TRY_ALLOC(block_ips, (uint16_t*) alloca(sizeof(uint16_t)*thread_n*simd_sz));

  /* 0xffff means that the lane is inactive */
  memset(block_ips, 0xff, sizeof(uint16_t)*thread_n*simd_sz);

  /* Compute the IDs and the block IPs */
  for (k = 0; k < local_wk_sz[2]; ++k)
    for (j = 0; j < local_wk_sz[1]; ++j)
      for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
        ids[0][curr] = i;
        ids[1][curr] = j;
        ids[2][curr] = k;
        block_ips[curr] = 0;
      }

  /* Copy them to the curbe buffer */
  curr = 0;
  for (i = 0; i < thread_n; ++i, data += cst_sz) {
    uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
    uint32_t *ids1 = (uint32_t *) (data + id_offset[1]);
    uint32_t *ids2 = (uint32_t *) (data + id_offset[2]);
    uint16_t *ips  = (uint16_t *) (data + ip_offset);
    for (j = 0; j < simd_sz; ++j, ++curr) {
      ids0[j] = ids[0][curr];
      ids1[j] = ids[1][curr];
      ids2[j] = ids[2][curr];
      ips[j] = block_ips[curr];
    }
  }

error:
  return err;
}
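
/* Gather the data of every __constant pointer argument (plus the program-level
 * constant arrays, if any) into a single constant buffer object, then patch each
 * such argument's curbe slot with the byte offset of its data inside that buffer. */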
static void
cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
{
  /* Calculate the constant buffer size */
  int32_t arg;
  size_t offset;
  gbe_program prog = ker->program->opaque;
  const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
  size_t global_const_size = gbe_program_get_global_constant_size(prog);
  uint32_t constant_buf_size = 0;
  for (arg = 0; arg < arg_n; ++arg) {
    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
      cl_mem mem = ker->args[arg].mem;
      constant_buf_size += ALIGN(mem->size, 4);
    }
  }
  if (global_const_size == 0 && constant_buf_size == 0)
    return;

  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu,
                                                constant_buf_size + global_const_size + 4);
  cl_buffer_map(bo, 1);
  char *cst_addr = cl_buffer_get_virtual(bo);
  offset = 0;
  if (global_const_size > 0) {
    /* Write the global constant arrays */
    gbe_program_get_global_constant_data(prog, (char*)(cst_addr + offset));
  }
  offset += ALIGN(global_const_size, 4);

  if (global_const_size == 0) {
    /* Reserve 4 bytes to get rid of the 0 address */
    offset += 4;
  }

  /* Upload the constant buffer arguments */
  int32_t curbe_offset = 0;
  for (arg = 0; arg < arg_n; ++arg) {
    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
      cl_mem mem = ker->args[arg].mem;
      curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
      assert(curbe_offset >= 0);
      *(uint32_t *) (ker->curbe + curbe_offset) = offset;

      cl_buffer_map(mem->bo, 1);
      void *addr = cl_buffer_get_virtual(mem->bo);
      memcpy(cst_addr + offset, addr, mem->size);
      cl_buffer_unmap(mem->bo);
      offset += ALIGN(mem->size, 4);
    }
  }
  cl_buffer_unmap(bo);
}

/* Returns the total amount of SLM used */
static int32_t
cl_curbe_fill(cl_kernel ker,
              const uint32_t work_dim,
              const size_t *global_wk_off,
              const size_t *global_wk_sz,
              const size_t *local_wk_sz,
              size_t thread_n)
{
  int32_t offset;
#define UPLOAD(ENUM, VALUE) \
  if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, ENUM, 0)) >= 0) \
    *((uint32_t *) (ker->curbe + offset)) = VALUE;
  UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]);
  UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]);
  UPLOAD(GBE_CURBE_LOCAL_SIZE_Z, local_wk_sz[2]);
  UPLOAD(GBE_CURBE_GLOBAL_SIZE_X, global_wk_sz[0]);
  UPLOAD(GBE_CURBE_GLOBAL_SIZE_Y, global_wk_sz[1]);
  UPLOAD(GBE_CURBE_GLOBAL_SIZE_Z, global_wk_sz[2]);
  UPLOAD(GBE_CURBE_GLOBAL_OFFSET_X, global_wk_off[0]);
  UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Y, global_wk_off[1]);
  UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Z, global_wk_off[2]);
  UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0]/local_wk_sz[0]);
  UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
  UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
  UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
  UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
#undef UPLOAD

  /* Upload sampler information. */
  offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_SAMPLER_INFO, 0);
  if (offset >= 0) {
    uint32_t i;
    for (i = 0; i < ker->sampler_sz; i++, offset += 2) {
      *((uint16_t *) (ker->curbe + offset)) = ker->samplers[i] & 0xFF;
    }
  }

  /* Write identity for the stack pointer. This is required by the stack pointer
   * computation in the kernel */
  if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_POINTER, 0)) >= 0) {
    const uint32_t simd_sz = gbe_kernel_get_simd_width(ker->opaque);
    uint32_t *stackptr = (uint32_t *) (ker->curbe + offset);
    int32_t i;
    for (i = 0; i < (int32_t) simd_sz; ++i)
      stackptr[i] = i;
  }

  /* Handle the various offsets to SLM */
  const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
  /* Align so that the kernel arguments get a good alignment */
  int32_t arg, slm_offset = ALIGN(gbe_kernel_get_slm_size(ker->opaque), 32);
  for (arg = 0; arg < arg_n; ++arg) {
    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
    if (type != GBE_ARG_LOCAL_PTR)
      continue;
    offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
    assert(offset >= 0);
    uint32_t *slmptr = (uint32_t *) (ker->curbe + offset);
    *slmptr = slm_offset;
    slm_offset += ker->args[arg].local_sz;
  }
  return slm_offset;
}
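
/* Bind a stack buffer when the kernel needs one. gbe reports the stack size per
 * SIMD lane, so the total allocation scales with both the SIMD width and the
 * number of compute units. For illustration only (hypothetical numbers): a 1 KB
 * per-lane stack at SIMD16 on a device reporting 16 compute units would request
 * 1 KB * 16 * 16 = 256 KB. */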
static void
cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
{
  cl_context ctx = ker->program->ctx;
  cl_device_id device = ctx->device;
  const int32_t per_lane_stack_sz = gbe_kernel_get_stack_size(ker->opaque);
  const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
  const int32_t sub_value = GBE_STACK_BUFFER;
  const int32_t offset = gbe_kernel_get_curbe_offset(ker->opaque, value, sub_value);
  int32_t stack_sz = per_lane_stack_sz;

  /* No stack required for this kernel */
  if (per_lane_stack_sz == 0)
    return;

  /* The stack size is given for *each* SIMD lane. So, we accordingly compute
   * the size we need for the complete machine */
  assert(offset >= 0);
  stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
  stack_sz *= device->max_compute_unit;
  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
}

static void
cl_setup_scratch(cl_gpgpu gpgpu, cl_kernel ker)
{
  int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque);
  cl_gpgpu_set_scratch(gpgpu, scratch_sz);
}
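
/* Gen7 ND-range enqueue path. Roughly:
 *   1. compute the number of HW threads from the work group size and SIMD width,
 *   2. fill the constant part of the curbe (cl_curbe_fill) and check SLM usage,
 *   3. bind surfaces, images, samplers, scratch, stack and the constant buffer,
 *   4. replicate the curbe per HW thread, add the varying payload
 *      (cl_set_varying_payload) and upload it,
 *   5. emit GPGPU_WALKER in a fresh batch buffer and submit it. */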
LOCAL cl_int
cl_command_queue_ND_range_gen7(cl_command_queue queue,
                               cl_kernel ker,
                               const uint32_t work_dim,
                               const size_t *global_wk_off,
                               const size_t *global_wk_sz,
                               const size_t *local_wk_sz)
{
  cl_context ctx = queue->ctx;
  cl_gpgpu gpgpu = queue->gpgpu;
  char *final_curbe = NULL; /* One curbe copy per HW thread, with the varying payload */
  cl_gpgpu_kernel kernel;
  const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
  size_t i, batch_sz = 0u, local_sz = 0u;
  size_t cst_sz = ker->curbe_sz = gbe_kernel_get_curbe_size(ker->opaque);
  size_t thread_n = 0u;
  cl_int err = CL_SUCCESS;

  /* Setup kernel */
  kernel.name = "KERNEL";
  kernel.grf_blocks = 128;
  kernel.bo = ker->bo;
  kernel.barrierID = 0;
  kernel.slm_sz = 0;
  kernel.use_slm = gbe_kernel_use_slm(ker->opaque);

  /* Compute the number of HW threads we need */
  TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
  kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
  kernel.curbe_sz = cst_sz;

  /* Curbe step 1: fill the constant urb buffer data shared by all threads */
  if (ker->curbe) {
    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
    if (kernel.slm_sz > ker->program->ctx->device->local_mem_size)
      return CL_OUT_OF_RESOURCES;
  }

  /* Setup the kernel */
  if (queue->props & CL_QUEUE_PROFILING_ENABLE)
    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 1);
  else
    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 0);

  /* Bind user buffers */
  cl_command_queue_bind_surface(queue, ker);
  /* Bind user images */
  cl_command_queue_bind_image(queue, ker);
  /* Bind all samplers */
  cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);

  cl_setup_scratch(gpgpu, ker);
  /* Bind a stack if needed */
  cl_bind_stack(gpgpu, ker);
  cl_upload_constant_buffer(queue, ker);

  cl_gpgpu_states_setup(gpgpu, &kernel);

  /* Curbe step 2. Give the localID and upload it to video memory */
  if (ker->curbe) {
    assert(cst_sz > 0);
    TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
    for (i = 0; i < thread_n; ++i) {
      memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
    }
    TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
    cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz);
  }

  /* Start a new batch buffer */
  batch_sz = cl_kernel_compute_batch_sz(ker);
  cl_gpgpu_batch_reset(gpgpu, batch_sz);
  cl_gpgpu_batch_start(gpgpu);

  /* Issue the GPGPU_WALKER command */
  cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);

  /* Close the batch buffer and submit it */
  cl_gpgpu_batch_end(gpgpu, 0);

error:
  return err;
}