/**************************************************************************
 *
 * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include <search.h>
#include <glib.h>

#include "gpgpu_fill.h"
#include "huc_copy.h"
#include "i915/gem_create.h"
#include "i915/gem_mman.h"
#include "intel_blt.h"
#include "igt_aux.h"
#include "igt_syncobj.h"
#include "intel_batchbuffer.h"
#include "intel_bufops.h"
#include "intel_chipset.h"
#include "intel_pat.h"
#include "media_fill.h"
#include "media_spin.h"
#include "sw_sync.h"
#include "veboxcopy.h"
#include "xe/xe_ioctl.h"
#include "xe/xe_query.h"

#define BCS_SWCTRL 0x22200
#define BCS_SRC_Y (1 << 0)
#define BCS_DST_Y (1 << 1)

/**
 * SECTION:intel_batchbuffer
 * @short_description: Batchbuffer and blitter support
 * @title: Batch Buffer
 * @include: igt.h
 *
 * Note that this library's header pulls in the [i-g-t core](igt-gpu-tools-i-g-t-core.html)
 * library as a dependency.
 */

static bool intel_bb_do_tracking;
static IGT_LIST_HEAD(intel_bb_list);
static pthread_mutex_t intel_bb_list_lock = PTHREAD_MUTEX_INITIALIZER;

#define CMD_POLY_STIPPLE_OFFSET       0x7906

#define CHECK_RANGE(x) do { \
	igt_assert_lte(0, (x)); \
	igt_assert_lt((x), (1 << 15)); \
} while (0)

/*
 * pitches are in bytes if the surfaces are linear, number of dwords
 * otherwise
 */
static uint32_t fast_copy_pitch(unsigned int stride, unsigned int tiling)
{
	if (tiling != I915_TILING_NONE)
		return stride / 4;
	else
		return stride;
}

uint32_t fast_copy_dword0(unsigned int src_tiling,
			  unsigned int dst_tiling)
{
	uint32_t dword0 = 0;

	dword0 |= XY_FAST_COPY_BLT;

	switch (src_tiling) {
	case I915_TILING_X:
		dword0 |= XY_FAST_COPY_SRC_TILING_X;
		break;
	case I915_TILING_Y:
	case I915_TILING_4:
	case I915_TILING_Yf:
		dword0 |= XY_FAST_COPY_SRC_TILING_Yb_Yf;
		break;
	case I915_TILING_Ys:
		dword0 |= XY_FAST_COPY_SRC_TILING_Ys;
		break;
	case I915_TILING_NONE:
	default:
		break;
	}

	switch (dst_tiling) {
	case I915_TILING_X:
		dword0 |= XY_FAST_COPY_DST_TILING_X;
		break;
	case I915_TILING_Y:
	case I915_TILING_4:
	case I915_TILING_Yf:
		dword0 |= XY_FAST_COPY_DST_TILING_Yb_Yf;
		break;
	case I915_TILING_Ys:
		dword0 |= XY_FAST_COPY_DST_TILING_Ys;
		break;
	case I915_TILING_NONE:
	default:
		break;
	}

	return dword0;
}

static bool new_tile_y_format(unsigned int tiling)
{
	return tiling == T_YFMAJOR || tiling == T_TILE4;
}

uint32_t fast_copy_dword1(int fd, unsigned int src_tiling,
			  unsigned int dst_tiling,
			  int bpp)
{
	uint32_t dword1 = 0;

	if (blt_fast_copy_supports_tiling(fd, T_YMAJOR)) {
		dword1 |= new_tile_y_format(src_tiling)
				? XY_FAST_COPY_SRC_TILING_Yf : 0;
		dword1 |= new_tile_y_format(dst_tiling)
				? XY_FAST_COPY_DST_TILING_Yf : 0;
	} else {
		/* Always set bits for platforms that don't support legacy TileY */
		dword1 |= XY_FAST_COPY_SRC_TILING_Yf | XY_FAST_COPY_DST_TILING_Yf;
	}

	switch (bpp) {
	case 8:
		dword1 |= XY_FAST_COPY_COLOR_DEPTH_8;
		break;
	case 16:
		dword1 |= XY_FAST_COPY_COLOR_DEPTH_16;
		break;
	case 32:
		dword1 |= XY_FAST_COPY_COLOR_DEPTH_32;
		break;
	case 64:
		dword1 |= XY_FAST_COPY_COLOR_DEPTH_64;
		break;
	case 128:
		dword1 |= XY_FAST_COPY_COLOR_DEPTH_128;
		break;
	default:
		igt_assert(0);
	}

	return dword1;
}

static void
fill_relocation(struct drm_i915_gem_relocation_entry *reloc,
		uint32_t gem_handle, uint64_t presumed_offset,
		uint32_t delta, /* in bytes */
		uint32_t offset, /* in dwords */
		uint32_t read_domains, uint32_t write_domains)
{
	reloc->target_handle = gem_handle;
	reloc->delta = delta;
	reloc->offset = offset * sizeof(uint32_t);
	reloc->presumed_offset = presumed_offset;
	reloc->read_domains = read_domains;
	reloc->write_domain = write_domains;
}

static void
fill_object(struct drm_i915_gem_exec_object2 *obj,
	    uint32_t gem_handle, uint64_t gem_offset,
	    struct drm_i915_gem_relocation_entry *relocs, uint32_t count)
{
	memset(obj, 0, sizeof(*obj));
	obj->handle = gem_handle;
	obj->offset = gem_offset;
	obj->relocation_count = count;
	obj->relocs_ptr = to_user_pointer(relocs);
}

static uint32_t find_engine(const intel_ctx_cfg_t *cfg, unsigned int class)
{
	unsigned int i;
	uint32_t engine_id = -1;

	for (i = 0; i < cfg->num_engines; i++) {
		if (cfg->engines[i].engine_class == class)
			engine_id = i;
	}

	igt_assert_f(engine_id != -1, "Requested engine not found!\n");

	return engine_id;
}

static void exec_blit(int fd,
		      struct drm_i915_gem_exec_object2 *objs,
		      uint32_t count, uint32_t ctx,
		      const intel_ctx_cfg_t *cfg)
{
	struct drm_i915_gem_execbuffer2 exec;
	uint32_t devid = intel_get_drm_devid(fd);
	uint32_t blt_id = HAS_BLT_RING(devid) ? I915_EXEC_BLT : I915_EXEC_DEFAULT;

	if (cfg)
		blt_id = find_engine(cfg, I915_ENGINE_CLASS_COPY);

	exec = (struct drm_i915_gem_execbuffer2) {
		.buffers_ptr = to_user_pointer(objs),
		.buffer_count = count,
		.flags = blt_id | I915_EXEC_NO_RELOC,
		.rsvd1 = ctx,
	};

	gem_execbuf(fd, &exec);
}

static uint32_t src_copy_dword0(uint32_t src_tiling, uint32_t dst_tiling,
				uint32_t bpp, uint32_t device_gen)
{
	uint32_t dword0 = 0;

	dword0 |= XY_SRC_COPY_BLT_CMD;
	if (bpp == 32)
		dword0 |= XY_SRC_COPY_BLT_WRITE_RGB |
			XY_SRC_COPY_BLT_WRITE_ALPHA;

	if (device_gen >= 4 && src_tiling)
		dword0 |= XY_SRC_COPY_BLT_SRC_TILED;
	if (device_gen >= 4 && dst_tiling)
		dword0 |= XY_SRC_COPY_BLT_DST_TILED;

	return dword0;
}

static uint32_t src_copy_dword1(uint32_t dst_pitch, uint32_t bpp)
{
	uint32_t dword1 = 0;

	switch (bpp) {
	case 8:
		break;
	case 16:
		dword1 |= 1 << 24; /* Only support 565 color */
		break;
	case 32:
		dword1 |= 3 << 24;
		break;
	default:
		igt_assert(0);
	}

	dword1 |= 0xcc << 16;
	dword1 |= dst_pitch;

	return dword1;
}

/**
 * igt_blitter_copy:
 * @fd: file descriptor of the i915 driver
 * @ahnd: handle to an allocator
 * @ctx: context within which execute copy blit
 * @src_handle: GEM handle of the source buffer
 * @src_delta: offset into the source GEM bo, in bytes
 * @src_stride: Stride (in bytes) of the source buffer
 * @src_tiling: Tiling mode of the source buffer
 * @src_x: X coordinate of the source region to copy
 * @src_y: Y coordinate of the source region to copy
 * @src_size: size of the src bo required for allocator and softpin
 * @width: Width of the region to copy
 * @height: Height of the region to copy
 * @bpp: source and destination bits per pixel
 * @dst_handle: GEM handle of the destination buffer
 * @dst_delta: offset into the destination GEM bo, in bytes
 * @dst_stride: Stride (in bytes) of the destination buffer
 * @dst_tiling: Tiling mode of the destination buffer
 * @dst_x: X coordinate of destination
 * @dst_y: Y coordinate of destination
 * @dst_size: size of the dst bo required for allocator and softpin
 *
 * Wrapper API to call appropriate blitter copy function.
 */

void igt_blitter_copy(int fd,
		      uint64_t ahnd,
		      uint32_t ctx,
		      const intel_ctx_cfg_t *cfg,
		      /* src */
		      uint32_t src_handle,
		      uint32_t src_delta,
		      uint32_t src_stride,
		      uint32_t src_tiling,
		      uint32_t src_x, uint32_t src_y,
		      uint64_t src_size,
		      /* size */
		      uint32_t width, uint32_t height,
		      /* bpp */
		      uint32_t bpp,
		      /* dst */
		      uint32_t dst_handle,
		      uint32_t dst_delta,
		      uint32_t dst_stride,
		      uint32_t dst_tiling,
		      uint32_t dst_x, uint32_t dst_y,
		      uint64_t dst_size)
{
	uint32_t devid;

	devid = intel_get_drm_devid(fd);

	if (intel_graphics_ver(devid) >= IP_VER(12, 60))
		igt_blitter_fast_copy__raw(fd, ahnd, ctx, NULL,
					   src_handle, src_delta,
					   src_stride, src_tiling,
					   src_x, src_y, src_size,
					   width, height, bpp,
					   dst_handle, dst_delta,
					   dst_stride, dst_tiling,
					   dst_x, dst_y, dst_size);
	else
		igt_blitter_src_copy(fd, ahnd, ctx, NULL,
				     src_handle, src_delta,
				     src_stride, src_tiling,
				     src_x, src_y, src_size,
				     width, height, bpp,
				     dst_handle, dst_delta,
				     dst_stride, dst_tiling,
				     dst_x, dst_y, dst_size);
}
/**
 * igt_blitter_src_copy:
 * @fd: file descriptor of the i915 driver
 * @ahnd: handle to an allocator
 * @ctx: context within which execute copy blit
 * @cfg: intel_ctx configuration, NULL for default context or legacy mode
 * @src_handle: GEM handle of the source buffer
 * @src_delta: offset into the source GEM bo, in bytes
 * @src_stride: Stride (in bytes) of the source buffer
 * @src_tiling: Tiling mode of the source buffer
 * @src_x: X coordinate of the source region to copy
 * @src_y: Y coordinate of the source region to copy
 * @src_size: size of the src bo required for allocator and softpin
 * @width: Width of the region to copy
 * @height: Height of the region to copy
 * @bpp: source and destination bits per pixel
 * @dst_handle: GEM handle of the destination buffer
 * @dst_delta: offset into the destination GEM bo, in bytes
 * @dst_stride: Stride (in bytes) of the destination buffer
 * @dst_tiling: Tiling mode of the destination buffer
 * @dst_x: X coordinate of destination
 * @dst_y: Y coordinate of destination
 * @dst_size: size of the dst bo required for allocator and softpin
 *
 * Copy @src into @dst using the XY_SRC blit command.
 */
void igt_blitter_src_copy(int fd,
			  uint64_t ahnd,
			  uint32_t ctx,
			  const intel_ctx_cfg_t *cfg,
			  /* src */
			  uint32_t src_handle,
			  uint32_t src_delta,
			  uint32_t src_stride,
			  uint32_t src_tiling,
			  uint32_t src_x, uint32_t src_y,
			  uint64_t src_size,

			  /* size */
			  uint32_t width, uint32_t height,

			  /* bpp */
			  uint32_t bpp,

			  /* dst */
			  uint32_t dst_handle,
			  uint32_t dst_delta,
			  uint32_t dst_stride,
			  uint32_t dst_tiling,
			  uint32_t dst_x, uint32_t dst_y,
			  uint64_t dst_size)
{
	uint32_t batch[32];
	struct drm_i915_gem_exec_object2 objs[3];
	struct drm_i915_gem_relocation_entry relocs[2];
	uint32_t batch_handle;
	uint32_t src_pitch, dst_pitch;
	uint32_t dst_reloc_offset, src_reloc_offset;
	uint32_t gen = intel_gen(intel_get_drm_devid(fd));
	uint64_t batch_offset, src_offset, dst_offset;
	const bool has_64b_reloc = gen >= 8;
	int i = 0;

	batch_handle = gem_create(fd, 4096);
	if (ahnd) {
		src_offset = get_offset(ahnd, src_handle, src_size, 0);
		dst_offset = get_offset(ahnd, dst_handle, dst_size, 0);
		batch_offset = get_offset(ahnd, batch_handle, 4096, 0);
	} else {
		src_offset = 16 << 20;
		dst_offset = ALIGN(src_offset + src_size, 1 << 20);
		batch_offset = ALIGN(dst_offset + dst_size, 1 << 20);
	}

	memset(batch, 0, sizeof(batch));

	igt_assert((src_tiling == I915_TILING_NONE) ||
		   (src_tiling == I915_TILING_X) ||
		   (src_tiling == I915_TILING_Y));
	igt_assert((dst_tiling == I915_TILING_NONE) ||
		   (dst_tiling == I915_TILING_X) ||
		   (dst_tiling == I915_TILING_Y));

	src_pitch = (gen >= 4 && src_tiling) ? src_stride / 4 : src_stride;
	dst_pitch = (gen >= 4 && dst_tiling) ? dst_stride / 4 : dst_stride;

	if (bpp == 64) {
		bpp /= 2;
		width *= 2;
	}

	CHECK_RANGE(src_x); CHECK_RANGE(src_y);
	CHECK_RANGE(dst_x); CHECK_RANGE(dst_y);
	CHECK_RANGE(width); CHECK_RANGE(height);
	CHECK_RANGE(src_x + width); CHECK_RANGE(src_y + height);
	CHECK_RANGE(dst_x + width); CHECK_RANGE(dst_y + height);
	CHECK_RANGE(src_pitch); CHECK_RANGE(dst_pitch);

	if ((src_tiling | dst_tiling) >= I915_TILING_Y) {
		unsigned int mask;

		batch[i++] = MI_LOAD_REGISTER_IMM(1);
		batch[i++] = BCS_SWCTRL;

		mask = (BCS_SRC_Y | BCS_DST_Y) << 16;
		if (src_tiling == I915_TILING_Y)
			mask |= BCS_SRC_Y;
		if (dst_tiling == I915_TILING_Y)
			mask |= BCS_DST_Y;
		batch[i++] = mask;
	}

	batch[i] = src_copy_dword0(src_tiling, dst_tiling, bpp, gen);
	batch[i++] |= 6 + 2 * has_64b_reloc;
	batch[i++] = src_copy_dword1(dst_pitch, bpp);
	batch[i++] = (dst_y << 16) | dst_x; /* dst x1,y1 */
	batch[i++] = ((dst_y + height) << 16) | (dst_x + width); /* dst x2,y2 */
	dst_reloc_offset = i;
	batch[i++] = dst_offset + dst_delta; /* dst address lower bits */
	if (has_64b_reloc)
		batch[i++] = (dst_offset + dst_delta) >> 32; /* dst address upper bits */
	batch[i++] = (src_y << 16) | src_x; /* src x1,y1 */
	batch[i++] = src_pitch;
	src_reloc_offset = i;
	batch[i++] = src_offset + src_delta; /* src address lower bits */
	if (has_64b_reloc)
		batch[i++] = (src_offset + src_delta) >> 32; /* src address upper bits */

	if ((src_tiling | dst_tiling) >= I915_TILING_Y) {
		igt_assert(gen >= 6);
		batch[i++] = MI_FLUSH_DW_CMD | 2;
		batch[i++] = 0;
		batch[i++] = 0;
		batch[i++] = 0;

		batch[i++] = MI_LOAD_REGISTER_IMM(1);
		batch[i++] = BCS_SWCTRL;
		batch[i++] = (BCS_SRC_Y | BCS_DST_Y) << 16;
	}

	batch[i++] = MI_BATCH_BUFFER_END;
	batch[i++] = MI_NOOP;

	igt_assert(i <= ARRAY_SIZE(batch));

	gem_write(fd, batch_handle, 0, batch, sizeof(batch));

	fill_relocation(&relocs[0], dst_handle, dst_offset,
			dst_delta, dst_reloc_offset,
			I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
	fill_relocation(&relocs[1], src_handle, src_offset,
			src_delta, src_reloc_offset,
			I915_GEM_DOMAIN_RENDER, 0);

	fill_object(&objs[0], dst_handle, dst_offset, NULL, 0);
	fill_object(&objs[1], src_handle, src_offset, NULL, 0);
	fill_object(&objs[2], batch_handle, batch_offset, relocs, !ahnd ? 2 : 0);

	objs[0].flags |= EXEC_OBJECT_NEEDS_FENCE | EXEC_OBJECT_WRITE;
	objs[1].flags |= EXEC_OBJECT_NEEDS_FENCE;

	if (ahnd) {
		objs[0].flags |= EXEC_OBJECT_PINNED | EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
		objs[1].flags |= EXEC_OBJECT_PINNED | EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
		objs[2].flags |= EXEC_OBJECT_PINNED | EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
	}

	exec_blit(fd, objs, 3, ctx, cfg);

	gem_close(fd, batch_handle);
}

/**
 * igt_blitter_fast_copy__raw:
 * @fd: file descriptor of the i915 driver
 * @ahnd: handle to an allocator
 * @ctx: context within which execute copy blit
 * @cfg: intel_ctx configuration, NULL for default context or legacy mode
 * @src_handle: GEM handle of the source buffer
 * @src_delta: offset into the source GEM bo, in bytes
 * @src_stride: Stride (in bytes) of the source buffer
 * @src_tiling: Tiling mode of the source buffer
 * @src_x: X coordinate of the source region to copy
 * @src_y: Y coordinate of the source region to copy
 * @src_size: size of the src bo required for allocator and softpin
 * @width: Width of the region to copy
 * @height: Height of the region to copy
 * @bpp: source and destination bits per pixel
 * @dst_handle: GEM handle of the destination buffer
 * @dst_delta: offset into the destination GEM bo, in bytes
 * @dst_stride: Stride (in bytes) of the destination buffer
 * @dst_tiling: Tiling mode of the destination buffer
 * @dst_x: X coordinate of destination
 * @dst_y: Y coordinate of destination
 * @dst_size: size of the dst bo required for allocator and softpin
 *
 * Like igt_blitter_fast_copy(), but talking to the kernel directly.
 */
void igt_blitter_fast_copy__raw(int fd,
				uint64_t ahnd,
				uint32_t ctx,
				const intel_ctx_cfg_t *cfg,
				/* src */
				uint32_t src_handle,
				unsigned int src_delta,
				unsigned int src_stride,
				unsigned int src_tiling,
				unsigned int src_x, unsigned src_y,
				uint64_t src_size,

				/* size */
				unsigned int width, unsigned int height,

				/* bpp */
				int bpp,

				/* dst */
				uint32_t dst_handle,
				unsigned dst_delta,
				unsigned int dst_stride,
				unsigned int dst_tiling,
				unsigned int dst_x, unsigned dst_y,
				uint64_t dst_size)
{
	uint32_t batch[12];
	struct drm_i915_gem_exec_object2 objs[3];
	struct drm_i915_gem_relocation_entry relocs[2];
	uint32_t batch_handle;
	uint32_t dword0, dword1;
	uint32_t src_pitch, dst_pitch;
	uint64_t batch_offset, src_offset, dst_offset;
	int i = 0;

	batch_handle = gem_create(fd, 4096);
	if (ahnd) {
		src_offset = get_offset(ahnd, src_handle, src_size, 0);
		dst_offset = get_offset(ahnd, dst_handle, dst_size, 0);
		batch_offset = get_offset(ahnd, batch_handle, 4096, 0);
	} else {
		src_offset = 16 << 20;
		dst_offset = ALIGN(src_offset + src_size, 1 << 20);
		batch_offset = ALIGN(dst_offset + dst_size, 1 << 20);
	}

	src_pitch = fast_copy_pitch(src_stride, src_tiling);
	dst_pitch = fast_copy_pitch(dst_stride, dst_tiling);
	dword0 = fast_copy_dword0(src_tiling, dst_tiling);
	dword1 = fast_copy_dword1(fd, src_tiling, dst_tiling, bpp);

	CHECK_RANGE(src_x); CHECK_RANGE(src_y);
	CHECK_RANGE(dst_x); CHECK_RANGE(dst_y);
	CHECK_RANGE(width); CHECK_RANGE(height);
	CHECK_RANGE(src_x + width); CHECK_RANGE(src_y + height);
	CHECK_RANGE(dst_x + width); CHECK_RANGE(dst_y + height);
	CHECK_RANGE(src_pitch); CHECK_RANGE(dst_pitch);

	batch[i++] = dword0;
	batch[i++] = dword1 | dst_pitch;
	batch[i++] = (dst_y << 16) | dst_x; /* dst x1,y1 */
	batch[i++] = ((dst_y + height) << 16) | (dst_x + width); /* dst x2,y2 */
	batch[i++] = dst_offset + dst_delta; /* dst address lower bits */
	batch[i++] = (dst_offset + dst_delta) >> 32; /* dst address upper bits */
	batch[i++] = (src_y << 16) | src_x; /* src x1,y1 */
	batch[i++] = src_pitch;
	batch[i++] = src_offset + src_delta; /* src address lower bits */
	batch[i++] = (src_offset + src_delta) >> 32; /* src address upper bits */
	batch[i++] = MI_BATCH_BUFFER_END;
	batch[i++] = MI_NOOP;

	igt_assert(i == ARRAY_SIZE(batch));

	gem_write(fd, batch_handle, 0, batch, sizeof(batch));

	fill_relocation(&relocs[0], dst_handle, dst_offset, dst_delta, 4,
			I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
	fill_relocation(&relocs[1], src_handle, src_offset, src_delta, 8,
			I915_GEM_DOMAIN_RENDER, 0);

	fill_object(&objs[0], dst_handle, dst_offset, NULL, 0);
	objs[0].flags |= EXEC_OBJECT_WRITE;
	fill_object(&objs[1], src_handle, src_offset, NULL, 0);
	fill_object(&objs[2], batch_handle, batch_offset, relocs, !ahnd ? 2 : 0);

	if (ahnd) {
		objs[0].flags |= EXEC_OBJECT_PINNED;
		objs[1].flags |= EXEC_OBJECT_PINNED;
		objs[2].flags |= EXEC_OBJECT_PINNED;
	}

	exec_blit(fd, objs, 3, ctx, cfg);

	gem_close(fd, batch_handle);
}

/**
 * igt_get_render_copyfunc:
 * @devid: pci device id
 *
 * Returns:
 *
 * The platform-specific render copy function pointer for the device
 * specified with @devid. Will return NULL when no render copy function is
 * implemented.
 */
igt_render_copyfunc_t igt_get_render_copyfunc(int devid)
{
	igt_render_copyfunc_t copy = NULL;

	if (IS_METEORLAKE(devid))
		copy = mtl_render_copyfunc;
	else if (IS_DG2(devid))
		copy = gen12p71_render_copyfunc;
	else if (AT_LEAST_GEN(devid, 20))
		copy = xe2_render_copyfunc;
	else if (IS_GEN12(devid))
		copy = gen12_render_copyfunc;
	else if (IS_GEN11(devid))
		copy = gen11_render_copyfunc;
	else if (IS_GEN9(devid) || IS_GEN10(devid))
		copy = gen9_render_copyfunc;
	else if (IS_GEN8(devid))
		copy = gen8_render_copyfunc;
	else if (IS_GEN7(devid))
		copy = gen7_render_copyfunc;
	else if (IS_GEN6(devid))
		copy = gen6_render_copyfunc;
	else if (IS_GEN4(devid) || IS_GEN5(devid))
		copy = gen4_render_copyfunc;
	else if (IS_GEN3(devid))
		copy = gen3_render_copyfunc;
	else if (IS_GEN2(devid))
		copy = gen2_render_copyfunc;

	return copy;
}

igt_vebox_copyfunc_t igt_get_vebox_copyfunc(int devid)
{
	igt_vebox_copyfunc_t copy = NULL;

	if (IS_GEN12(devid))
		copy = gen12_vebox_copyfunc;

	return copy;
}

igt_render_clearfunc_t igt_get_render_clearfunc(int devid)
{
	if (IS_METEORLAKE(devid)) {
		return mtl_render_clearfunc;
	} else if (IS_DG2(devid)) {
		return gen12p71_render_clearfunc;
	} else if (IS_GEN12(devid)) {
		return gen12_render_clearfunc;
	} else {
		return NULL;
	}
}

/**
 * igt_get_media_fillfunc:
 * @devid: pci device id
 *
 * Returns:
 *
 * The platform-specific media fill function pointer for the device specified
 * with @devid. Will return NULL when no media fill function is implemented.
 */
igt_fillfunc_t igt_get_media_fillfunc(int devid)
{
	igt_fillfunc_t fill = NULL;

	if (intel_graphics_ver(devid) >= IP_VER(12, 50)) {
		/* current implementation defeatured PIPELINE_MEDIA */
	} else if (IS_GEN12(devid))
		fill = gen12_media_fillfunc;
	else if (IS_GEN9(devid) || IS_GEN10(devid) || IS_GEN11(devid))
		fill = gen9_media_fillfunc;
	else if (IS_GEN8(devid))
		fill = gen8_media_fillfunc;
	else if (IS_GEN7(devid))
		fill = gen7_media_fillfunc;

	return fill;
}

igt_vme_func_t igt_get_media_vme_func(int devid)
{
	igt_vme_func_t fill = NULL;
	const struct intel_device_info *devinfo = intel_get_device_info(devid);

	if (IS_GEN11(devid) && !devinfo->is_elkhartlake && !devinfo->is_jasperlake)
		fill = gen11_media_vme_func;

	return fill;
}

/**
 * igt_get_gpgpu_fillfunc:
 * @devid: pci device id
 *
 * Returns:
 *
 * The platform-specific gpgpu fill function pointer for the device specified
 * with @devid. Will return NULL when no gpgpu fill function is implemented.
 */
igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid)
{
	igt_fillfunc_t fill = NULL;

	if (intel_graphics_ver(devid) >= IP_VER(20, 0))
		fill = xe2lpg_gpgpu_fillfunc;
	else if (IS_METEORLAKE(devid))
		fill = xehp_gpgpu_fillfunc;
	else if (intel_graphics_ver(devid) >= IP_VER(12, 60))
		fill = xehpc_gpgpu_fillfunc;
	else if (intel_graphics_ver(devid) >= IP_VER(12, 50))
		fill = xehp_gpgpu_fillfunc;
	else if (IS_GEN12(devid))
		fill = gen12_gpgpu_fillfunc;
	else if (IS_GEN11(devid))
		fill = gen11_gpgpu_fillfunc;
	else if (IS_GEN9(devid) || IS_GEN10(devid))
		fill = gen9_gpgpu_fillfunc;
	else if (IS_GEN8(devid))
		fill = gen8_gpgpu_fillfunc;
	else if (IS_GEN7(devid))
		fill = gen7_gpgpu_fillfunc;

	return fill;
}

/**
 * igt_get_media_spinfunc:
 * @devid: pci device id
 *
 * Returns:
 *
 * The platform-specific media spin function pointer for the device specified
 * with @devid. Will return NULL when no media spin function is implemented.
 */
igt_media_spinfunc_t igt_get_media_spinfunc(int devid)
{
	igt_media_spinfunc_t spin = NULL;

	if (IS_GEN9(devid))
		spin = gen9_media_spinfunc;
	else if (IS_GEN8(devid))
		spin = gen8_media_spinfunc;

	return spin;
}

/* Intel batchbuffer v2 */
static bool intel_bb_debug_tree = false;

/*
 * __reallocate_objects:
 * @ibb: pointer to intel_bb
 *
 * Increases number of objects if necessary.
 */
static void __reallocate_objects(struct intel_bb *ibb)
{
	const uint32_t inc = 4096 / sizeof(*ibb->objects);

	if (ibb->num_objects == ibb->allocated_objects) {
		ibb->objects = realloc(ibb->objects,
				       sizeof(*ibb->objects) *
				       (inc + ibb->allocated_objects));

		igt_assert(ibb->objects);
		ibb->allocated_objects += inc;

		memset(&ibb->objects[ibb->num_objects],	0,
		       inc * sizeof(*ibb->objects));
	}
}

static inline uint64_t __intel_bb_get_offset(struct intel_bb *ibb,
					     uint32_t handle,
					     uint64_t size,
					     uint32_t alignment,
					     uint8_t pat_index)
{
	uint64_t offset;

	if (ibb->enforce_relocs)
		return 0;

	offset = __intel_allocator_alloc(ibb->allocator_handle, handle,
					 size, alignment, pat_index,
					 ALLOC_STRATEGY_NONE);
	igt_assert(offset != ALLOC_INVALID_ADDRESS);

	return offset;
}

/**
 * __intel_bb_create:
 * @fd: drm fd - i915 or xe
 * @ctx: for i915 context id, for xe engine id
 * @vm: for xe vm_id, unused for i915
 * @cfg: for i915 intel_ctx configuration, NULL for default context or legacy mode,
 *       unused for xe
 * @size: size of the batchbuffer
 * @do_relocs: use relocations or allocator
 * @allocator_type: allocator type, must be INTEL_ALLOCATOR_NONE for relocations
 *
 * intel-bb assumes it will work in one of two modes - with relocations or
 * with using allocator (currently RELOC and SIMPLE are implemented).
 * Some description is required to describe how they maintain the addresses.
 *
 * Before entering into each scenarios generic rule is intel-bb keeps objects
 * and their offsets in the internal cache and reuses in subsequent execs.
 *
 * 1. intel-bb with relocations (i915 only)
 *
 * Creating new intel-bb adds handle to cache implicitly and sets its address
 * to 0. Objects added to intel-bb later also have address 0 set for first run.
 * After calling execbuf cache is altered with new addresses. As intel-bb
 * works in reloc mode addresses are only suggestion to the driver and we
 * cannot be sure they won't change at next exec.
 *
 * 2. with allocator (i915 or xe)
 *
 * This mode is valid only for ppgtt. Addresses are acquired from allocator
 * and softpinned (i915) or vm-binded (xe). intel-bb cache must be then
 * coherent with allocator (simple is coherent, reloc partially [doesn't
 * support address reservation]).
 * When we do intel-bb reset with purging cache it has to reacquire addresses
 * from allocator (allocator should return same address - what is true for
 * simple and reloc allocators).
 *
 * If we do reset without purging caches we use addresses from intel-bb cache
 * during execbuf objects construction.
 *
 * If we do reset with purging caches allocator entries are freed as well.
 *
 * __intel_bb_create checks if a context configuration for intel_ctx_t was
 * passed in. If this is the case, it copies the information over to the
 * newly created batch buffer.
 *
 * Returns:
 *
 * Pointer the intel_bb, asserts on failure.
 */
static struct intel_bb *
__intel_bb_create(int fd, uint32_t ctx, uint32_t vm, const intel_ctx_cfg_t *cfg,
		  uint32_t size, bool do_relocs,
		  uint64_t start, uint64_t end, uint64_t alignment,
		  uint8_t allocator_type, enum allocator_strategy strategy)
{
	struct drm_i915_gem_exec_object2 *object;
	struct intel_bb *ibb = calloc(1, sizeof(*ibb));

	igt_assert(ibb);

	ibb->devid = intel_get_drm_devid(fd);
	ibb->gen = intel_gen(ibb->devid);
	ibb->ctx = ctx;

	ibb->fd = fd;
	ibb->driver = is_i915_device(fd) ? INTEL_DRIVER_I915 :
					   is_xe_device(fd) ? INTEL_DRIVER_XE : 0;
	igt_assert(ibb->driver);

	/*
	 * If we don't have full ppgtt driver can change our addresses
	 * so allocator is useless in this case. Just enforce relocations
	 * for such gens and don't use allocator at all.
	 */
	if (ibb->driver == INTEL_DRIVER_I915) {
		ibb->uses_full_ppgtt = gem_uses_full_ppgtt(fd);

		if (!alignment)
			alignment = gem_detect_safe_alignment(fd);

		ibb->alignment = alignment;
		ibb->gtt_size = gem_aperture_size(fd);
		ibb->handle = gem_create(fd, size);

		if (!ibb->uses_full_ppgtt)
			do_relocs = true;

		/*
		 * For softpin mode allocator has full control over offsets allocation
		 * so we want kernel to not interfere with this.
		 */
		if (do_relocs) {
			ibb->allows_obj_alignment = gem_allows_obj_alignment(fd);
			allocator_type = INTEL_ALLOCATOR_NONE;
		} else {
			/* Use safe start offset instead assuming 0x0 is safe */
			start = max_t(uint64_t, start, gem_detect_safe_start_offset(fd));

			/* if relocs are set we won't use an allocator */
			ibb->allocator_handle =
				intel_allocator_open_full(fd, ctx, start, end,
							  allocator_type,
							  strategy, 0);
		}

		ibb->vm_id = 0;
	} else {
		igt_assert(!do_relocs);

		if (!alignment)
			alignment = xe_get_default_alignment(fd);

		ibb->alignment = alignment;
		size = ALIGN(size + xe_cs_prefetch_size(fd), ibb->alignment);
		ibb->handle = xe_bo_create(fd, 0, size, vram_if_possible(fd, 0),
					   DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);

		/* Limit to 48-bit due to MI_* address limitation */
		ibb->gtt_size = 1ull << min_t(uint32_t, xe_va_bits(fd), 48);
		end = ibb->gtt_size;

		if (!vm) {
			igt_assert_f(!ctx, "No vm provided for engine");
			vm = xe_vm_create(fd, 0, 0);
		}

		ibb->uses_full_ppgtt = true;
		ibb->allocator_handle =
			intel_allocator_open_full(fd, vm, start, end,
						  allocator_type, strategy,
						  ibb->alignment);
		ibb->vm_id = vm;
		ibb->last_engine = ~0U;
	}

	ibb->allocator_type = allocator_type;
	ibb->allocator_strategy = strategy;
	ibb->allocator_start = start;
	ibb->allocator_end = end;
	ibb->enforce_relocs = do_relocs;

	ibb->size = size;
	ibb->batch = calloc(1, size);
	igt_assert(ibb->batch);
	ibb->ptr = ibb->batch;
	ibb->fence = -1;

	/* Cache context configuration */
	if (cfg) {
		ibb->cfg = malloc(sizeof(*cfg));
		igt_assert(ibb->cfg);
		memcpy(ibb->cfg, cfg, sizeof(*cfg));
	}

	if ((ibb->gtt_size - 1) >> 32)
		ibb->supports_48b_address = true;

	object = intel_bb_add_object(ibb, ibb->handle, ibb->size,
				     INTEL_BUF_INVALID_ADDRESS, ibb->alignment,
				     false);
	ibb->batch_offset = object->offset;

	IGT_INIT_LIST_HEAD(&ibb->intel_bufs);

	ibb->refcount = 1;

	if (intel_bb_do_tracking && ibb->allocator_type != INTEL_ALLOCATOR_NONE) {
		pthread_mutex_lock(&intel_bb_list_lock);
		igt_list_add(&ibb->link, &intel_bb_list);
		pthread_mutex_unlock(&intel_bb_list_lock);
	}

	return ibb;
}

/**
 * intel_bb_create_full:
 * @fd: drm fd - i915 or xe
 * @ctx: for i915 context id, for xe engine id
 * @vm: for xe vm_id, unused for i915
 * @cfg: intel_ctx configuration, NULL for default context or legacy mode
 * @size: size of the batchbuffer
 * @start: allocator vm start address
 * @end: allocator vm start address
 * @alignment: alignment to use for allocator, zero for default
 * @allocator_type: allocator type, SIMPLE, RELOC, ...
 * @strategy: allocation strategy
 *
 * Creates bb with context passed in @ctx, size in @size and allocator type
 * in @allocator_type. Relocations are set to false because IGT allocator
 * is used in that case. VM range is passed to allocator (@start and @end)
 * and allocation @strategy (suggestion to allocator about address allocation
 * preferences).
 *
 * Returns:
 *
 * Pointer the intel_bb, asserts on failure.
 */
struct intel_bb *intel_bb_create_full(int fd, uint32_t ctx, uint32_t vm,
				      const intel_ctx_cfg_t *cfg, uint32_t size,
				      uint64_t start, uint64_t end,
				      uint64_t alignment, uint8_t allocator_type,
				      enum allocator_strategy strategy)
{
	return __intel_bb_create(fd, ctx, vm, cfg, size, false, start, end,
				 alignment, allocator_type, strategy);
}

/**
 * intel_bb_create_with_allocator:
 * @fd: drm fd - i915 or xe
 * @ctx: for i915 context id, for xe engine id
 * @vm: for xe vm_id, unused for i915
 * @cfg: intel_ctx configuration, NULL for default context or legacy mode
 * @size: size of the batchbuffer
 * @allocator_type: allocator type, SIMPLE, RANDOM, ...
 *
 * Creates bb with context passed in @ctx, size in @size and allocator type
 * in @allocator_type. Relocations are set to false because IGT allocator
 * is used in that case.
 *
 * Returns:
 *
 * Pointer the intel_bb, asserts on failure.
 */
struct intel_bb *intel_bb_create_with_allocator(int fd, uint32_t ctx, uint32_t vm,
						const intel_ctx_cfg_t *cfg,
						uint32_t size,
						uint8_t allocator_type)
{
	return __intel_bb_create(fd, ctx, vm, cfg, size, false, 0, 0, 0,
				 allocator_type, ALLOC_STRATEGY_HIGH_TO_LOW);
}

static bool aux_needs_softpin(int fd)
{
	return intel_gen(intel_get_drm_devid(fd)) >= 12;
}

static bool has_ctx_cfg(struct intel_bb *ibb)
{
	return ibb->cfg && ibb->cfg->num_engines > 0;
}

/**
 * intel_bb_create:
 * @fd: drm fd - i915 or xe
 * @size: size of the batchbuffer
 *
 * Creates bb with default context.
 *
 * Returns:
 *
 * Pointer the intel_bb, asserts on failure.
 *
 * Notes:
 *
 * intel_bb must not be created in igt_fixture. The reason is intel_bb
 * "opens" connection to the allocator and when test completes it can
 * leave the allocator in unknown state (mostly for failed tests).
 * As igt_core was armed to reset the allocator infrastructure
 * connection to it inside intel_bb is not valid anymore.
 * Trying to use it leads to catastrofic errors.
 */
struct intel_bb *intel_bb_create(int fd, uint32_t size)
{
	bool relocs = is_i915_device(fd) && gem_has_relocations(fd);

	return __intel_bb_create(fd, 0, 0, NULL, size,
				 relocs && !aux_needs_softpin(fd), 0, 0, 0,
				 INTEL_ALLOCATOR_SIMPLE,
				 ALLOC_STRATEGY_HIGH_TO_LOW);
}

/**
 * intel_bb_create_with_context:
 * @fd: drm fd - i915 or xe
 * @ctx: for i915 context id, for xe engine id
 * @vm: for xe vm_id, unused for i915
 * @cfg: intel_ctx configuration, NULL for default context or legacy mode
 * @size: size of the batchbuffer
 *
 * Creates bb with context passed in @ctx and @cfg configuration (when
 * working with custom engines layout).
 *
 * Returns:
 *
 * Pointer the intel_bb, asserts on failure.
 */
struct intel_bb *
intel_bb_create_with_context(int fd, uint32_t ctx, uint32_t vm,
			     const intel_ctx_cfg_t *cfg, uint32_t size)
{
	bool relocs = is_i915_device(fd) && gem_has_relocations(fd);

	return __intel_bb_create(fd, ctx, vm, cfg, size,
				 relocs && !aux_needs_softpin(fd), 0, 0, 0,
				 INTEL_ALLOCATOR_SIMPLE,
				 ALLOC_STRATEGY_HIGH_TO_LOW);
}

/**
 * intel_bb_create_with_relocs:
 * @fd: drm fd - i915
 * @size: size of the batchbuffer
 *
 * Creates bb which will disable passing addresses.
 * This will lead to relocations when objects are not previously pinned.
 *
 * Returns:
 *
 * Pointer the intel_bb, asserts on failure.
 */
struct intel_bb *intel_bb_create_with_relocs(int fd, uint32_t size)
{
	igt_require(is_i915_device(fd) && gem_has_relocations(fd));

	return __intel_bb_create(fd, 0, 0, NULL, size, true, 0, 0, 0,
				 INTEL_ALLOCATOR_NONE, ALLOC_STRATEGY_NONE);
}

/**
 * intel_bb_create_with_relocs_and_context:
 * @fd: drm fd - i915
 * @ctx: context
 * @cfg: intel_ctx configuration, NULL for default context or legacy mode
 * @size: size of the batchbuffer
 *
 * Creates bb with default context which will disable passing addresses.
 * This will lead to relocations when objects are not previously pinned.
 *
 * Returns:
 *
 * Pointer the intel_bb, asserts on failure.
 */
struct intel_bb *
intel_bb_create_with_relocs_and_context(int fd, uint32_t ctx,
					const intel_ctx_cfg_t *cfg,
					uint32_t size)
{
	igt_require(is_i915_device(fd) && gem_has_relocations(fd));

	return __intel_bb_create(fd, ctx, 0, cfg, size, true, 0, 0, 0,
				 INTEL_ALLOCATOR_NONE, ALLOC_STRATEGY_NONE);
}

/**
 * intel_bb_create_no_relocs:
 * @fd: drm fd
 * @size: size of the batchbuffer
 *
 * Creates bb with disabled relocations.
 * This enables passing addresses and requires pinning objects.
 *
 * Returns:
 *
 * Pointer the intel_bb, asserts on failure.
 */
struct intel_bb *intel_bb_create_no_relocs(int fd, uint32_t size)
{
	igt_require(gem_uses_full_ppgtt(fd));

	return __intel_bb_create(fd, 0, 0, NULL, size, false, 0, 0, 0,
				 INTEL_ALLOCATOR_SIMPLE,
				 ALLOC_STRATEGY_HIGH_TO_LOW);
}

static void __intel_bb_destroy_relocations(struct intel_bb *ibb)
{
	uint32_t i;

	/* Free relocations */
	for (i = 0; i < ibb->num_objects; i++) {
		free(from_user_pointer(ibb->objects[i]->relocs_ptr));
		ibb->objects[i]->relocs_ptr = to_user_pointer(NULL);
		ibb->objects[i]->relocation_count = 0;
	}

	ibb->relocs = NULL;
	ibb->num_relocs = 0;
	ibb->allocated_relocs = 0;
}

static void __intel_bb_destroy_objects(struct intel_bb *ibb)
{
	free(ibb->objects);
	ibb->objects = NULL;

	tdestroy(ibb->current, free);
	ibb->current = NULL;

	ibb->num_objects = 0;
	ibb->allocated_objects = 0;
}

static void __intel_bb_destroy_cache(struct intel_bb *ibb)
{
	tdestroy(ibb->root, free);
	ibb->root = NULL;
}

static void __intel_bb_remove_intel_bufs(struct intel_bb *ibb)
{
	struct intel_buf *entry, *tmp;

	igt_list_for_each_entry_safe(entry, tmp, &ibb->intel_bufs, link)
		intel_bb_remove_intel_buf(ibb, entry);
}

/**
 * intel_bb_destroy:
 * @ibb: pointer to intel_bb
 *
 * Frees all relocations / objects allocated during filling the batch.
 */
void intel_bb_destroy(struct intel_bb *ibb)
{
	igt_assert(ibb);

	ibb->refcount--;
	igt_assert_f(ibb->refcount == 0, "Trying to destroy referenced bb!");

	__intel_bb_remove_intel_bufs(ibb);
	__intel_bb_destroy_relocations(ibb);
	__intel_bb_destroy_objects(ibb);
	__intel_bb_destroy_cache(ibb);

	if (ibb->allocator_type != INTEL_ALLOCATOR_NONE) {
		if (intel_bb_do_tracking) {
			pthread_mutex_lock(&intel_bb_list_lock);
			igt_list_del(&ibb->link);
			pthread_mutex_unlock(&intel_bb_list_lock);
		}

		intel_allocator_free(ibb->allocator_handle, ibb->handle);
		intel_allocator_close(ibb->allocator_handle);
	}
	gem_close(ibb->fd, ibb->handle);

	if (ibb->fence >= 0)
		close(ibb->fence);
	if (ibb->engine_syncobj)
		syncobj_destroy(ibb->fd, ibb->engine_syncobj);
	if (ibb->vm_id && !ibb->ctx)
		xe_vm_destroy(ibb->fd, ibb->vm_id);

	free(ibb->batch);
	free(ibb->cfg);
	free(ibb);
}

#define XE_OBJ_SIZE(rsvd1) ((rsvd1) & ~(SZ_4K-1))
#define XE_OBJ_PAT_IDX(rsvd1) ((rsvd1) & (SZ_4K-1))

static struct drm_xe_vm_bind_op *xe_alloc_bind_ops(struct intel_bb *ibb,
						   uint32_t op, uint32_t flags,
						   uint32_t prefetch_region)
{
	struct drm_i915_gem_exec_object2 **objects = ibb->objects;
	struct drm_xe_vm_bind_op *bind_ops, *ops;
	bool set_obj = (op & 0xffff) == DRM_XE_VM_BIND_OP_MAP;

	bind_ops = calloc(ibb->num_objects, sizeof(*bind_ops));
	igt_assert(bind_ops);

	igt_debug("bind_ops: %s\n", set_obj ? "MAP" : "UNMAP");
	for (int i = 0; i < ibb->num_objects; i++) {
		ops = &bind_ops[i];

		if (set_obj)
			ops->obj = objects[i]->handle;

		ops->op = op;
		ops->flags = flags;
		ops->obj_offset = 0;
		ops->addr = objects[i]->offset;
		ops->range = XE_OBJ_SIZE(objects[i]->rsvd1);
		ops->prefetch_mem_region_instance = prefetch_region;
		if (set_obj)
			ops->pat_index = XE_OBJ_PAT_IDX(objects[i]->rsvd1);

		igt_debug("  [%d]: handle: %u, offset: %llx, size: %llx pat_index: %u\n",
			  i, ops->obj, (long long)ops->addr, (long long)ops->range,
			  ops->pat_index);
	}

	return bind_ops;
}

static void __unbind_xe_objects(struct intel_bb *ibb)
{
	struct drm_xe_sync syncs[2] = {
		{ .type = DRM_XE_SYNC_TYPE_SYNCOBJ },
		{ .type = DRM_XE_SYNC_TYPE_SYNCOBJ, .flags = DRM_XE_SYNC_FLAG_SIGNAL, },
	};
	int ret;

	syncs[0].handle = ibb->engine_syncobj;
	syncs[1].handle = syncobj_create(ibb->fd, 0);

	if (ibb->num_objects > 1) {
		struct drm_xe_vm_bind_op *bind_ops;
		uint32_t op = DRM_XE_VM_BIND_OP_UNMAP;

		bind_ops = xe_alloc_bind_ops(ibb, op, 0, 0);
		xe_vm_bind_array(ibb->fd, ibb->vm_id, 0, bind_ops,
				 ibb->num_objects, syncs, 2);
		free(bind_ops);
	} else {
		igt_debug("bind: UNMAP\n");
		igt_debug("  offset: %llx, size: %llx\n",
			  (long long)ibb->batch_offset, (long long)ibb->size);
		xe_vm_unbind_async(ibb->fd, ibb->vm_id, 0, 0,
				   ibb->batch_offset, ibb->size, syncs, 2);
	}
	ret = syncobj_wait_err(ibb->fd, &syncs[1].handle, 1, INT64_MAX, 0);
	igt_assert_eq(ret, 0);
	syncobj_destroy(ibb->fd, syncs[1].handle);

	ibb->xe_bound = false;
}

/*
 * intel_bb_reset:
 * @ibb: pointer to intel_bb
 * @purge_objects_cache: if true destroy internal execobj and relocs + cache
 *
 * Recreate batch bo when there's no additional reference.
 *
 * When purge_object_cache == true we destroy cache as well as remove intel_buf
 * from intel-bb tracking list. Removing intel_bufs releases their addresses
 * in the allocator.
*/

void intel_bb_reset(struct intel_bb *ibb, bool purge_objects_cache)
{
	uint32_t i;

	if (purge_objects_cache && ibb->refcount > 1)
		igt_warn("Cannot purge objects cache on bb, refcount > 1!");

	/* Someone keeps reference, just exit */
	if (ibb->refcount > 1)
		return;

	/*
	 * To avoid relocation objects previously pinned to high virtual
	 * addresses should keep 48bit flag. Ensure we won't clear it
	 * in the reset path.
	 */
	for (i = 0; i < ibb->num_objects; i++)
		ibb->objects[i]->flags &= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

	if (ibb->driver == INTEL_DRIVER_XE && ibb->xe_bound)
		__unbind_xe_objects(ibb);

	__intel_bb_destroy_relocations(ibb);
	__intel_bb_destroy_objects(ibb);
	__reallocate_objects(ibb);

	if (purge_objects_cache) {
		__intel_bb_remove_intel_bufs(ibb);
		__intel_bb_destroy_cache(ibb);
	}

	/*
	 * When we use allocators we're in no-reloc mode so we have to free
	 * and reacquire offset (ibb->handle can change in multiprocess
	 * environment). We also have to remove and add it again to
	 * objects and cache tree.
	 */
	if (ibb->allocator_type != INTEL_ALLOCATOR_NONE && !purge_objects_cache)
		intel_bb_remove_object(ibb, ibb->handle, ibb->batch_offset,
				       ibb->size);

	gem_close(ibb->fd, ibb->handle);
	if (ibb->driver == INTEL_DRIVER_I915)
		ibb->handle = gem_create(ibb->fd, ibb->size);
	else
		ibb->handle = xe_bo_create(ibb->fd, 0, ibb->size,
					   vram_if_possible(ibb->fd, 0),
					   DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);

	/* Reacquire offset for RELOC and SIMPLE */
	if (ibb->allocator_type == INTEL_ALLOCATOR_SIMPLE ||
	    ibb->allocator_type == INTEL_ALLOCATOR_RELOC)
		ibb->batch_offset = __intel_bb_get_offset(ibb,
							  ibb->handle,
							  ibb->size,
							  ibb->alignment,
							  DEFAULT_PAT_INDEX);

	intel_bb_add_object(ibb, ibb->handle, ibb->size,
			    ibb->batch_offset,
			    ibb->alignment, false);
	ibb->ptr = ibb->batch;
	memset(ibb->batch, 0, ibb->size);
}

/*
 * intel_bb_sync:
 * @ibb: pointer to intel_bb
 *
 * Waits for bb completion. Returns 0 on success, otherwise errno.
 */
int intel_bb_sync(struct intel_bb *ibb)
{
	int ret;

	if (ibb->fence < 0 && !ibb->engine_syncobj)
		return 0;

	if (ibb->fence >= 0) {
		ret = sync_fence_wait(ibb->fence, -1);
		if (ret == 0) {
			close(ibb->fence);
			ibb->fence = -1;
		}
	} else {
		igt_assert_neq(ibb->engine_syncobj, 0);
		ret = syncobj_wait_err(ibb->fd, &ibb->engine_syncobj,
				       1, INT64_MAX, 0);
	}

	return ret;
}

/*
 * intel_bb_print:
 * @ibb: pointer to intel_bb
 *
 * Prints batch to stdout.
 */
void intel_bb_print(struct intel_bb *ibb)
{
	igt_info("drm fd: %d, gen: %d, devid: %u, debug: %d\n",
		 ibb->fd, ibb->gen, ibb->devid, ibb->debug);
	igt_info("handle: %u, size: %u, batch: %p, ptr: %p\n",
		 ibb->handle, ibb->size, ibb->batch, ibb->ptr);
	igt_info("gtt_size: %" PRIu64 ", supports 48bit: %d\n",
		 ibb->gtt_size, ibb->supports_48b_address);
	igt_info("ctx: %u\n", ibb->ctx);
	igt_info("root: %p\n", ibb->root);
	igt_info("objects: %p, num_objects: %u, allocated obj: %u\n",
		 ibb->objects, ibb->num_objects, ibb->allocated_objects);
	igt_info("relocs: %p, num_relocs: %u, allocated_relocs: %u\n----\n",
		 ibb->relocs, ibb->num_relocs, ibb->allocated_relocs);
}

/*
 * intel_bb_dump:
 * @ibb: pointer to intel_bb
 * @filename: name to which write bb
 * @in_hex: dump bb in hex form
 *
 * Dump batch bo to file.
 */
void intel_bb_dump(struct intel_bb *ibb, const char *filename, bool in_hex)
{
	FILE *out;
	void *ptr;

	/*
	 * Note - for i915/relocations offsets inside batch are not resolved
	 * until intel_bb_exec() will write collected instructions to bb
	 * object. For i915/xe with allocator offsets are already acquired
	 * and bb is complete so there's no need to map.
	 */
	if (ibb->driver == INTEL_DRIVER_I915 && ibb->enforce_relocs)
		ptr = gem_mmap__device_coherent(ibb->fd, ibb->handle, 0,
						ibb->size, PROT_READ);
	else
		ptr = ibb->batch;

	out = fopen(filename, "wb");
	igt_assert(out);

	if (in_hex) {
		for (int i = 0; i < ibb->size / sizeof(uint32_t); i++)
			fprintf(out, "%08x\n", ((uint32_t *)ptr)[i]);
	} else {
		fwrite(ptr, ibb->size, 1, out);
	}
	fclose(out);

	if (ptr != ibb->batch)
		munmap(ptr, ibb->size);
}

/**
 * intel_bb_set_debug:
 * @ibb: pointer to intel_bb
 * @debug: true / false
 *
 * Sets debug to true / false. Execbuf is then called synchronously and
 * object/reloc arrays are printed after execution.
 */
void intel_bb_set_debug(struct intel_bb *ibb, bool debug)
{
	ibb->debug = debug;
}

/**
 * intel_bb_set_dump_base64:
 * @ibb: pointer to intel_bb
 * @dump: true / false
 *
 * Do bb dump as base64 string before execbuf call.
 */
void intel_bb_set_dump_base64(struct intel_bb *ibb, bool dump)
{
	ibb->dump_base64 = dump;
}

static int __compare_objects(const void *p1, const void *p2)
{
	const struct drm_i915_gem_exec_object2 *o1 = p1, *o2 = p2;

	return (int) ((int64_t) o1->handle - (int64_t) o2->handle);
}

static struct drm_i915_gem_exec_object2 *
__add_to_cache(struct intel_bb *ibb, uint32_t handle)
{
	struct drm_i915_gem_exec_object2 **found, *object;

	object = malloc(sizeof(*object));
	igt_assert(object);

	object->handle = handle;
	object->alignment = 0;
	found = tsearch((void *) object, &ibb->root, __compare_objects);

	if (*found == object) {
		memset(object, 0, sizeof(*object));
		object->handle = handle;
		object->offset = INTEL_BUF_INVALID_ADDRESS;
	} else {
		free(object);
		object = *found;
	}

	return object;
}

static bool __remove_from_cache(struct intel_bb *ibb, uint32_t handle)
{
	struct drm_i915_gem_exec_object2 **found, *object;

	object = intel_bb_find_object(ibb, handle);
	if (!object) {
		igt_warn("Object: handle: %u not found\n", handle);
		return false;
	}

	found = tdelete((void *) object, &ibb->root, __compare_objects);
	if (!found)
		return false;

	free(object);

	return true;
}

static int __compare_handles(const void *p1, const void *p2)
{
	return (int) (*(int32_t *) p1 - *(int32_t *) p2);
}

static void __add_to_objects(struct intel_bb *ibb,
			     struct drm_i915_gem_exec_object2 *object)
{
	uint32_t **found, *handle;

	handle = malloc(sizeof(*handle));
	igt_assert(handle);

	*handle = object->handle;
	found = tsearch((void *) handle, &ibb->current, __compare_handles);

	if (*found == handle) {
		__reallocate_objects(ibb);
		igt_assert(ibb->num_objects < ibb->allocated_objects);
		ibb->objects[ibb->num_objects++] = object;
	} else {
		free(handle);
	}
}

static void __remove_from_objects(struct intel_bb *ibb,
				  struct drm_i915_gem_exec_object2 *object)
{
	uint32_t i, **handle, *to_free;
	bool found = false;

	for (i = 0; i < ibb->num_objects; i++) {
		if (ibb->objects[i] == object) {
			found = true;
			break;
		}
	}

	/*
	 * When we reset bb (without purging) we have:
	 * 1. cache which contains all cached objects
	 * 2. objects array which contains only bb object (cleared in reset
	 *    path with bb object added at the end)
	 * So !found is normal situation and no warning is added here.
	 */
	if (!found)
		return;

	ibb->num_objects--;
	if (i < ibb->num_objects)
		memmove(&ibb->objects[i], &ibb->objects[i + 1],
			sizeof(object) * (ibb->num_objects - i));

	handle = tfind((void *) &object->handle,
		       &ibb->current, __compare_handles);
	if (!handle) {
		igt_warn("Object %u doesn't exist in the tree, can't remove",
			 object->handle);
		return;
	}

	to_free = *handle;
	tdelete((void *) &object->handle, &ibb->current, __compare_handles);
	free(to_free);
}

/**
 * __intel_bb_add_object:
 * @ibb: pointer to intel_bb
 * @handle: which handle to add to objects array
 * @size: object size
 * @offset: presumed offset of the object when no relocation is enforced
 * @alignment: alignment of the object, if 0 it will be set to page size
 * @write: does a handle is a render target
 *
 * Function adds or updates execobj slot in bb objects array and
 * in the object tree. When object is a render target it has to
 * be marked with EXEC_OBJECT_WRITE flag.
 */
static struct drm_i915_gem_exec_object2 *
__intel_bb_add_object(struct intel_bb *ibb, uint32_t handle, uint64_t size,
		      uint64_t offset, uint64_t alignment, uint8_t pat_index,
		      bool write)
{
	struct drm_i915_gem_exec_object2 *object;

	igt_assert(INVALID_ADDR(offset) || alignment == 0
		   || ALIGN(offset, alignment) == offset);
	igt_assert(is_power_of_two(alignment));

	if (ibb->driver == INTEL_DRIVER_I915)
		alignment = max_t(uint64_t, alignment, gem_detect_safe_alignment(ibb->fd));
	else
		alignment = max_t(uint64_t, ibb->alignment, alignment);

	object = __add_to_cache(ibb, handle);
	__add_to_objects(ibb, object);

	/*
	 * If object->offset == INVALID_ADDRESS we added freshly object to the
	 * cache. In that case we have two choices:
	 * a) get new offset (passed offset was invalid)
	 * b) use offset passed in the call (valid)
	 */
	if (INVALID_ADDR(object->offset)) {
		if (INVALID_ADDR(offset)) {
			offset = __intel_bb_get_offset(ibb, handle, size,
						       alignment, pat_index);
		} else {
			offset = offset & (roundup_power_of_two(ibb->gtt_size) - 1);

			/*
			 * For simple allocator check entry consistency
			 * - reserve if it is not already allocated.
			 */
			if (ibb->allocator_type == INTEL_ALLOCATOR_SIMPLE) {
				bool allocated, reserved;

				reserved = intel_allocator_reserve_if_not_allocated(ibb->allocator_handle,
										    handle, size, offset,
										    &allocated);
				igt_assert_f(allocated || reserved,
					     "Can't get offset, allocated: %d, reserved: %d\n",
					     allocated, reserved);
			}
		}
	} else {
		/*
		 * This assertion makes sense only when we have to be consistent
		 * with underlying allocator. For relocations and when !ppgtt
		 * we can expect addresses passed by the user can be moved
		 * within the driver.
		 */
		if (ibb->allocator_type == INTEL_ALLOCATOR_SIMPLE)
			igt_assert_f(object->offset == offset,
				     "(pid: %ld) handle: %u, offset not match: %" PRIx64 " <> %" PRIx64 "\n",
				     (long) getpid(), handle,
				     (uint64_t) object->offset,
				     offset);
	}

	object->offset = offset;

	if (write)
		object->flags |= EXEC_OBJECT_WRITE;

	if (ibb->supports_48b_address)
		object->flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

	if (ibb->uses_full_ppgtt && !ibb->enforce_relocs)
		object->flags |= EXEC_OBJECT_PINNED;

	if (ibb->allows_obj_alignment)
		object->alignment = alignment;

	if (ibb->driver == INTEL_DRIVER_XE) {
		object->alignment = alignment;
		object->rsvd1 = size;
		igt_assert(!XE_OBJ_PAT_IDX(object->rsvd1));

		if (pat_index == DEFAULT_PAT_INDEX)
			pat_index = intel_get_pat_idx_wb(ibb->fd);

		/*
		 * XXX: For now encode the pat_index in the first few bits of
		 * rsvd1. intel_batchbuffer should really stop using the i915
		 * drm_i915_gem_exec_object2 to encode VMA placement
		 * information on xe...
		 */
		object->rsvd1 |= pat_index;
	}

	return object;
}

struct drm_i915_gem_exec_object2 *
intel_bb_add_object(struct intel_bb *ibb, uint32_t handle, uint64_t size,
		    uint64_t offset, uint64_t alignment, bool write)
{
	struct drm_i915_gem_exec_object2 *obj = NULL;

	obj = __intel_bb_add_object(ibb, handle, size, offset,
				    alignment, DEFAULT_PAT_INDEX, write);
	igt_assert(obj);

	return obj;
}

bool intel_bb_remove_object(struct intel_bb *ibb, uint32_t handle,
			    uint64_t offset, uint64_t size)
{
	struct drm_i915_gem_exec_object2 *object;
	bool is_reserved;

	object = intel_bb_find_object(ibb, handle);
	if (!object)
		return false;

	if (ibb->allocator_type != INTEL_ALLOCATOR_NONE) {
		intel_allocator_free(ibb->allocator_handle, handle);
		is_reserved = intel_allocator_is_reserved(ibb->allocator_handle,
							  size, offset);
		if (is_reserved)
			intel_allocator_unreserve(ibb->allocator_handle, handle,
						  size, offset);
	}

	__remove_from_objects(ibb, object);
	__remove_from_cache(ibb, handle);

	return true;
}

static struct drm_i915_gem_exec_object2 *
__intel_bb_add_intel_buf(struct intel_bb *ibb, struct intel_buf *buf,
			 uint64_t alignment, bool write)
{
	struct drm_i915_gem_exec_object2 *obj;

	igt_assert(ibb);
	igt_assert(buf);
	igt_assert(!buf->ibb || buf->ibb == ibb);
	igt_assert(ALIGN(alignment, 4096) == alignment);

	if (!alignment) {
		alignment = 0x1000;

		/*
		 * TODO:
		 * Find out why MTL need special alignment, spec says 32k
		 * is enough for MTL.
		 */
		if (ibb->gen >= 12 && buf->compression)
			alignment = IS_METEORLAKE(ibb->devid) ? 0x100000 : 0x10000;

		/* For gen3 ensure tiled buffers are aligned to power of two size */
		if (ibb->gen == 3 && buf->tiling) {
			alignment = 1024 * 1024;

			while (alignment < buf->surface[0].size)
				alignment <<= 1;
		}
	}

	obj = __intel_bb_add_object(ibb, buf->handle, intel_buf_bo_size(buf),
				    buf->addr.offset, alignment, buf->pat_index,
				    write);
	igt_assert(obj);
	buf->addr.offset = obj->offset;

	if (igt_list_empty(&buf->link)) {
		igt_list_add_tail(&buf->link, &ibb->intel_bufs);
		buf->ibb = ibb;
	} else {
		igt_assert(buf->ibb == ibb);
	}

	return obj;
}

struct drm_i915_gem_exec_object2 *
intel_bb_add_intel_buf(struct intel_bb *ibb, struct intel_buf *buf, bool write)
{
	return __intel_bb_add_intel_buf(ibb, buf, 0, write);
}

struct drm_i915_gem_exec_object2 *
intel_bb_add_intel_buf_with_alignment(struct intel_bb *ibb, struct intel_buf *buf,
				      uint64_t alignment, bool write)
{
	return __intel_bb_add_intel_buf(ibb, buf, alignment, write);
}

bool intel_bb_remove_intel_buf(struct intel_bb *ibb, struct intel_buf *buf)
{
	bool removed;

	igt_assert(ibb);
	igt_assert(buf);
	igt_assert(!buf->ibb || buf->ibb == ibb);

	if (igt_list_empty(&buf->link))
		return false;

	removed = intel_bb_remove_object(ibb, buf->handle,
					 buf->addr.offset,
					 intel_buf_bo_size(buf));
	if (removed) {
		buf->addr.offset = INTEL_BUF_INVALID_ADDRESS;
		buf->ibb = NULL;
		igt_list_del_init(&buf->link);
	}

	return removed;
}

void intel_bb_print_intel_bufs(struct intel_bb *ibb)
{
	struct intel_buf *entry;

	igt_list_for_each_entry(entry, &ibb->intel_bufs, link) {
		igt_info("handle: %u, ibb: %p, offset: %lx\n",
			 entry->handle, entry->ibb,
			 (long) entry->addr.offset);
	}
}

struct drm_i915_gem_exec_object2 *
intel_bb_find_object(struct intel_bb *ibb, uint32_t handle)
{
	struct drm_i915_gem_exec_object2 object = { .handle = handle };
	struct drm_i915_gem_exec_object2 **found;

	found = tfind((void *) &object, &ibb->root, __compare_objects);
	if (!found)
		return NULL;

	return *found;
}

bool
intel_bb_object_set_flag(struct intel_bb *ibb, uint32_t handle, uint64_t flag)
{
	struct drm_i915_gem_exec_object2 object = { .handle = handle };
	struct drm_i915_gem_exec_object2 **found;

	igt_assert_f(ibb->root, "Trying to search in null tree\n");

	found = tfind((void *) &object, &ibb->root, __compare_objects);
	if (!found) {
		igt_warn("Trying to set fence on not found handle: %u\n",
			 handle);
		return false;
	}

	(*found)->flags |= flag;

	return true;
}

bool
intel_bb_object_clear_flag(struct intel_bb *ibb, uint32_t handle, uint64_t flag)
{
	struct drm_i915_gem_exec_object2 object = { .handle = handle };
	struct drm_i915_gem_exec_object2 **found;

	found = tfind((void *) &object, &ibb->root, __compare_objects);
	if (!found) {
		igt_warn("Trying to set fence on not found handle: %u\n",
			 handle);
		return false;
	}

	(*found)->flags &= ~flag;

	return true;
}

/*
 * intel_bb_add_reloc:
 * @ibb: pointer to intel_bb
 * @to_handle: object handle in which do relocation
 * @handle: object handle which address will be taken to patch the @to_handle
 * @read_domains: gem domain bits for the relocation
 * @write_domain: gem domain bit for the relocation
 * @delta: delta value to add to @buffer's gpu address
 * @offset: offset within bb to be patched
 *
 * When relocations are requested function allocates additional relocation slot
 * in reloc array for a handle.
 * Object must be previously added to bb.
 */
static uint64_t intel_bb_add_reloc(struct intel_bb *ibb,
				   uint32_t to_handle,
				   uint32_t handle,
				   uint32_t read_domains,
				   uint32_t write_domain,
				   uint64_t delta,
				   uint64_t offset,
				   uint64_t presumed_offset)
{
	struct drm_i915_gem_relocation_entry *relocs;
	struct drm_i915_gem_exec_object2 *object, *to_object;
	uint32_t i;

	object = intel_bb_find_object(ibb, handle);
	igt_assert(object);

	/* In no-reloc mode we just return the previously assigned address */
	if (!ibb->enforce_relocs)
		goto out;

	/* For ibb we have relocs allocated in chunks */
	if (to_handle == ibb->handle) {
		relocs = ibb->relocs;
		if (ibb->num_relocs == ibb->allocated_relocs) {
			ibb->allocated_relocs += 4096 / sizeof(*relocs);
			relocs = realloc(relocs, sizeof(*relocs) * ibb->allocated_relocs);
			igt_assert(relocs);
			ibb->relocs = relocs;
		}
		i = ibb->num_relocs++;
	} else {
		to_object = intel_bb_find_object(ibb, to_handle);
		igt_assert_f(to_object, "object has to be added to ibb first!\n");

		i = to_object->relocation_count++;
		relocs = from_user_pointer(to_object->relocs_ptr);
		relocs = realloc(relocs, sizeof(*relocs) * to_object->relocation_count);
		to_object->relocs_ptr = to_user_pointer(relocs);
		igt_assert(relocs);
	}

	memset(&relocs[i], 0, sizeof(*relocs));
	relocs[i].target_handle = handle;
	relocs[i].read_domains = read_domains;
	relocs[i].write_domain = write_domain;
	relocs[i].delta = delta;
	relocs[i].offset = offset;
	if (ibb->enforce_relocs)
		relocs[i].presumed_offset = -1;
	else
		relocs[i].presumed_offset = object->offset;

	igt_debug("add reloc: to_handle: %u, handle: %u, r/w: 0x%x/0x%x, "
		  "delta: 0x%" PRIx64 ", "
		  "offset: 0x%" PRIx64 ", "
		  "poffset: %p\n",
		  to_handle, handle, read_domains, write_domain,
		  delta, offset,
		  from_user_pointer(relocs[i].presumed_offset));

out:
	return object->offset;
}

static uint64_t __intel_bb_emit_reloc(struct intel_bb *ibb,
				      uint32_t to_handle,
				      uint32_t to_offset,
				      uint32_t handle,
				      uint32_t read_domains,
				      uint32_t write_domain,
				      uint64_t delta,
				      uint64_t presumed_offset)
{
	uint64_t address;

	igt_assert(ibb);

	address = intel_bb_add_reloc(ibb, to_handle, handle,
				     read_domains, write_domain,
				     delta, to_offset,
				     presumed_offset);

	intel_bb_out(ibb, delta + address);
	if (ibb->gen >= 8)
		intel_bb_out(ibb, (delta + address) >> 32);

	return address;
}

/**
 * intel_bb_emit_reloc:
 * @ibb: pointer to intel_bb
 * @handle: object handle which address will be taken to patch the bb
 * @read_domains: gem domain bits for the relocation
 * @write_domain: gem domain bit for the relocation
 * @delta: delta value to add to @buffer's gpu address
 * @presumed_offset: address of the object in address space. If -1 is passed
 * then final offset of the object will be randomized (for no-reloc bb) or
 * 0 (for reloc bb, in that case reloc.presumed_offset will be -1). In
 * case address is known it should passed in @presumed_offset (for no-reloc).
 * @write: does a handle is a render target
 *
 * Function prepares relocation (execobj if required + reloc) and emits
 * offset in bb. For I915_EXEC_NO_RELOC presumed_offset is a hint we already
 * have object in valid place and relocation step can be skipped in this case.
 *
 * Note: delta is value added to address, mostly used when some instructions
 * require modify-bit set to apply change. Which delta is valid depends
 * on instruction (see instruction specification).
 */
uint64_t intel_bb_emit_reloc(struct intel_bb *ibb,
			     uint32_t handle,
			     uint32_t read_domains,
			     uint32_t write_domain,
			     uint64_t delta,
			     uint64_t presumed_offset)
{
	igt_assert(ibb);

	return __intel_bb_emit_reloc(ibb, ibb->handle, intel_bb_offset(ibb),
				     handle, read_domains, write_domain,
				     delta, presumed_offset);
}

uint64_t intel_bb_emit_reloc_fenced(struct intel_bb *ibb,
				    uint32_t handle,
				    uint32_t read_domains,
				    uint32_t write_domain,
				    uint64_t delta,
				    uint64_t presumed_offset)
{
	uint64_t address;

	address = intel_bb_emit_reloc(ibb, handle, read_domains, write_domain,
				      delta, presumed_offset);

	intel_bb_object_set_flag(ibb, handle, EXEC_OBJECT_NEEDS_FENCE);

	return address;
}

/**
 * intel_bb_offset_reloc:
 * @ibb: pointer to intel_bb
 * @handle: object handle which address will be taken to patch the bb
 * @read_domains: gem domain bits for the relocation
 * @write_domain: gem domain bit for the relocation
 * @offset: offset within bb to be patched
 * @presumed_offset: address of the object in address space. If -1 is passed
 * then final offset of the object will be randomized (for no-reloc bb) or
 * 0 (for reloc bb, in that case reloc.presumed_offset will be -1). In
 * case address is known it should passed in @presumed_offset (for no-reloc).
 *
 * Function prepares relocation (execobj if required + reloc). It it used
 * for editing batchbuffer via modifying structures. It means when we're
 * preparing batchbuffer it is more descriptive to edit the structure
 * than emitting dwords. But it require for some fields to point the
 * relocation. For that case @offset is passed by the user and it points
 * to the offset in bb where the relocation will be applied.
 */
uint64_t intel_bb_offset_reloc(struct intel_bb *ibb,
			       uint32_t handle,
			       uint32_t read_domains,
			       uint32_t write_domain,
			       uint32_t offset,
			       uint64_t presumed_offset)
{
	igt_assert(ibb);

	return intel_bb_add_reloc(ibb, ibb->handle, handle,
				  read_domains, write_domain,
				  0, offset, presumed_offset);
}

uint64_t intel_bb_offset_reloc_with_delta(struct intel_bb *ibb,
					  uint32_t handle,
					  uint32_t read_domains,
					  uint32_t write_domain,
					  uint32_t delta,
					  uint32_t offset,
					  uint64_t presumed_offset)
{
	igt_assert(ibb);

	return intel_bb_add_reloc(ibb, ibb->handle, handle,
				  read_domains, write_domain,
				  delta, offset, presumed_offset);
}

uint64_t intel_bb_offset_reloc_to_object(struct intel_bb *ibb,
					 uint32_t to_handle,
					 uint32_t handle,
					 uint32_t read_domains,
					 uint32_t write_domain,
					 uint32_t delta,
					 uint32_t offset,
					 uint64_t presumed_offset)
{
	igt_assert(ibb);

	return intel_bb_add_reloc(ibb, to_handle, handle,
				  read_domains, write_domain,
				  delta, offset, presumed_offset);
}

/*
 * @intel_bb_set_pxp:
 * @ibb: pointer to intel_bb
 * @new_state: enable or disable pxp session
 * @apptype: pxp session input identifies what type of session to enable
 * @appid: pxp session input provides which appid to use
 *
 * This function merely stores the pxp state and session information to
 * be retrieved and programmed later by supporting libraries such as
 * gen12_render_copy that must program the HW within the same dispatch
 */
void intel_bb_set_pxp(struct intel_bb *ibb, bool new_state,
		      uint32_t apptype, uint32_t appid)
{
	igt_assert(ibb);

	ibb->pxp.enabled = new_state;
	ibb->pxp.apptype = new_state ? apptype : 0;
	ibb->pxp.appid   = new_state ? appid : 0;
}

static void intel_bb_dump_execbuf(struct intel_bb *ibb,
				  struct drm_i915_gem_execbuffer2 *execbuf)
{
	struct drm_i915_gem_exec_object2 *objects;
	struct drm_i915_gem_relocation_entry *relocs, *reloc;
	int i, j;
	uint64_t address;

	igt_debug("execbuf [pid: %ld, fd: %d, ctx: %u]\n",
		  (long) getpid(), ibb->fd, ibb->ctx);
	igt_debug("execbuf batch len: %u, start offset: 0x%x, "
		  "DR1: 0x%x, DR4: 0x%x, "
		  "num clip: %u, clipptr: 0x%llx, "
		  "flags: 0x%llx, rsvd1: 0x%llx, rsvd2: 0x%llx\n",
		  execbuf->batch_len, execbuf->batch_start_offset,
		  execbuf->DR1, execbuf->DR4,
		  execbuf->num_cliprects, execbuf->cliprects_ptr,
		  execbuf->flags, execbuf->rsvd1, execbuf->rsvd2);

	igt_debug("execbuf buffer_count: %d\n", execbuf->buffer_count);
	for (i = 0; i < execbuf->buffer_count; i++) {
		objects = &((struct drm_i915_gem_exec_object2 *)
			    from_user_pointer(execbuf->buffers_ptr))[i];
		relocs = from_user_pointer(objects->relocs_ptr);
		address = objects->offset;
		igt_debug(" [%d] handle: %u, reloc_count: %d, reloc_ptr: %p, "
			  "align: 0x%llx, offset: 0x%" PRIx64 ", flags: 0x%llx, "
			  "rsvd1: 0x%llx, rsvd2: 0x%llx\n",
			  i, objects->handle, objects->relocation_count,
			  relocs,
			  objects->alignment,
			  address,
			  objects->flags,
			  objects->rsvd1, objects->rsvd2);
		if (objects->relocation_count) {
			igt_debug("\texecbuf relocs:\n");
			for (j = 0; j < objects->relocation_count; j++) {
				reloc = &relocs[j];
				address = reloc->presumed_offset;
				igt_debug("\t [%d] target handle: %u, "
					  "offset: 0x%llx, delta: 0x%x, "
					  "presumed_offset: 0x%" PRIx64 ", "
					  "read_domains: 0x%x, "
					  "write_domain: 0x%x\n",
					  j, reloc->target_handle,
					  reloc->offset, reloc->delta,
					  address,
					  reloc->read_domains,
					  reloc->write_domain);
			}
		}
	}
}

static void intel_bb_dump_base64(struct intel_bb *ibb, int linelen)
{
	int outsize;
	gchar *str, *pos;

	igt_info("--- bb ---\n");
	pos = str = g_base64_encode((const guchar *) ibb->batch, ibb->size);
	outsize = strlen(str);

	while (outsize > 0) {
		igt_info("%.*s\n", min(outsize, linelen), pos);
		pos += linelen;
		outsize -= linelen;
	}

	free(str);
}

static void print_node(const void *node, VISIT which, int depth)
{
	const struct drm_i915_gem_exec_object2 *object =
		*(const struct drm_i915_gem_exec_object2 **) node;
	(void) depth;

	switch (which) {
	case preorder:
	case endorder:
		break;

	case postorder:
	case leaf:
		igt_info("\t handle: %u, offset: 0x%" PRIx64 "\n",
			 object->handle, (uint64_t) object->offset);
		break;
	}
}

void intel_bb_dump_cache(struct intel_bb *ibb)
{
	igt_info("[pid: %ld] dump cache\n", (long) getpid());
	twalk(ibb->root, print_node);
}

static struct drm_i915_gem_exec_object2 *
create_objects_array(struct intel_bb *ibb)
{
	struct drm_i915_gem_exec_object2 *objects;
	uint32_t i;

	objects = malloc(sizeof(*objects) * ibb->num_objects);
	igt_assert(objects);

	for (i = 0; i < ibb->num_objects; i++) {
		objects[i] = *(ibb->objects[i]);
		objects[i].offset = CANONICAL(objects[i].offset);
	}

	return objects;
}

static void update_offsets(struct intel_bb *ibb,
			   struct drm_i915_gem_exec_object2 *objects)
{
	struct drm_i915_gem_exec_object2 *object;
	struct intel_buf *entry;
	uint32_t i;

	for (i = 0; i < ibb->num_objects; i++) {
		object = intel_bb_find_object(ibb, objects[i].handle);
		igt_assert(object);

		object->offset = DECANONICAL(objects[i].offset);

		if (i == 0)
			ibb->batch_offset = object->offset;
	}

	igt_list_for_each_entry(entry, &ibb->intel_bufs, link) {
		object = intel_bb_find_object(ibb, entry->handle);
		igt_assert(object);

		if (ibb->allocator_type == INTEL_ALLOCATOR_SIMPLE)
			igt_assert(object->offset == entry->addr.offset);
		else
			entry->addr.offset = object->offset;

		entry->addr.ctx = ibb->ctx;
	}
}

#define LINELEN 76

static int
__xe_bb_exec(struct intel_bb *ibb, uint64_t flags, bool sync)
{
	uint32_t engine = flags & (I915_EXEC_BSD_MASK | I915_EXEC_RING_MASK);
	uint32_t engine_id;
	struct drm_xe_sync syncs[2] = {
		{ .type = DRM_XE_SYNC_TYPE_SYNCOBJ, .flags = DRM_XE_SYNC_FLAG_SIGNAL, },
		{ .type = DRM_XE_SYNC_TYPE_SYNCOBJ, .flags = DRM_XE_SYNC_FLAG_SIGNAL, },
	};
	struct drm_xe_vm_bind_op *bind_ops;
	void *map;

	igt_assert_eq(ibb->num_relocs, 0);
	igt_assert_eq(ibb->xe_bound, false);

	if (ibb->ctx) {
		engine_id = ibb->ctx;
	} else if (ibb->last_engine != engine) {
		struct drm_xe_engine_class_instance inst = { };

		inst.engine_instance =
			(flags & I915_EXEC_BSD_MASK) >> I915_EXEC_BSD_SHIFT;

		switch (flags & I915_EXEC_RING_MASK) {
		case I915_EXEC_DEFAULT:
		case I915_EXEC_BLT:
			inst.engine_class = DRM_XE_ENGINE_CLASS_COPY;
			break;
		case I915_EXEC_BSD:
			inst.engine_class = DRM_XE_ENGINE_CLASS_VIDEO_DECODE;
			break;
		case I915_EXEC_RENDER:
			if (xe_has_engine_class(ibb->fd, DRM_XE_ENGINE_CLASS_RENDER))
				inst.engine_class = DRM_XE_ENGINE_CLASS_RENDER;
			else
				inst.engine_class = DRM_XE_ENGINE_CLASS_COMPUTE;
			break;
		case I915_EXEC_VEBOX:
			inst.engine_class = DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE;
			break;
		default:
			igt_assert_f(false, "Unknown engine: %x", (uint32_t) flags);
		}
		igt_debug("Run on %s\n", xe_engine_class_string(inst.engine_class));

		if (ibb->engine_id)
			xe_exec_queue_destroy(ibb->fd, ibb->engine_id);

		ibb->engine_id = engine_id =
			xe_exec_queue_create(ibb->fd, ibb->vm_id, &inst, 0);
	} else {
		engine_id = ibb->engine_id;
	}
	ibb->last_engine = engine;

	map = xe_bo_map(ibb->fd, ibb->handle, ibb->size);
	memcpy(map, ibb->batch, ibb->size);
	gem_munmap(map, ibb->size);

	syncs[0].handle = syncobj_create(ibb->fd, 0);
	if (ibb->num_objects > 1) {
		bind_ops = xe_alloc_bind_ops(ibb, DRM_XE_VM_BIND_OP_MAP, 0, 0);
		xe_vm_bind_array(ibb->fd, ibb->vm_id, 0, bind_ops,
				 ibb->num_objects, syncs, 1);
		free(bind_ops);
	} else {
		igt_debug("bind: MAP\n");
		igt_debug("  handle: %u, offset: %llx, size: %llx\n",
			  ibb->handle, (long long)ibb->batch_offset,
			  (long long)ibb->size);
		xe_vm_bind_async(ibb->fd, ibb->vm_id, 0, ibb->handle, 0,
				 ibb->batch_offset, ibb->size, syncs, 1);
	}
	ibb->xe_bound = true;

	syncs[0].flags &= ~DRM_XE_SYNC_FLAG_SIGNAL;
	ibb->engine_syncobj = syncobj_create(ibb->fd, 0);
	syncs[1].handle = ibb->engine_syncobj;

	xe_exec_sync(ibb->fd, engine_id, ibb->batch_offset, syncs, 2);

	if (sync)
		intel_bb_sync(ibb);

	return 0;
}

/*
 * __intel_bb_exec:
 * @ibb: pointer to intel_bb
 * @end_offset: offset of the last instruction in the bb
 * @flags: flags passed directly to execbuf
 * @sync: if true wait for execbuf completion, otherwise caller is responsible
 * to wait for completion
 *
 * Returns: 0 on success, otherwise errno.
 *
 * Note: In this step execobj for bb is allocated and inserted to the objects
 * array.
*/
int __intel_bb_exec(struct intel_bb *ibb, uint32_t end_offset,
			   uint64_t flags, bool sync)
{
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 *objects;
	int ret, fence, new_fence;

	ibb->objects[0]->relocs_ptr = to_user_pointer(ibb->relocs);
	ibb->objects[0]->relocation_count = ibb->num_relocs;
	ibb->objects[0]->handle = ibb->handle;
	ibb->objects[0]->offset = ibb->batch_offset;

	gem_write(ibb->fd, ibb->handle, 0, ibb->batch, ibb->size);

	memset(&execbuf, 0, sizeof(execbuf));
	objects = create_objects_array(ibb);
	execbuf.buffers_ptr = to_user_pointer(objects);
	execbuf.buffer_count = ibb->num_objects;
	execbuf.batch_len = end_offset;
	execbuf.rsvd1 = ibb->ctx;
	execbuf.flags = flags | I915_EXEC_BATCH_FIRST | I915_EXEC_FENCE_OUT;
	if (ibb->enforce_relocs)
		execbuf.flags &= ~I915_EXEC_NO_RELOC;
	execbuf.rsvd2 = 0;

	if (ibb->dump_base64)
		intel_bb_dump_base64(ibb, LINELEN);

	/* For debugging on CI, remove in final series */
	intel_bb_dump_execbuf(ibb, &execbuf);

	ret = __gem_execbuf_wr(ibb->fd, &execbuf);
	if (ret) {
		intel_bb_dump_execbuf(ibb, &execbuf);
		free(objects);
		return ret;
	}

	/* Update addresses in the cache */
	update_offsets(ibb, objects);

	/* Save/merge fences */
	fence = execbuf.rsvd2 >> 32;

	if (ibb->fence < 0) {
		ibb->fence = fence;
	} else {
		new_fence = sync_fence_merge(ibb->fence, fence);
		close(ibb->fence);
		close(fence);
		ibb->fence = new_fence;
	}

	if (sync || ibb->debug)
		igt_assert(intel_bb_sync(ibb) == 0);

	if (ibb->debug) {
		intel_bb_dump_execbuf(ibb, &execbuf);
		if (intel_bb_debug_tree) {
			igt_info("\nTree:\n");
			twalk(ibb->root, print_node);
		}
	}

	free(objects);

	return 0;
}

/**
 * intel_bb_exec:
 * @ibb: pointer to intel_bb
 * @end_offset: offset of the last instruction in the bb (for i915)
 * @flags: flags passed directly to execbuf
 * @sync: if true wait for execbuf completion, otherwise caller is responsible
 * to wait for completion
 *
 * Do execbuf on context selected during bb creation. Asserts on failure.
*/
void intel_bb_exec(struct intel_bb *ibb, uint32_t end_offset,
		   uint64_t flags, bool sync)
{
	if (ibb->dump_base64)
		intel_bb_dump_base64(ibb, LINELEN);

	if (ibb->driver == INTEL_DRIVER_I915)
		igt_assert_eq(__intel_bb_exec(ibb, end_offset, flags, sync), 0);
	else
		igt_assert_eq(__xe_bb_exec(ibb, flags, sync), 0);
}

/**
 * intel_bb_get_object_address:
 * @ibb: pointer to intel_bb
 * @handle: object handle
 *
 * When objects addresses are previously pinned and we don't want to relocate
 * we need to acquire them from previous execbuf. Function returns previous
 * object offset for @handle or 0 if object is not found.
 */
uint64_t intel_bb_get_object_offset(struct intel_bb *ibb, uint32_t handle)
{
	struct drm_i915_gem_exec_object2 object = { .handle = handle };
	struct drm_i915_gem_exec_object2 **found;

	igt_assert(ibb);

	found = tfind((void *)&object, &ibb->root, __compare_objects);
	if (!found)
		return INTEL_BUF_INVALID_ADDRESS;

	return (*found)->offset;
}

/*
 * intel_bb_emit_bbe:
 * @ibb: batchbuffer
 *
 * Outputs MI_BATCH_BUFFER_END and ensures batch is properly aligned.
 */
uint32_t intel_bb_emit_bbe(struct intel_bb *ibb)
{
	/* Mark the end of the buffer. */
	intel_bb_out(ibb, MI_BATCH_BUFFER_END);
	intel_bb_ptr_align(ibb, 8);

	return intel_bb_offset(ibb);
}

/*
 * intel_bb_emit_flush_common:
 * @ibb: batchbuffer
 *
 * Emits instructions which completes batch buffer.
 *
 * Returns: offset in batch buffer where there's end of instructions.
 */
uint32_t intel_bb_emit_flush_common(struct intel_bb *ibb)
{
	if (intel_bb_offset(ibb) == 0)
		return 0;

	if (ibb->gen == 5) {
		/*
		 * emit gen5 w/a without batch space checks - we reserve that
		 * already.
		 */
		intel_bb_out(ibb, CMD_POLY_STIPPLE_OFFSET << 16);
		intel_bb_out(ibb, 0);
	}

	/* Round batchbuffer usage to 2 DWORDs. */
	if ((intel_bb_offset(ibb) & 4) == 0)
		intel_bb_out(ibb, 0);

	intel_bb_emit_bbe(ibb);

	return intel_bb_offset(ibb);
}

static void intel_bb_exec_with_ring(struct intel_bb *ibb,uint32_t ring)
{
	intel_bb_exec(ibb, intel_bb_offset(ibb),
		      ring | I915_EXEC_NO_RELOC, false);
	intel_bb_reset(ibb, false);
}

/*
 * intel_bb_flush:
 * @ibb: batchbuffer
 * @ring: ring
 *
 * If batch is not empty emit batch buffer end, execute on ring,
 * then reset the batch.
 */
void intel_bb_flush(struct intel_bb *ibb, uint32_t ring)
{
	if (intel_bb_emit_flush_common(ibb) == 0)
		return;

	intel_bb_exec_with_ring(ibb, ring);
}

/*
 * intel_bb_flush_render:
 * @ibb: batchbuffer
 *
 * If batch is not empty emit batch buffer end, find the render engine id,
 * execute on the ring and reset the batch. Context used to execute
 * is batch context.
 */
void intel_bb_flush_render(struct intel_bb *ibb)
{
	uint32_t ring;

	if (intel_bb_emit_flush_common(ibb) == 0)
		return;

	if (has_ctx_cfg(ibb))
		ring = find_engine(ibb->cfg, I915_ENGINE_CLASS_RENDER);
	else
		ring = I915_EXEC_RENDER;

	intel_bb_exec_with_ring(ibb, ring);
}

/*
 * intel_bb_flush_blit:
 * @ibb: batchbuffer
 *
 * If batch is not empty emit batch buffer end, find a suitable ring
 * (depending on gen and context configuration) and reset the batch.
 * Context used to execute is batch context.
 */
void intel_bb_flush_blit(struct intel_bb *ibb)
{
	uint32_t ring;

	if (intel_bb_emit_flush_common(ibb) == 0)
		return;

	if (has_ctx_cfg(ibb))
		ring = find_engine(ibb->cfg, I915_ENGINE_CLASS_COPY);
	else
		ring = HAS_BLT_RING(ibb->devid) ? I915_EXEC_BLT : I915_EXEC_DEFAULT;

	intel_bb_exec_with_ring(ibb, ring);
}

/*
 * intel_bb_copy_data:
 * @ibb: batchbuffer
 * @data: pointer of data which should be copied into batch
 * @bytes: number of bytes to copy, must be dword multiplied
 * @align: alignment in the batch
 *
 * Function copies @bytes of data pointed by @data into batch buffer.
 */
uint32_t intel_bb_copy_data(struct intel_bb *ibb,
			    const void *data, unsigned int bytes,
			    uint32_t align)
{
	uint32_t *subdata, offset;

	igt_assert((bytes & 3) == 0);

	intel_bb_ptr_align(ibb, align);
	offset = intel_bb_offset(ibb);
	igt_assert(offset + bytes < ibb->size);

	subdata = intel_bb_ptr(ibb);
	memcpy(subdata, data, bytes);
	intel_bb_ptr_add(ibb, bytes);

	return offset;
}

/*
 * intel_bb_blit_start:
 * @ibb: batchbuffer
 * @flags: flags to blit command
 *
 * Function emits XY_SRC_COPY_BLT instruction with size appropriate size
 * which depend on gen.
 */
void intel_bb_blit_start(struct intel_bb *ibb, uint32_t flags)
{
	if (blt_has_xy_src_copy(ibb->fd))
		intel_bb_out(ibb, XY_SRC_COPY_BLT_CMD |
			     XY_SRC_COPY_BLT_WRITE_ALPHA |
			     XY_SRC_COPY_BLT_WRITE_RGB |
			     flags |
			     (6 + 2 * (ibb->gen >= 8)));
	else if (blt_has_fast_copy(ibb->fd))
		intel_bb_out(ibb, XY_FAST_COPY_BLT | flags);
	else
		igt_assert_f(0, "No supported blit command found\n");
}

/*
 * intel_bb_emit_blt_copy:
 * @ibb: batchbuffer
 * @src: source buffer (intel_buf)
 * @src_x1: source x1 position
 * @src_y1: source y1 position
 * @src_pitch: source pitch
 * @dst: destination buffer (intel_buf)
 * @dst_x1: destination x1 position
 * @dst_y1: destination y1 position
 * @dst_pitch: destination pitch
 * @width: width of data to copy
 * @height: height of data to copy
 *
 * Function emits complete blit command.
 */
void intel_bb_emit_blt_copy(struct intel_bb *ibb,
			    struct intel_buf *src,
			    int src_x1, int src_y1, int src_pitch,
			    struct intel_buf *dst,
			    int dst_x1, int dst_y1, int dst_pitch,
			    int width, int height, int bpp)
{
	const unsigned int gen = ibb->gen;
	uint32_t cmd_bits = 0;
	uint32_t br13_bits;
	uint32_t mask;

	igt_assert(bpp*(src_x1 + width) <= 8*src_pitch);
	igt_assert(bpp*(dst_x1 + width) <= 8*dst_pitch);
	igt_assert(src_pitch * (src_y1 + height) <= src->surface[0].size);
	igt_assert(dst_pitch * (dst_y1 + height) <= dst->surface[0].size);

	if (gen >= 4 && src->tiling != I915_TILING_NONE) {
		src_pitch /= 4;
		if (blt_has_xy_src_copy(ibb->fd))
			cmd_bits |= XY_SRC_COPY_BLT_SRC_TILED;
		else if (blt_has_fast_copy(ibb->fd))
			cmd_bits |= fast_copy_dword0(src->tiling, dst->tiling);
		else
			igt_assert_f(0, "No supported blit command found\n");
	}

	if (gen >= 4 && dst->tiling != I915_TILING_NONE) {
		dst_pitch /= 4;
		if (blt_has_xy_src_copy(ibb->fd))
			cmd_bits |= XY_SRC_COPY_BLT_DST_TILED;
		else
			cmd_bits |= fast_copy_dword0(src->tiling, dst->tiling);
	}

	CHECK_RANGE(src_x1); CHECK_RANGE(src_y1);
	CHECK_RANGE(dst_x1); CHECK_RANGE(dst_y1);
	CHECK_RANGE(width); CHECK_RANGE(height);
	CHECK_RANGE(src_x1 + width); CHECK_RANGE(src_y1 + height);
	CHECK_RANGE(dst_x1 + width); CHECK_RANGE(dst_y1 + height);
	CHECK_RANGE(src_pitch); CHECK_RANGE(dst_pitch);

	br13_bits = 0;
	if (blt_has_xy_src_copy(ibb->fd)) {
		switch (bpp) {
		case 8:
			break;
		case 16:		/* supporting only RGB565, not ARGB1555 */
			br13_bits |= 1 << 24;
			break;
		case 32:
			br13_bits |= 3 << 24;
			cmd_bits |= (XY_SRC_COPY_BLT_WRITE_ALPHA |
				     XY_SRC_COPY_BLT_WRITE_RGB);
			break;
		default:
			igt_assert_f(0, "Unsupported pixel depth\n");
		}
	} else {
		br13_bits = fast_copy_dword1(ibb->fd, src->tiling, dst->tiling, bpp);
	}

	if ((src->tiling | dst->tiling) >= I915_TILING_Y) {
		intel_bb_out(ibb, MI_LOAD_REGISTER_IMM(1));
		intel_bb_out(ibb, BCS_SWCTRL);

		mask = (BCS_SRC_Y | BCS_DST_Y) << 16;
		if (src->tiling == I915_TILING_Y)
			mask |= BCS_SRC_Y;
		if (dst->tiling == I915_TILING_Y)
			mask |= BCS_DST_Y;
		intel_bb_out(ibb, mask);
	}

	intel_bb_add_intel_buf(ibb, src, false);
	intel_bb_add_intel_buf(ibb, dst, true);

	intel_bb_blit_start(ibb, cmd_bits);
	intel_bb_out(ibb, (br13_bits) |
		     (0xcc << 16) | /* copy ROP */
		     dst_pitch);
	intel_bb_out(ibb, (dst_y1 << 16) | dst_x1); /* dst x1,y1 */
	intel_bb_out(ibb, ((dst_y1 + height) << 16) | (dst_x1 + width)); /* dst x2,y2 */
	intel_bb_emit_reloc_fenced(ibb, dst->handle,
				   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
				   0, dst->addr.offset);
	intel_bb_out(ibb, (src_y1 << 16) | src_x1); /* src x1,y1 */
	intel_bb_out(ibb, src_pitch);
	intel_bb_emit_reloc_fenced(ibb, src->handle,
				   I915_GEM_DOMAIN_RENDER, 0,
				   0, src->addr.offset);

	if (gen >= 6 && src->handle == dst->handle) {
		intel_bb_out(ibb, XY_SETUP_CLIP_BLT_CMD);
		intel_bb_out(ibb, 0);
		intel_bb_out(ibb, 0);
	}

	if ((src->tiling | dst->tiling) >= I915_TILING_Y) {
		igt_assert(ibb->gen >= 6);
		intel_bb_out(ibb, MI_FLUSH_DW_CMD | 2);
		intel_bb_out(ibb, 0);
		intel_bb_out(ibb, 0);
		intel_bb_out(ibb, 0);

		intel_bb_out(ibb, MI_LOAD_REGISTER_IMM(1));
		intel_bb_out(ibb, BCS_SWCTRL);
		intel_bb_out(ibb, (BCS_SRC_Y | BCS_DST_Y) << 16);
	}
}

void intel_bb_blt_copy(struct intel_bb *ibb,
		       struct intel_buf *src,
		       int src_x1, int src_y1, int src_pitch,
		       struct intel_buf *dst,
		       int dst_x1, int dst_y1, int dst_pitch,
		       int width, int height, int bpp)
{
	intel_bb_emit_blt_copy(ibb, src, src_x1, src_y1, src_pitch,
			       dst, dst_x1, dst_y1, dst_pitch,
			       width, height, bpp);
	intel_bb_flush_blit(ibb);
}

/**
 * intel_bb_copy_intel_buf:
 * @batch: batchbuffer object
 * @src: source buffer (intel_buf)
 * @dst: destination libdrm buffer object
 * @size: size of the copy range in bytes
 *
 * Emits a copy operation using blitter commands into the supplied batch.
 * A total of @size bytes from the start of @src is copied
 * over to @dst. Note that @size must be page-aligned.
 */
void intel_bb_copy_intel_buf(struct intel_bb *ibb,
			     struct intel_buf *src, struct intel_buf *dst,
			     long int size)
{
	igt_assert(size % 4096 == 0);

	intel_bb_blt_copy(ibb,
			  src, 0, 0, 4096,
			  dst, 0, 0, 4096,
			  4096/4, size/4096, 32);
}

/**
 * igt_get_huc_copyfunc:
 * @devid: pci device id
 *
 * Returns:
 *
 * The platform-specific huc copy function pointer for the device specified
 * with @devid. Will return NULL when no media spin function is implemented.
 */
igt_huc_copyfunc_t igt_get_huc_copyfunc(int devid)
{
	igt_huc_copyfunc_t copy = NULL;

	if (IS_GEN12(devid) || IS_GEN11(devid) || IS_GEN9(devid))
		copy = gen9_huc_copyfunc;

	return copy;
}

/**
 * intel_bb_track:
 * @do_tracking: bool
 *
 * Turn on (true) or off (false) tracking for intel_batchbuffers.
 */
void intel_bb_track(bool do_tracking)
{
	if (intel_bb_do_tracking == do_tracking)
		return;

	if (intel_bb_do_tracking) {
		struct intel_bb *entry, *tmp;

		pthread_mutex_lock(&intel_bb_list_lock);
		igt_list_for_each_entry_safe(entry, tmp, &intel_bb_list, link)
			igt_list_del(&entry->link);
		pthread_mutex_unlock(&intel_bb_list_lock);
	}

	intel_bb_do_tracking = do_tracking;
}

static void __intel_bb_reinit_alloc(struct intel_bb *ibb)
{
	if (ibb->allocator_type == INTEL_ALLOCATOR_NONE)
		return;

	ibb->allocator_handle = intel_allocator_open_full(ibb->fd, ibb->ctx,
							  ibb->allocator_start, ibb->allocator_end,
							  ibb->allocator_type,
							  ibb->allocator_strategy,
							  ibb->alignment);

	intel_bb_reset(ibb, true);
}

/**
 * intel_bb_reinit_allocator:
 *
 * Reinit allocator and get offsets in tracked intel_batchbuffers.
 */
void intel_bb_reinit_allocator(void)
{
	struct intel_bb *iter;

	if (!intel_bb_do_tracking)
		return;

	pthread_mutex_lock(&intel_bb_list_lock);
	igt_list_for_each_entry(iter, &intel_bb_list, link)
		__intel_bb_reinit_alloc(iter);
	pthread_mutex_unlock(&intel_bb_list_lock);
}