Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
38 files changed, 8232 insertions, 1896 deletions
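The cik_event_interrupt.c and cik_int.h hunks below drop the hand-rolled mask/shift decode of the IH ring entry (the PASID pulled out of the raw ring_id dword) in favour of a bitfield layout in struct cik_ih_ring_entry. As a rough standalone sketch, not part of the patch: the struct name cik_ih_dword2 and the example value are hypothetical, the field widths are copied from the patched header, and the equivalence only holds for the little-endian bitfield layout GCC uses on x86-64 (the only architecture the driver builds for per its Kconfig).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Hypothetical layout of dword 2 of a CIK IH ring entry, using the field
 * widths from the patched cik_int.h (the real driver declares all four
 * dwords in struct cik_ih_ring_entry).
 */
struct cik_ih_dword2 {
	uint32_t pipeid:2;
	uint32_t meid:2;
	uint32_t reserved:4;
	uint32_t vmid:8;
	uint32_t pasid:16;
};

int main(void)
{
	/* made-up raw dword: pasid = 0x2a, vmid = 0x03, meid = 1, pipeid = 1 */
	uint32_t ring_id = 0x002a0305;
	struct cik_ih_dword2 d;

	/* pre-patch style: extract the PASID with an explicit mask and shift */
	unsigned int pasid_old = (ring_id & 0xffff0000) >> 16;

	/* post-patch style: overlay the bitfield struct and read the fields */
	memcpy(&d, &ring_id, sizeof(d));

	printf("pasid %#x vs %#x, vmid %#x\n",
	       pasid_old, (unsigned int)d.pasid, (unsigned int)d.vmid);
	return 0;
}

Either way the decode is the same; the bitfield form just moves the shifting into the compiler and lets the interrupt handlers below test ihre->vmid and ihre->pasid directly.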
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig
index e13c67c8d2c0..ac495328dae3 100644
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@@ -5,5 +5,6 @@ config HSA_AMD
 	tristate "HSA kernel driver for AMD GPU devices"
 	depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64
+	select DRM_AMDGPU_USERPTR
 	help
 	  Enable this if you want to use HSA features on AMD GPU devices.
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 7fc9b0f444cb..c8fa422585ec 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -14,6 +14,6 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
 		kfd_process_queue_manager.o kfd_device_queue_manager.o \
 		kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \
 		kfd_interrupt.o kfd_events.o cik_event_interrupt.o \
-		kfd_dbgdev.o kfd_dbgmgr.o
+		kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o
 
 obj-$(CONFIG_HSA_AMD)	+= amdkfd.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 211fc48697fa..02a908249023 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,40 +24,59 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 
-static bool cik_event_interrupt_isr(struct kfd_dev *dev,
+static bool is_cpc_vm_fault(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry)
 {
-	unsigned int pasid;
 	const struct cik_ih_ring_entry *ihre =
 			(const struct cik_ih_ring_entry *)ih_ring_entry;
 
-	pasid = (ihre->ring_id & 0xffff0000) >> 16;
+	if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+	     ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) &&
+	    ihre->vmid >= dev->vm_info.first_vmid_kfd &&
+	    ihre->vmid <= dev->vm_info.last_vmid_kfd)
+		return true;
+	return false;
+}
+
+static bool cik_event_interrupt_isr(struct kfd_dev *dev,
+					const uint32_t *ih_ring_entry)
+{
+	const struct cik_ih_ring_entry *ihre =
+			(const struct cik_ih_ring_entry *)ih_ring_entry;
 
 	/* Do not process in ISR, just request it to be forwarded to WQ.
 	 */
-	return (pasid != 0) &&
+	return (ihre->pasid != 0) &&
 		(ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
 		ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
-		ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE);
+		ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
+		is_cpc_vm_fault(dev, ih_ring_entry));
 }
 
 static void cik_event_interrupt_wq(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry)
 {
-	unsigned int pasid;
 	const struct cik_ih_ring_entry *ihre =
 			(const struct cik_ih_ring_entry *)ih_ring_entry;
 
-	pasid = (ihre->ring_id & 0xffff0000) >> 16;
-
-	if (pasid == 0)
+	if (ihre->pasid == 0)
 		return;
 
 	if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE)
-		kfd_signal_event_interrupt(pasid, 0, 0);
+		kfd_signal_event_interrupt(ihre->pasid, 0, 0);
 	else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG)
-		kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8);
+		kfd_signal_event_interrupt(ihre->pasid, ihre->data & 0xFF, 8);
 	else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
-		kfd_signal_hw_exception_event(pasid);
+		kfd_signal_hw_exception_event(ihre->pasid);
+	else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
+		struct kfd_vm_fault_info info;
+
+		dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
+		kfd_process_vm_fault(dev->dqm, ihre->pasid);
+		if (info.vmid == ihre->vmid)
+			kfd_signal_vm_fault_event(dev, ihre->pasid, &info);
+		else
+			kfd_signal_vm_fault_event(dev, ihre->pasid, NULL);
+	}
 }
 
 const struct kfd_event_interrupt_class event_interrupt_class_cik = {
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h
index 79a16d24c1b8..feb3c2428d53 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_int.h
+++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h
@@ -26,16 +26,30 @@
 #include <linux/types.h>
 
 struct cik_ih_ring_entry {
-	uint32_t source_id;
-	uint32_t data;
-	uint32_t ring_id;
-	uint32_t reserved;
+	uint32_t source_id:8;
+	uint32_t reserved1:8;
+	uint32_t reserved2:16;
+
+	uint32_t data:28;
+	uint32_t reserved3:4;
+
+	/* pipeid, meid and unused3 are officially called RINGID,
+	 * but for our purposes, they always decode into pipe and ME.
*/ + uint32_t pipeid:2; + uint32_t meid:2; + uint32_t reserved4:4; + uint32_t vmid:8; + uint32_t pasid:16; + + uint32_t reserved5; }; #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF +#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 +#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 #endif diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h index 48769d12dd7b..607fc5ceadbe 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h +++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h @@ -23,11 +23,33 @@ #ifndef CIK_REGS_H #define CIK_REGS_H +#define IH_VMID_0_LUT 0x3D40u + +#define BIF_DOORBELL_CNTL 0x530Cu + +#define SRBM_GFX_CNTL 0xE44 +#define PIPEID(x) ((x) << 0) +#define MEID(x) ((x) << 2) +#define VMID(x) ((x) << 4) +#define QUEUEID(x) ((x) << 8) + +#define SQ_CONFIG 0x8C00 + +#define SH_MEM_BASES 0x8C28 /* if PTR32, these are the bases for scratch and lds */ #define PRIVATE_BASE(x) ((x) << 0) /* scratch */ #define SHARED_BASE(x) ((x) << 16) /* LDS */ +#define SH_MEM_APE1_BASE 0x8C2C +/* if PTR32, this is the base location of GPUVM */ +#define SH_MEM_APE1_LIMIT 0x8C30 +/* if PTR32, this is the upper limit of GPUVM */ +#define SH_MEM_CONFIG 0x8C34 #define PTR32 (1 << 0) +#define PRIVATE_ATC (1 << 1) #define ALIGNMENT_MODE(x) ((x) << 2) +#define SH_MEM_ALIGNMENT_MODE_DWORD 0 +#define SH_MEM_ALIGNMENT_MODE_DWORD_STRICT 1 +#define SH_MEM_ALIGNMENT_MODE_STRICT 2 #define SH_MEM_ALIGNMENT_MODE_UNALIGNED 3 #define DEFAULT_MTYPE(x) ((x) << 4) #define APE1_MTYPE(x) ((x) << 7) @@ -36,37 +58,164 @@ #define MTYPE_CACHED 0 #define MTYPE_NONCACHED 3 + +#define SH_STATIC_MEM_CONFIG 0x9604u + +#define TC_CFG_L1_LOAD_POLICY0 0xAC68 +#define TC_CFG_L1_LOAD_POLICY1 0xAC6C +#define TC_CFG_L1_STORE_POLICY 0xAC70 +#define TC_CFG_L2_LOAD_POLICY0 0xAC74 +#define TC_CFG_L2_LOAD_POLICY1 0xAC78 +#define TC_CFG_L2_STORE_POLICY0 0xAC7C +#define TC_CFG_L2_STORE_POLICY1 0xAC80 +#define TC_CFG_L2_ATOMIC_POLICY 0xAC84 +#define TC_CFG_L1_VOLATILE 0xAC88 +#define TC_CFG_L2_VOLATILE 0xAC8C + +#define CP_PQ_WPTR_POLL_CNTL 0xC20C +#define WPTR_POLL_EN (1 << 31) + +#define CPC_INT_CNTL 0xC2D0 +#define CP_ME1_PIPE0_INT_CNTL 0xC214 +#define CP_ME1_PIPE1_INT_CNTL 0xC218 +#define CP_ME1_PIPE2_INT_CNTL 0xC21C +#define CP_ME1_PIPE3_INT_CNTL 0xC220 +#define CP_ME2_PIPE0_INT_CNTL 0xC224 +#define CP_ME2_PIPE1_INT_CNTL 0xC228 +#define CP_ME2_PIPE2_INT_CNTL 0xC22C +#define CP_ME2_PIPE3_INT_CNTL 0xC230 +#define DEQUEUE_REQUEST_INT_ENABLE (1 << 13) +#define WRM_POLL_TIMEOUT_INT_ENABLE (1 << 17) +#define PRIV_REG_INT_ENABLE (1 << 23) +#define TIME_STAMP_INT_ENABLE (1 << 26) +#define GENERIC2_INT_ENABLE (1 << 29) +#define GENERIC1_INT_ENABLE (1 << 30) +#define GENERIC0_INT_ENABLE (1 << 31) +#define CP_ME1_PIPE0_INT_STATUS 0xC214 +#define CP_ME1_PIPE1_INT_STATUS 0xC218 +#define CP_ME1_PIPE2_INT_STATUS 0xC21C +#define CP_ME1_PIPE3_INT_STATUS 0xC220 +#define CP_ME2_PIPE0_INT_STATUS 0xC224 +#define CP_ME2_PIPE1_INT_STATUS 0xC228 +#define CP_ME2_PIPE2_INT_STATUS 0xC22C +#define CP_ME2_PIPE3_INT_STATUS 0xC230 +#define DEQUEUE_REQUEST_INT_STATUS (1 << 13) +#define WRM_POLL_TIMEOUT_INT_STATUS (1 << 17) +#define PRIV_REG_INT_STATUS (1 << 23) +#define TIME_STAMP_INT_STATUS (1 << 26) +#define GENERIC2_INT_STATUS (1 << 29) +#define GENERIC1_INT_STATUS (1 << 30) +#define GENERIC0_INT_STATUS (1 << 31) + +#define CP_HPD_EOP_BASE_ADDR 0xC904 +#define CP_HPD_EOP_BASE_ADDR_HI 0xC908 +#define CP_HPD_EOP_VMID 0xC90C +#define 
CP_HPD_EOP_CONTROL 0xC910 +#define EOP_SIZE(x) ((x) << 0) +#define EOP_SIZE_MASK (0x3f << 0) +#define CP_MQD_BASE_ADDR 0xC914 +#define CP_MQD_BASE_ADDR_HI 0xC918 +#define CP_HQD_ACTIVE 0xC91C +#define CP_HQD_VMID 0xC920 + +#define CP_HQD_PERSISTENT_STATE 0xC924u #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) #define PRELOAD_REQ (1 << 0) -#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) - -#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) - -#define IB_ATC_EN (1U << 23) - +#define CP_HQD_PIPE_PRIORITY 0xC928u +#define CP_HQD_QUEUE_PRIORITY 0xC92Cu +#define CP_HQD_QUANTUM 0xC930u #define QUANTUM_EN 1U #define QUANTUM_SCALE_1MS (1U << 4) #define QUANTUM_DURATION(x) ((x) << 8) +#define CP_HQD_PQ_BASE 0xC934 +#define CP_HQD_PQ_BASE_HI 0xC938 +#define CP_HQD_PQ_RPTR 0xC93C +#define CP_HQD_PQ_RPTR_REPORT_ADDR 0xC940 +#define CP_HQD_PQ_RPTR_REPORT_ADDR_HI 0xC944 +#define CP_HQD_PQ_WPTR_POLL_ADDR 0xC948 +#define CP_HQD_PQ_WPTR_POLL_ADDR_HI 0xC94C +#define CP_HQD_PQ_DOORBELL_CONTROL 0xC950 +#define DOORBELL_OFFSET(x) ((x) << 2) +#define DOORBELL_OFFSET_MASK (0x1fffff << 2) +#define DOORBELL_SOURCE (1 << 28) +#define DOORBELL_SCHD_HIT (1 << 29) +#define DOORBELL_EN (1 << 30) +#define DOORBELL_HIT (1 << 31) +#define CP_HQD_PQ_WPTR 0xC954 +#define CP_HQD_PQ_CONTROL 0xC958 +#define QUEUE_SIZE(x) ((x) << 0) +#define QUEUE_SIZE_MASK (0x3f << 0) #define RPTR_BLOCK_SIZE(x) ((x) << 8) +#define RPTR_BLOCK_SIZE_MASK (0x3f << 8) #define MIN_AVAIL_SIZE(x) ((x) << 20) +#define PQ_ATC_EN (1 << 23) +#define PQ_VOLATILE (1 << 26) +#define NO_UPDATE_RPTR (1 << 27) +#define UNORD_DISPATCH (1 << 28) +#define ROQ_PQ_IB_FLIP (1 << 29) +#define PRIV_STATE (1 << 30) +#define KMD_QUEUE (1 << 31) + #define DEFAULT_RPTR_BLOCK_SIZE RPTR_BLOCK_SIZE(5) #define DEFAULT_MIN_AVAIL_SIZE MIN_AVAIL_SIZE(3) -#define PQ_ATC_EN (1 << 23) -#define NO_UPDATE_RPTR (1 << 27) +#define CP_HQD_IB_BASE_ADDR 0xC95Cu +#define CP_HQD_IB_BASE_ADDR_HI 0xC960u +#define CP_HQD_IB_RPTR 0xC964u +#define CP_HQD_IB_CONTROL 0xC968u +#define IB_ATC_EN (1U << 23) +#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) -#define DOORBELL_OFFSET(x) ((x) << 2) -#define DOORBELL_EN (1 << 30) +#define CP_HQD_DEQUEUE_REQUEST 0xC974 +#define DEQUEUE_REQUEST_DRAIN 1 +#define DEQUEUE_REQUEST_RESET 2 +#define DEQUEUE_INT (1U << 8) -#define PRIV_STATE (1 << 30) -#define KMD_QUEUE (1 << 31) +#define CP_HQD_SEMA_CMD 0xC97Cu +#define CP_HQD_MSG_TYPE 0xC980u +#define CP_HQD_ATOMIC0_PREOP_LO 0xC984u +#define CP_HQD_ATOMIC0_PREOP_HI 0xC988u +#define CP_HQD_ATOMIC1_PREOP_LO 0xC98Cu +#define CP_HQD_ATOMIC1_PREOP_HI 0xC990u +#define CP_HQD_HQ_SCHEDULER0 0xC994u +#define CP_HQD_HQ_SCHEDULER1 0xC998u -#define AQL_ENABLE 1 + +#define CP_MQD_CONTROL 0xC99C +#define MQD_VMID(x) ((x) << 0) +#define MQD_VMID_MASK (0xf << 0) +#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) #define GRBM_GFX_INDEX 0x30800 +#define INSTANCE_INDEX(x) ((x) << 0) +#define SH_INDEX(x) ((x) << 8) +#define SE_INDEX(x) ((x) << 16) +#define SH_BROADCAST_WRITES (1 << 29) +#define INSTANCE_BROADCAST_WRITES (1 << 30) +#define SE_BROADCAST_WRITES (1 << 31) +#define SQC_CACHES 0x30d20 +#define SQC_POLICY 0x8C38u +#define SQC_VOLATILE 0x8C3Cu + +#define CP_PERFMON_CNTL 0x36020 + +#define ATC_VMID0_PASID_MAPPING 0x339Cu +#define ATC_VMID_PASID_MAPPING_UPDATE_STATUS 0x3398u #define ATC_VMID_PASID_MAPPING_VALID (1U << 31) +#define ATC_VM_APERTURE0_CNTL 0x3310u +#define ATS_ACCESS_MODE_NEVER 0 +#define ATS_ACCESS_MODE_ALWAYS 1 + +#define ATC_VM_APERTURE0_CNTL2 0x3318u +#define ATC_VM_APERTURE0_HIGH_ADDR 0x3308u +#define 
ATC_VM_APERTURE0_LOW_ADDR 0x3300u +#define ATC_VM_APERTURE1_CNTL 0x3314u +#define ATC_VM_APERTURE1_CNTL2 0x331Cu +#define ATC_VM_APERTURE1_HIGH_ADDR 0x330Cu +#define ATC_VM_APERTURE1_LOW_ADDR 0x3304u + #endif diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h new file mode 100644 index 000000000000..1880dc0b0fcb --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h @@ -0,0 +1,1377 @@ +/* + * Copyright 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#if 0 + HW (CARRIZO) source code for CWSR trap handler + +var G8SR_WDMEM_HWREG_OFFSET = 0 +var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes + +// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. 
+ +var G8SR_DEBUG_TIMESTAMP = 0 +var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset +var s_g8sr_ts_save_s = s[34:35] // save start +var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi +var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ +var s_g8sr_ts_save_d = s[40:41] // save end +var s_g8sr_ts_restore_s = s[42:43] // restore start +var s_g8sr_ts_restore_d = s[44:45] // restore end + +var G8SR_VGPR_SR_IN_DWX4 = 0 +var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes +var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 + + +/*************************************************************************/ +/* control on how to run the shader */ +/*************************************************************************/ +//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) +var EMU_RUN_HACK = 0 +var EMU_RUN_HACK_RESTORE_NORMAL = 0 +var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 +var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var SAVE_LDS = 1 +var WG_BASE_ADDR_LO = 0x9000a000 +var WG_BASE_ADDR_HI = 0x0 +var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem +var CTX_SAVE_CONTROL = 0x0 +var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) +var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes +var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing + +/**************************************************************************/ +/* variables */ +/**************************************************************************/ +var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 +var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits + +var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask +var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 +var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 + +var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME +var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME + 
+var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 + + +/* Save */ +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE + +var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi + + //tba_lo and tba_hi need to be saved/restored +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_status = ttmp4 +var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_save_xnack_mask_lo = ttmp6 +var s_save_xnack_mask_hi = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 + +var s_save_mem_offset = tma_lo +var s_save_alloc_size = s_save_trapsts //conflict +var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) +var s_save_m0 = tma_hi + +/* Restore */ +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT +var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi + +var s_restore_mem_offset = ttmp2 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored +var s_restore_mem_offset_save = s_restore_tmp //no conflict + +var s_restore_m0 = s_restore_alloc_size //no conflict + +var s_restore_mode = ttmp7 + +var s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = tma_lo //no conflict +var s_restore_exec_hi = tma_hi //no conflict +var s_restore_status = ttmp4 +var s_restore_trapsts = ttmp5 +var s_restore_xnack_mask_lo = xnack_mask_lo +var s_restore_xnack_mask_hi = xnack_mask_hi +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 + +/**************************************************************************/ +/* trap handler entry points */ +/**************************************************************************/ +/* Shader Main*/ + +shader main + 
asic(CARRIZO) + type(CS) + + + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore + //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC + s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. + s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE + //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE + s_branch L_SKIP_RESTORE //NOT restore, SAVE actually + else + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + end + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE //restore + +L_SKIP_RESTORE: + + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_cbranch_scc1 L_SAVE //this is the operation for save + + // ********* Handle non-CWSR traps ******************* +if (!EMU_RUN_HACK) + /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ + s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 + s_waitcnt lgkmcnt(0) + s_or_b32 ttmp7, ttmp8, ttmp9 + s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set + s_mov_b32 tma_lo, ttmp10 //set tma_lo/hi for next level trap handler + s_mov_b32 tma_hi, ttmp11 + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) + s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler + +L_NO_NEXT_TRAP: + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. + s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 + s_addc_u32 ttmp1, ttmp1, 0 +L_EXCP_CASE: + s_and_b32 ttmp1, ttmp1, 0xFFFF + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) + s_rfe_b64 [ttmp0, ttmp1] +end + // ********* End handling of non-CWSR traps ******************* + +/**************************************************************************/ +/* save routine */ +/**************************************************************************/ + +L_SAVE: + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 
+end + + //check whether there is mem_viol + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_NO_PC_REWIND + + //if so, need rewind PC assuming GDS operation gets NACKed + s_mov_b32 s_save_tmp, 0 //clear mem_viol bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 + s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc + +L_NO_PC_REWIND: + s_mov_b32 s_save_tmp, 0 //clear saveCtx bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + + s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK + s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG + + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_sq_save_msg + s_waitcnt lgkmcnt(0) +end + + if (EMU_RUN_HACK) + + else + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC + end + + L_SLEEP: + s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 + + if (EMU_RUN_HACK) + + else + s_cbranch_execz L_SLEEP + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_spi_wrexec + s_waitcnt lgkmcnt(0) +end + + /* setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_save_tmp, v9, 0 + s_lshr_b32 s_save_tmp, s_save_tmp, 6 + s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + + + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + 
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE + + //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) + s_mov_b32 s_save_m0, m0 //save M0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 + + + + + /* save HW registers */ + ////////////////////////////// + + L_SAVE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + + + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO + s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI + end + + write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC + write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC + write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) + // Save the tma_lo and tma_hi content from exec_lo and ttmp5 + s_mov_b32 s_save_exec_lo, exec_lo + s_mov_b32 s_save_exec_hi, ttmp5 + write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS + + write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO + write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO + write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI + write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //TMA_LO + write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) //TMA_HI + + /* the first wave in the threadgroup */ + // save fist_wave bits in tba_hi unused bit.26 + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit + //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] + s_mov_b32 s_save_exec_hi, 0x0 + s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] + + + /* save SGPRs */ + // Save 
SGPR before LDS save, then the s0 to s4 can be used during LDS save... + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 + //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 + s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 + s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + L_SAVE_SGPR_LOOP: + // SGPR is allocated in 16 SGPR granularity + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] + s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] + + write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 16 //next sgpr index + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? + // restore s_save_buf_rsrc0,1 + //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo + s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo + + + + + /* save first 4 VGPR, then LDS save could use */ + // each wave will alloc 4 vgprs at least... + ///////////////////////////////////////////////////////////////////////////////////// + + s_mov_b32 s_save_mem_offset, 0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +end + + + + /* save LDS */ + ////////////////////////////// + + L_SAVE_LDS: + + // Change EXEC to all threads... + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE + + s_barrier //LDS is used? wait for other waves in the same TG + //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_cbranch_scc0 L_SAVE_LDS_DONE + + // first wave do LDS save; + + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() + + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + +var LDS_DMA_ENABLE = 0 +var UNROLL = 0 +if UNROLL==0 && LDS_DMA_ENABLE==1 + s_mov_b32 s3, 256*2 + s_nop 0 + s_nop 0 + s_nop 0 + L_SAVE_LDS_LOOP: + //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? + if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + end + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? 
+ +elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss + // store from higest LDS address to lowest + s_mov_b32 s3, 256*2 + s_sub_u32 m0, s_save_alloc_size, s3 + s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 + s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... + s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest + s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction + s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc + s_nop 0 + s_nop 0 + s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes + s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved + s_add_u32 s0, s0,s_save_alloc_size + s_addc_u32 s1, s1, 0 + s_setpc_b64 s[0:1] + + + for var i =0; i< 128; i++ + // be careful to make here a 64Byte aligned address, which could improve performance... + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + + if i!=127 + s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline + s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 + end + end + +else // BUFFER_STORE + v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 + v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid + v_mul_i32_i24 v2, v3, 8 // tid*8 + v_mov_b32 v3, 256*2 + s_mov_b32 m0, 0x10000 + s_mov_b32 s0, s_save_buf_rsrc3 + s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT + +L_SAVE_LDS_LOOP_VECTOR: + ds_read_b64 v[0:1], v2 //x =LDS[a], byte address + s_waitcnt lgkmcnt(0) + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 +// s_waitcnt vmcnt(0) + v_add_u32 v2, vcc[0:1], v2, v3 + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size + s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR + + // restore rsrc3 + s_mov_b32 s_save_buf_rsrc3, s0 + +end + +L_SAVE_LDS_DONE: + + + /* save VGPRs - set the Rest VGPRs */ + ////////////////////////////////////////////////////////////////////////////////////// + L_SAVE_VGPR: + // VGPR SR memory offset: 0 + // TODO rearrange the RSRC words to use swizzle for VGPR save... + + s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, 4 // skip first 4 VGPRs + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs + + s_set_gpr_idx_on m0, 0x1 // This will change M0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 +L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 // v0 = v[0+m0] + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 m0, m0, 4 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +L_SAVE_VGPR_LOOP_END: + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =0 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_END + + + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later + + L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 //v0 = v[0+m0] + v_mov_b32 v1, v1 //v0 = v[0+m0] + v_mov_b32 v2, v2 //v0 = v[0+m0] + v_mov_b32 v3, v3 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + end + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +end + +L_SAVE_VGPR_END: + + + + + + + /* S_PGM_END_SAVED */ //FIXME graphics ONLY + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_rfe_b64 s_save_pc_lo //Return to the main shader program + else + end + +// Save Done timestamp +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_d + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // Need reset rsrc2?? 
+ s_mov_b32 m0, s_save_mem_offset + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 +end + + + s_branch L_END_PGM + + + +/**************************************************************************/ +/* restore routine */ +/**************************************************************************/ + +L_RESTORE: + /* Setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_restore_tmp, v9, 0 + s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 + s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE + s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL + else + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... + s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] + s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. +end + + + + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE + + /* global mem offset */ +// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 + + /* the first wave in the threadgroup */ + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_RESTORE_VGPR + + /* restore LDS */ + ////////////////////////////// + L_RESTORE_LDS: + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_RESTORE_VGPR //no lds used? 
jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? + + + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + L_RESTORE_LDS_LOOP: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW + end + s_add_u32 m0, m0, 256*2 // 128 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? + + + /* restore VGPRs */ + ////////////////////////////// + L_RESTORE_VGPR: + // VGPR SR memory offset : 0 + s_mov_b32 s_restore_mem_offset, 0x0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + +if G8SR_VGPR_SR_IN_DWX4 + get_vgpr_size_bytes(s_restore_mem_offset) + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, s_restore_alloc_size + s_set_gpr_idx_on m0, 0x8 // Note.. 
This will change m0 + +L_RESTORE_VGPR_LOOP: + buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + s_waitcnt vmcnt(0) + s_sub_u32 m0, m0, 4 + v_mov_b32 v0, v0 // v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_cmp_eq_u32 m0, 0x8000 + s_cbranch_scc0 L_RESTORE_VGPR_LOOP + s_set_gpr_idx_off + + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes + +else + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_mov_b32 m0, 4 //VGPR initial index value = 1 + s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later + + L_RESTORE_VGPR_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 + end + s_waitcnt vmcnt(0) //ensure data ready + v_mov_b32 v0, v0 //v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
+ s_set_gpr_idx_off + /* VGPR restore on v0 */ + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 + end + +end + + /* restore SGPRs */ + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111), + However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG + */ + s_mov_b32 m0, s_restore_alloc_size + + L_RESTORE_SGPR_LOOP: + read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made + s_waitcnt lgkmcnt(0) //ensure data ready + + s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + s_movreld_b64 s4, s4 + s_movreld_b64 s6, s6 + s_movreld_b64 s8, s8 + s_movreld_b64 s10, s10 + s_movreld_b64 s12, s12 + s_movreld_b64 s14, s14 + + s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? + + /* restore HW registers */ + ////////////////////////////// + L_RESTORE_HWREG: + + +if G8SR_DEBUG_TIMESTAMP + s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo + s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi +end + + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + + + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS + read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS + read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO + read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE + read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO + read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI + + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + read_hwreg_from_mem(tma_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //tma_lo + read_hwreg_from_mem(tma_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //tma_hi + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode + //reuse s_restore_m0 as a temp register + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT + s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT 
+ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp + + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu + + s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_d + s_waitcnt lgkmcnt(0) +end + +// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc + + +/**************************************************************************/ +/* the END */ +/**************************************************************************/ +L_END_PGM: + s_endpgm + +end + + +/**************************************************************************/ +/* the helper functions */ +/**************************************************************************/ + +//Only for save hwreg to mem +function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:0 + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo +end + +//Only for save hwreg to mem +function write_tma_to_mem(s, s_rsrc, offset_imm) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, offset_imm + s_buffer_store_dword s, s_rsrc, m0 glc:0 + s_mov_b32 m0, exec_lo +end + +// HWREG are saved before SGPRs, so all HWREG could be use. 
+function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) + + s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:0 + s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:0 + s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:0 + s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:0 + s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 + s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc +end + + +function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 +end + +function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 +end + + + +function get_lds_size_bytes(s_lds_size_byte) + // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW + s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size + s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW +end + +function get_vgpr_size_bytes(s_vgpr_size_byte) + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible +end + +function get_sgpr_size_bytes(s_sgpr_size_byte) + s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 + s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) +end + +function get_hwreg_size_bytes + return 128 //HWREG size 128 bytes +end + +#endif + +static const uint32_t cwsr_trap_carrizo_hex[] = { + 0xbf820001, 0xbf820131, + 0xb8f4f802, 0xb8f5f803, + 0x8675ff75, 0x00000400, + 0xbf850013, 0xc00a1e37, + 0x00000000, 0xbf8c007f, + 0x87777978, 0xbf840004, + 0xbeee007a, 0xbeef007b, + 0xb974f802, 0xbe801d78, + 0xb8f5f803, 0x8675ff75, + 0x000001ff, 0xbf850002, + 0x80708470, 0x82718071, + 0x8671ff71, 0x0000ffff, + 0xb974f802, 0xbe801f70, + 0xb8f5f803, 0x8675ff75, + 0x00000100, 0xbf840006, + 0xbefa0080, 0xb97a0203, + 0x8671ff71, 0x0000ffff, + 0x80f08870, 0x82f18071, + 0xbefa0080, 0xb97a0283, + 0xbef60068, 0xbef70069, + 0xb8fa1c07, 0x8e7a9c7a, + 0x87717a71, 0xb8fa03c7, + 0x8e7a9b7a, 0x87717a71, + 0xb8faf807, 0x867aff7a, + 0x00007fff, 0xb97af807, + 0xbef2007e, 0xbef3007f, + 0xbefe0180, 0xbf900004, + 0xbf8e0002, 0xbf88fffe, + 0xbef8007e, 0x8679ff7f, + 0x0000ffff, 0x8779ff79, + 0x00040000, 0xbefa0080, + 0xbefb00ff, 0x00807fac, + 0x867aff7f, 0x08000000, + 0x8f7a837a, 0x877b7a7b, + 0x867aff7f, 0x70000000, + 0x8f7a817a, 0x877b7a7b, + 0xbeef007c, 0xbeee0080, + 0xb8ee2a05, 0x806e816e, + 0x8e6e8a6e, 0xb8fa1605, + 0x807a817a, 0x8e7a867a, + 0x806e7a6e, 0xbefa0084, + 0xbefa00ff, 0x01000000, + 0xbefe007c, 0xbefc006e, + 0xc0601bfc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0601c3c, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0601c7c, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0601cbc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0601cfc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbef2007e, 0xbef30075, + 0xbefe007c, 0xbefc006e, + 0xc0601d3c, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xb8f5f803, 0xbefe007c, + 
0xbefc006e, 0xc0601d7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0601dbc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0601dfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xb8eff801, + 0xbefe007c, 0xbefc006e, + 0xc0601bfc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0601b3c, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0601b7c, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0601cbc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0601cfc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0x867aff7f, 0x04000000, + 0xbef30080, 0x8773737a, + 0xb8ee2a05, 0x806e816e, + 0x8e6e8a6e, 0xb8f51605, + 0x80758175, 0x8e758475, + 0x8e7a8275, 0xbefa00ff, + 0x01000000, 0xbef60178, + 0x80786e78, 0xbefc0080, + 0xbe802b00, 0xbe822b02, + 0xbe842b04, 0xbe862b06, + 0xbe882b08, 0xbe8a2b0a, + 0xbe8c2b0c, 0xbe8e2b0e, + 0xc06a003c, 0x00000000, + 0xc06a013c, 0x00000010, + 0xc06a023c, 0x00000020, + 0xc06a033c, 0x00000030, + 0x8078c078, 0x82798079, + 0x807c907c, 0xbf0a757c, + 0xbf85ffeb, 0xbef80176, + 0xbeee0080, 0xbefe00c1, + 0xbeff00c1, 0xbefa00ff, + 0x01000000, 0xe0724000, + 0x6e1e0000, 0xe0724100, + 0x6e1e0100, 0xe0724200, + 0x6e1e0200, 0xe0724300, + 0x6e1e0300, 0xbefe00c1, + 0xbeff00c1, 0xb8f54306, + 0x8675c175, 0xbf84002c, + 0xbf8a0000, 0x867aff73, + 0x04000000, 0xbf840028, + 0x8e758675, 0x8e758275, + 0xbefa0075, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0x806eff6e, 0x00000080, + 0xbefa00ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 0x00010000, 0xbe80007b, + 0x867bff7b, 0xff7fffff, + 0x877bff7b, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8c007f, 0xe0765000, + 0x6e1e0002, 0x32040702, + 0xd0c9006a, 0x0000eb02, + 0xbf87fff7, 0xbefb0000, + 0xbeee00ff, 0x00000400, + 0xbefe00c1, 0xbeff00c1, + 0xb8f52a05, 0x80758175, + 0x8e758275, 0x8e7a8875, + 0xbefa00ff, 0x01000000, + 0xbefc0084, 0xbf0a757c, + 0xbf840015, 0xbf11017c, + 0x8075ff75, 0x00001000, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x6e1e0000, + 0xe0724100, 0x6e1e0100, + 0xe0724200, 0x6e1e0200, + 0xe0724300, 0x6e1e0300, + 0x807c847c, 0x806eff6e, + 0x00000400, 0xbf0a757c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200d1, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x8676ff7f, + 0x08000000, 0x8f768376, + 0x877b767b, 0x8676ff7f, + 0x70000000, 0x8f768176, + 0x877b767b, 0x8676ff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8f34306, 0x8673c173, + 0xbf840019, 0x8e738673, + 0x8e738273, 0xbefa0073, + 0xb8f22a05, 0x80728172, + 0x8e728a72, 0xb8f61605, + 0x80768176, 0x8e768676, + 0x80727672, 0x8072ff72, + 0x00000080, 0xbefa00ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x721e0000, + 0xe0510100, 0x721e0000, + 0x807cff7c, 0x00000200, + 0x8072ff72, 0x00000200, + 0xbf0a737c, 0xbf85fff6, + 0xbef20080, 0xbefe00c1, + 0xbeff00c1, 0xb8f32a05, + 0x80738173, 0x8e738273, + 0x8e7a8873, 0xbefa00ff, + 0x01000000, 0xbef60072, + 0x8072ff72, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0x8073ff73, 0x00008000, + 0xe0524000, 0x721e0000, + 0xe0524100, 0x721e0100, + 0xe0524200, 0x721e0200, + 0xe0524300, 0x721e0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8072ff72, 0x00000400, + 0xbf0a737c, 0xbf85ffee, + 0xbf9c0000, 0xe0524000, + 0x761e0000, 0xe0524100, + 0x761e0100, 0xe0524200, + 0x761e0200, 
0xe0524300, + 0x761e0300, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0x80f2c072, 0xb8f31605, + 0x80738173, 0x8e738473, + 0x8e7a8273, 0xbefa00ff, + 0x01000000, 0xbefc0073, + 0xc031003c, 0x00000072, + 0x80f2c072, 0xbf8c007f, + 0x80fc907c, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff1, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xc0211cfc, + 0x00000072, 0x80728472, + 0xc0211c3c, 0x00000072, + 0x80728472, 0xc0211c7c, + 0x00000072, 0x80728472, + 0xc0211bbc, 0x00000072, + 0x80728472, 0xc0211bfc, + 0x00000072, 0x80728472, + 0xc0211d3c, 0x00000072, + 0x80728472, 0xc0211d7c, + 0x00000072, 0x80728472, + 0xc0211a3c, 0x00000072, + 0x80728472, 0xc0211a7c, + 0x00000072, 0x80728472, + 0xc0211dfc, 0x00000072, + 0x80728472, 0xc0211b3c, + 0x00000072, 0x80728472, + 0xc0211b7c, 0x00000072, + 0x80728472, 0xbf8c007f, + 0x8671ff71, 0x0000ffff, + 0xbefc0073, 0xbefe006e, + 0xbeff006f, 0xc0211bbc, + 0x00000072, 0x80728472, + 0xc0211bfc, 0x00000072, + 0x80728472, 0xbf8c007f, + 0x867375ff, 0x000003ff, + 0xb9734803, 0x867375ff, + 0xfffff800, 0x8f738b73, + 0xb973a2c3, 0xb977f801, + 0x8673ff71, 0xf0000000, + 0x8f739c73, 0x8e739073, + 0xbef60080, 0x87767376, + 0x8673ff71, 0x08000000, + 0x8f739b73, 0x8e738f73, + 0x87767376, 0x8673ff74, + 0x00800000, 0x8f739773, + 0xb976f807, 0x86fe7e7e, + 0x86ea6a6a, 0xb974f802, + 0xbf8a0000, 0x95807370, + 0xbf810000, 0x00000000, +}; + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index ee3e04e10dae..0fe1161a2182 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -31,16 +31,23 @@ #include <uapi/linux/kfd_ioctl.h> #include <linux/time.h> #include <linux/mm.h> -#include <linux/mman.h> +#include <uapi/asm-generic/mman-common.h> #include <asm/processor.h> + #include "kfd_priv.h" #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" +#include "cik_regs.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); static int kfd_mmap(struct file *, struct vm_area_struct *); +static uint32_t kfd_convert_user_mem_alloction_flags( + struct kfd_dev *dev, + uint32_t userspace_flags); +static bool kfd_is_large_bar(struct kfd_dev *dev); +static int kfd_evict(struct file *filep, struct kfd_process *p, void *data); static const char kfd_dev_name[] = "kfd"; static const struct file_operations kfd_fops = { @@ -117,7 +124,7 @@ static int kfd_open(struct inode *inode, struct file *filep) return -EPERM; } - process = kfd_create_process(current); + process = kfd_create_process(filep); if (IS_ERR(process)) return PTR_ERR(process); @@ -206,6 +213,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->ctx_save_restore_area_address = args->ctx_save_restore_address; q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; + q_properties->ctl_stack_size = args->ctl_stack_size; if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) q_properties->type = KFD_QUEUE_TYPE_COMPUTE; @@ -270,7 +278,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, return -EINVAL; } - mutex_lock(&p->mutex); + down_write(&p->lock); pdd = kfd_bind_process_to_device(dev, p); if (IS_ERR(pdd)) { @@ -282,8 +290,7 @@ static 
int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, p->pasid, dev->id); - err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, - 0, q_properties.type, &queue_id); + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id); if (err != 0) goto err_create_queue; @@ -291,10 +298,10 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, /* Return gpu_id as doorbell offset for mmap usage */ - args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); + args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL | args->gpu_id); args->doorbell_offset <<= PAGE_SHIFT; - mutex_unlock(&p->mutex); + up_write(&p->lock); pr_debug("kfd: queue id %d was created successfully\n", args->queue_id); @@ -311,7 +318,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, err_create_queue: err_bind_process: - mutex_unlock(&p->mutex); + up_write(&p->lock); return err; } @@ -325,11 +332,11 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, args->queue_id, p->pasid); - mutex_lock(&p->mutex); + down_write(&p->lock); retval = pqm_destroy_queue(&p->pqm, args->queue_id); - mutex_unlock(&p->mutex); + up_write(&p->lock); return retval; } @@ -371,11 +378,33 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, pr_debug("kfd: updating queue id %d for PASID %d\n", args->queue_id, p->pasid); - mutex_lock(&p->mutex); + down_write(&p->lock); retval = pqm_update_queue(&p->pqm, args->queue_id, &properties); - mutex_unlock(&p->mutex); + up_write(&p->lock); + + return retval; +} + +static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, + void *data) +{ + int retval; + struct kfd_ioctl_set_cu_mask_args *args = data; + struct queue_properties properties; + uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; + + if (get_user(properties.cu_mask, cu_mask_ptr)) + return -EFAULT; + if (properties.cu_mask == 0) + return 0; + + down_write(&p->lock); + + retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties); + + up_write(&p->lock); return retval; } @@ -403,7 +432,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, if (dev == NULL) return -EINVAL; - mutex_lock(&p->mutex); + down_write(&p->lock); pdd = kfd_bind_process_to_device(dev, p); if (IS_ERR(pdd)) { @@ -427,46 +456,80 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, err = -EINVAL; out: - mutex_unlock(&p->mutex); + up_write(&p->lock); return err; } -static int kfd_ioctl_dbg_register(struct file *filep, - struct kfd_process *p, void *data) +static int kfd_ioctl_set_trap_handler(struct file *filep, + struct kfd_process *p, void *data) { - struct kfd_ioctl_dbg_register_args *args = data; + struct kfd_ioctl_set_trap_handler_args *args = data; struct kfd_dev *dev; - struct kfd_dbgmgr *dbgmgr_ptr; + int err = 0; struct kfd_process_device *pdd; - bool create_ok; - long status = 0; dev = kfd_device_by_id(args->gpu_id); if (dev == NULL) return -EINVAL; - if (dev->device_info->asic_family == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); - return -EINVAL; + down_write(&p->lock); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = -ESRCH; + goto out; + } + if (!dev->cwsr_enabled || !pdd->qpd.cwsr_kaddr) { + pr_err("kfd: CWSR is not enabled, can't set trap handler.\n"); + err = -EINVAL; + goto out; } - mutex_lock(kfd_get_dbgmgr_mutex()); - mutex_lock(&p->mutex); + if (dev->dqm->ops.set_trap_handler(dev->dqm, + &pdd->qpd, + args->tba_addr, + 
args->tma_addr)) + err = -EINVAL; - /* - * make sure that we have pdd, if this the first queue created for - * this process - */ +out: + up_write(&p->lock); + + return err; +} + +static int +kfd_ioctl_dbg_register(struct file *filep, struct kfd_process *p, void *data) +{ + long status = -EFAULT; + struct kfd_ioctl_dbg_register_args *args = data; + struct kfd_dev *dev; + struct kfd_dbgmgr *dbgmgr_ptr; + struct kfd_process_device *pdd; + bool create_ok = false; + + pr_debug("kfd:dbg: %s\n", __func__); + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) { + dev_info(NULL, "Error! kfd: In func %s >> getting device by id failed\n", __func__); + return status; + } + + down_write(&p->lock); + mutex_lock(get_dbgmgr_mutex()); + + /* make sure that we have pdd, if this the first queue created for this process */ pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - mutex_unlock(&p->mutex); - mutex_unlock(kfd_get_dbgmgr_mutex()); + if (IS_ERR(pdd) < 0) { + mutex_unlock(get_dbgmgr_mutex()); + up_write(&p->lock); return PTR_ERR(pdd); } if (dev->dbgmgr == NULL) { /* In case of a legal call, we have no dbgmgr yet */ + create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev); if (create_ok) { status = kfd_dbgmgr_register(dbgmgr_ptr, p); @@ -475,34 +538,32 @@ static int kfd_ioctl_dbg_register(struct file *filep, else dev->dbgmgr = dbgmgr_ptr; } - } else { - pr_debug("debugger already registered\n"); - status = -EINVAL; } - mutex_unlock(&p->mutex); - mutex_unlock(kfd_get_dbgmgr_mutex()); + mutex_unlock(get_dbgmgr_mutex()); + up_write(&p->lock); return status; } -static int kfd_ioctl_dbg_unrgesiter(struct file *filep, - struct kfd_process *p, void *data) +/* + * Unregister dbg IOCTL + */ + +static int +kfd_ioctl_dbg_unrgesiter(struct file *filep, struct kfd_process *p, void *data) { + long status = -EFAULT; struct kfd_ioctl_dbg_unregister_args *args = data; struct kfd_dev *dev; - long status; dev = kfd_device_by_id(args->gpu_id); - if (dev == NULL) - return -EINVAL; - - if (dev->device_info->asic_family == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_unrgesiter not supported on CZ\n"); - return -EINVAL; + if (!dev) { + dev_info(NULL, "Error! kfd: In func %s >> getting device by id failed\n", __func__); + return status; } - mutex_lock(kfd_get_dbgmgr_mutex()); + mutex_lock(get_dbgmgr_mutex()); status = kfd_dbgmgr_unregister(dev->dbgmgr, p); if (status == 0) { @@ -510,7 +571,7 @@ static int kfd_ioctl_dbg_unrgesiter(struct file *filep, dev->dbgmgr = NULL; } - mutex_unlock(kfd_get_dbgmgr_mutex()); + mutex_unlock(get_dbgmgr_mutex()); return status; } @@ -519,125 +580,144 @@ static int kfd_ioctl_dbg_unrgesiter(struct file *filep, * Parse and generate variable size data structure for address watch. * Total size of the buffer and # watch points is limited in order * to prevent kernel abuse. (no bearing to the much smaller HW limitation - * which is enforced by dbgdev module) + * which is enforced by dbgdev module. * please also note that the watch address itself are not "copied from user", * since it be set into the HW in user mode values. 
* */ -static int kfd_ioctl_dbg_address_watch(struct file *filep, - struct kfd_process *p, void *data) + +static int +kfd_ioctl_dbg_address_watch(struct file *filep, + struct kfd_process *p, + void *data) { + long status = -EFAULT; struct kfd_ioctl_dbg_address_watch_args *args = data; struct kfd_dev *dev; struct dbg_address_watch_info aw_info; - unsigned char *args_buff; - long status; - void __user *cmd_from_user; - uint64_t watch_mask_value = 0; + unsigned char *args_buff = NULL; unsigned int args_idx = 0; + uint64_t watch_mask_value = 0; memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info)); - dev = kfd_device_by_id(args->gpu_id); - if (dev == NULL) - return -EINVAL; + do { + dev = kfd_device_by_id(args->gpu_id); + if (!dev) { + dev_info(NULL, + "Error! kfd: In func %s >> get device by id failed\n", + __func__); + break; + } - if (dev->device_info->asic_family == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); - return -EINVAL; - } + if (args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) { + status = -EINVAL; + break; + } - cmd_from_user = (void __user *) args->content_ptr; + if (args->buf_size_in_bytes <= sizeof(*args)) { + status = -EINVAL; + break; + } - /* Validate arguments */ + /* this is the actual buffer to work with */ - if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) || - (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) || - (cmd_from_user == NULL)) - return -EINVAL; + args_buff = kzalloc(args->buf_size_in_bytes - + sizeof(*args), GFP_KERNEL); + if (args_buff == NULL) { + status = -ENOMEM; + break; + } - /* this is the actual buffer to work with */ - args_buff = memdup_user(cmd_from_user, - args->buf_size_in_bytes - sizeof(*args)); - if (IS_ERR(args_buff)) - return PTR_ERR(args_buff); + /* this is the actual buffer to work with */ + args_buff = memdup_user(cmd_from_user, + args->buf_size_in_bytes - sizeof(*args)); + if (IS_ERR(args_buff)) + return PTR_ERR(args_buff); - aw_info.process = p; + aw_info.process = p; - aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx])); - args_idx += sizeof(aw_info.num_watch_points); + aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx])); + args_idx += sizeof(aw_info.num_watch_points); - aw_info.watch_mode = (enum HSA_DBG_WATCH_MODE *) &args_buff[args_idx]; - args_idx += sizeof(enum HSA_DBG_WATCH_MODE) * aw_info.num_watch_points; + aw_info.watch_mode = (HSA_DBG_WATCH_MODE *) &args_buff[args_idx]; + args_idx += sizeof(HSA_DBG_WATCH_MODE) * aw_info.num_watch_points; - /* - * set watch address base pointer to point on the array base - * within args_buff - */ - aw_info.watch_address = (uint64_t *) &args_buff[args_idx]; + /* set watch address base pointer to point on the array base within args_buff */ - /* skip over the addresses buffer */ - args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; + aw_info.watch_address = (uint64_t *) &args_buff[args_idx]; - if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) { - kfree(args_buff); - return -EINVAL; - } + /*skip over the addresses buffer */ + args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; - watch_mask_value = (uint64_t) args_buff[args_idx]; + if (args_idx >= args->buf_size_in_bytes) { + status = -EINVAL; + break; + } - if (watch_mask_value > 0) { - /* - * There is an array of masks. 
- * set watch mask base pointer to point on the array base - * within args_buff - */ - aw_info.watch_mask = (uint64_t *) &args_buff[args_idx]; + watch_mask_value = (uint64_t) args_buff[args_idx]; - /* skip over the masks buffer */ - args_idx += sizeof(aw_info.watch_mask) * - aw_info.num_watch_points; - } else { - /* just the NULL mask, set to NULL and skip over it */ - aw_info.watch_mask = NULL; - args_idx += sizeof(aw_info.watch_mask); - } + if (watch_mask_value > 0) { + /* there is an array of masks */ - if (args_idx >= args->buf_size_in_bytes - sizeof(args)) { - kfree(args_buff); - return -EINVAL; - } + /* set watch mask base pointer to point on the array base within args_buff */ + aw_info.watch_mask = (uint64_t *) &args_buff[args_idx]; - /* Currently HSA Event is not supported for DBG */ - aw_info.watch_event = NULL; + /*skip over the masks buffer */ + args_idx += sizeof(aw_info.watch_mask) * aw_info.num_watch_points; + } - mutex_lock(kfd_get_dbgmgr_mutex()); + else + /* just the NULL mask, set to NULL and skip over it */ + { + aw_info.watch_mask = NULL; + args_idx += sizeof(aw_info.watch_mask); + } + + if (args_idx > args->buf_size_in_bytes) { + status = -EINVAL; + break; + } - status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info); + aw_info.watch_event = NULL; /* Currently HSA Event is not supported for DBG */ + status = 0; - mutex_unlock(kfd_get_dbgmgr_mutex()); + } while (0); + + if (status == 0) { + mutex_lock(get_dbgmgr_mutex()); + + status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info); + + mutex_unlock(get_dbgmgr_mutex()); + + } kfree(args_buff); return status; } -/* Parse and generate fixed size data structure for wave control */ -static int kfd_ioctl_dbg_wave_control(struct file *filep, - struct kfd_process *p, void *data) +/* + * Parse and generate fixed size data structure for wave control. + * Buffer is generated in a "packed" form, for avoiding structure packing/pending dependencies. + */ + +static int +kfd_ioctl_dbg_wave_control(struct file *filep, struct kfd_process *p, void *data) { + long status = -EFAULT; struct kfd_ioctl_dbg_wave_control_args *args = data; struct kfd_dev *dev; struct dbg_wave_control_info wac_info; - unsigned char *args_buff; - uint32_t computed_buff_size; - long status; - void __user *cmd_from_user; + unsigned char *args_buff = NULL; unsigned int args_idx = 0; + uint32_t computed_buff_size; memset((void *) &wac_info, 0, sizeof(struct dbg_wave_control_info)); /* we use compact form, independent of the packing attribute value */ + computed_buff_size = sizeof(*args) + sizeof(wac_info.mode) + sizeof(wac_info.operand) + @@ -645,26 +725,25 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, sizeof(wac_info.dbgWave_msg.MemoryVA) + sizeof(wac_info.trapId); - dev = kfd_device_by_id(args->gpu_id); - if (dev == NULL) - return -EINVAL; - if (dev->device_info->asic_family == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); - return -EINVAL; - } + dev_info(NULL, "kfd: In func %s - start\n", __func__); - /* input size must match the computed "compact" size */ - if (args->buf_size_in_bytes != computed_buff_size) { - pr_debug("size mismatch, computed : actual %u : %u\n", - args->buf_size_in_bytes, computed_buff_size); - return -EINVAL; - } + do { + dev = kfd_device_by_id(args->gpu_id); + if (!dev) { + dev_info(NULL, "Error! 
kfd: In func %s >> getting device by id failed\n", __func__); + break; + } - cmd_from_user = (void __user *) args->content_ptr; + /* input size must match the computed "compact" size */ - if (cmd_from_user == NULL) - return -EINVAL; + if (args->buf_size_in_bytes != computed_buff_size) { + dev_info(NULL, + "Error! kfd: In func %s >> size mismatch, computed : actual %u : %u\n", + __func__, args->buf_size_in_bytes, computed_buff_size); + status = -EINVAL; + break; + } /* copy the entire buffer from user */ @@ -673,34 +752,51 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, if (IS_ERR(args_buff)) return PTR_ERR(args_buff); - /* move ptr to the start of the "pay-load" area */ - wac_info.process = p; + if (copy_from_user(args_buff, + (void __user *) args->content_ptr, + args->buf_size_in_bytes - sizeof(*args))) { + dev_info(NULL, + "Error! kfd: In func %s >> copy_from_user failed\n", + __func__); + break; + } + + /* move ptr to the start of the "pay-load" area */ + - wac_info.operand = *((enum HSA_DBG_WAVEOP *)(&args_buff[args_idx])); - args_idx += sizeof(wac_info.operand); + wac_info.process = p; - wac_info.mode = *((enum HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); - args_idx += sizeof(wac_info.mode); + wac_info.operand = (HSA_DBG_WAVEOP) *((HSA_DBG_WAVEOP *)(&args_buff[args_idx])); + args_idx += sizeof(wac_info.operand); - wac_info.trapId = *((uint32_t *)(&args_buff[args_idx])); - args_idx += sizeof(wac_info.trapId); + wac_info.mode = (HSA_DBG_WAVEMODE) *((HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); + args_idx += sizeof(wac_info.mode); - wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = - *((uint32_t *)(&args_buff[args_idx])); - wac_info.dbgWave_msg.MemoryVA = NULL; + wac_info.trapId = (uint32_t) *((uint32_t *)(&args_buff[args_idx])); + args_idx += sizeof(wac_info.trapId); - mutex_lock(kfd_get_dbgmgr_mutex()); + wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = *((uint32_t *)(&args_buff[args_idx])); + wac_info.dbgWave_msg.MemoryVA = NULL; - pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", - wac_info.process, wac_info.operand, - wac_info.mode, wac_info.trapId, - wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); - status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); + status = 0; + + } while (0); + if (status == 0) { + mutex_lock(get_dbgmgr_mutex()); + + dev_info(NULL, + "kfd: In func %s >> calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", + __func__, wac_info.process, wac_info.operand, wac_info.mode, wac_info.trapId, + wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); - pr_debug("Returned status of dbg manager is %ld\n", status); + status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); - mutex_unlock(kfd_get_dbgmgr_mutex()); + dev_info(NULL, "kfd: In func %s >> returned status of dbg manager is %ld\n", __func__, status); + + mutex_unlock(get_dbgmgr_mutex()); + + } kfree(args_buff); @@ -715,12 +811,13 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, struct timespec64 time; dev = kfd_device_by_id(args->gpu_id); - if (dev == NULL) - return -EINVAL; - - /* Reading GPU clock counter from KGD */ - args->gpu_clock_counter = - dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); + if (dev) + /* Reading GPU clock counter from KGD */ + args->gpu_clock_counter = + dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); + else + /* Node without GPU resource */ + args->gpu_clock_counter = 0; /* No access to rdtsc. 
Using raw monotonic time */ getrawmonotonic64(&time); @@ -747,7 +844,7 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, args->num_of_nodes = 0; - mutex_lock(&p->mutex); + down_write(&p->lock); /*if the process-device list isn't empty*/ if (kfd_has_process_device_data(p)) { @@ -786,52 +883,180 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); } - mutex_unlock(&p->mutex); + up_write(&p->lock); return 0; } -static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, - void *data) +static int kfd_ioctl_get_process_apertures_new(struct file *filp, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_process_apertures_new_args *args = data; + struct kfd_process_device_apertures *pa; + struct kfd_process_device *pdd; + uint32_t nodes = 0; + int ret; + + dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); + + if (args->num_of_nodes == 0) { + /* Return number of nodes, so that user space can alloacate + * sufficient memory */ + down_write(&p->lock); + + if (!kfd_has_process_device_data(p)) { + up_write(&p->lock); + return 0; + } + + /* Run over all pdd of the process */ + pdd = kfd_get_first_process_device_data(p); + do { + args->num_of_nodes++; + } while ((pdd = + kfd_get_next_process_device_data(p, pdd)) != NULL); + + up_write(&p->lock); + return 0; + } + + /* Fill in process-aperture information for all available + * nodes, but not more than args->num_of_nodes as that is + * the amount of memory allocated by user */ + pa = kzalloc((sizeof(struct kfd_process_device_apertures) * + args->num_of_nodes), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + down_write(&p->lock); + + if (!kfd_has_process_device_data(p)) { + up_write(&p->lock); + args->num_of_nodes = 0; + kfree(pa); + return 0; + } + + /* Run over all pdd of the process */ + pdd = kfd_get_first_process_device_data(p); + do { + pa[nodes].gpu_id = pdd->dev->id; + pa[nodes].lds_base = pdd->lds_base; + pa[nodes].lds_limit = pdd->lds_limit; + pa[nodes].gpuvm_base = pdd->gpuvm_base; + pa[nodes].gpuvm_limit = pdd->gpuvm_limit; + pa[nodes].scratch_base = pdd->scratch_base; + pa[nodes].scratch_limit = pdd->scratch_limit; + + dev_dbg(kfd_device, + "gpu id %u\n", pdd->dev->id); + dev_dbg(kfd_device, + "lds_base %llX\n", pdd->lds_base); + dev_dbg(kfd_device, + "lds_limit %llX\n", pdd->lds_limit); + dev_dbg(kfd_device, + "gpuvm_base %llX\n", pdd->gpuvm_base); + dev_dbg(kfd_device, + "gpuvm_limit %llX\n", pdd->gpuvm_limit); + dev_dbg(kfd_device, + "scratch_base %llX\n", pdd->scratch_base); + dev_dbg(kfd_device, + "scratch_limit %llX\n", pdd->scratch_limit); + nodes++; + } while ( + (pdd = kfd_get_next_process_device_data(p, pdd)) != NULL && + (nodes < args->num_of_nodes)); + up_write(&p->lock); + + args->num_of_nodes = nodes; + ret = copy_to_user( + (void __user *)args->kfd_process_device_apertures_ptr, + pa, + (nodes * sizeof(struct kfd_process_device_apertures))); + kfree(pa); + return ret ? 
-EFAULT : 0; +} + +static int +kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_create_event_args *args = data; - int err; + struct kfd_dev *kfd; + struct kfd_process_device *pdd; + int err = -EINVAL; + void *mem, *kern_addr = NULL; - err = kfd_event_create(filp, p, args->event_type, - args->auto_reset != 0, args->node_id, - &args->event_id, &args->event_trigger_data, - &args->event_page_offset, - &args->event_slot_index); + pr_debug("amdkfd: Event page offset 0x%llx\n", args->event_page_offset); + + if (args->event_page_offset) { + kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); + if (!kfd) { + pr_err("amdkfd: can't find kfd device\n"); + return -EFAULT; + } + if (KFD_IS_DGPU(kfd->device_info->asic_family)) { + down_write(&p->lock); + pdd = kfd_bind_process_to_device(kfd, p); + if (IS_ERR(pdd) < 0) { + err = PTR_ERR(pdd); + up_write(&p->lock); + return -EFAULT; + } + mem = kfd_process_device_translate_handle(pdd, + GET_IDR_HANDLE(args->event_page_offset)); + if (!mem) { + pr_err("amdkfd: can't find BO offset is 0x%llx\n", + args->event_page_offset); + up_write(&p->lock); + return -EFAULT; + } + up_write(&p->lock); + + /* Map dGPU gtt BO to kernel */ + kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, + mem, &kern_addr); + } + } + + err = kfd_event_create(filp, p, + args->event_type, + args->auto_reset != 0, + args->node_id, + &args->event_id, + &args->event_trigger_data, + &args->event_page_offset, + &args->event_slot_index, + kern_addr); return err; } -static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, - void *data) +static int +kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_destroy_event_args *args = data; return kfd_event_destroy(p, args->event_id); } -static int kfd_ioctl_set_event(struct file *filp, struct kfd_process *p, - void *data) +static int +kfd_ioctl_set_event(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_set_event_args *args = data; return kfd_set_event(p, args->event_id); } -static int kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p, - void *data) +static int +kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_reset_event_args *args = data; return kfd_reset_event(p, args->event_id); } -static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, - void *data) +static int +kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_wait_events_args *args = data; enum kfd_event_wait_result wait_result; @@ -846,6 +1071,711 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, return err; } +static int kfd_ioctl_alloc_scratch_memory(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_alloc_memory_of_gpu_args *args = + (struct kfd_ioctl_alloc_memory_of_gpu_args *)data; + struct kfd_process_device *pdd; + struct kfd_dev *dev; + long err; + + if (args->size == 0) + return -EINVAL; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) + return -EINVAL; + + down_write(&p->lock); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd) < 0) { + err = PTR_ERR(pdd); + goto bind_process_to_device_fail; + } + + pdd->sh_hidden_private_base_vmid = args->va_addr; + pdd->qpd.sh_hidden_private_base = args->va_addr; + + up_write(&p->lock); + + if (sched_policy == KFD_SCHED_POLICY_NO_HWS && pdd->qpd.vmid != 0) { + err = dev->kfd2kgd->alloc_memory_of_scratch( + dev->kgd, 
args->va_addr, pdd->qpd.vmid); + if (err != 0) + goto alloc_memory_of_scratch_failed; + } + + return 0; + +bind_process_to_device_fail: + up_write(&p->lock); +alloc_memory_of_scratch_failed: + return -EFAULT; +} + +static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_alloc_memory_of_gpu_args *args = data; + struct kfd_process_device *pdd; + void *mem; + struct kfd_dev *dev; + int idr_handle; + long err; + + if (args->size == 0) + return -EINVAL; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) + return -EINVAL; + + down_write(&p->lock); + pdd = kfd_bind_process_to_device(dev, p); + up_write(&p->lock); + if (IS_ERR(pdd) < 0) + return PTR_ERR(pdd); + + err = dev->kfd2kgd->alloc_memory_of_gpu( + dev->kgd, args->va_addr, args->size, + pdd->vm, (struct kgd_mem **) &mem, NULL, NULL, pdd, 0); + + if (err != 0) + return err; + + down_write(&p->lock); + idr_handle = kfd_process_device_create_obj_handle(pdd, mem, + args->va_addr, args->size); + up_write(&p->lock); + if (idr_handle < 0) { + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, + (struct kgd_mem *) mem); + return -EFAULT; + } + + args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); + + return 0; +} + +bool kfd_is_large_bar(struct kfd_dev *dev) +{ + struct kfd_local_mem_info mem_info; + + if (debug_largebar) { + pr_debug("amdkfd: simulate large-bar allocation on non large-bar machine\n"); + return true; + } + + if (!KFD_IS_DGPU(dev->device_info->asic_family)) + return false; + + dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info); + if (mem_info.local_mem_size_private == 0 && + mem_info.local_mem_size_public > 0) + return true; + return false; +} + +static uint32_t kfd_convert_user_mem_alloction_flags( + struct kfd_dev *dev, + uint32_t userspace_flags) +{ + uint32_t kernel_allocation_flags; + + kernel_allocation_flags = 0; + + /* Allocate VRAM bo */ + if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) || + (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE)) { + kernel_allocation_flags = ALLOC_MEM_FLAGS_VRAM; + if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) && + kfd_is_large_bar(dev)) + kernel_allocation_flags |= ALLOC_MEM_FLAGS_PUBLIC; + goto out; + } + /* + * Since currently user space library doesn't uses scratch + * allocation flag I route it to VRAM + */ + if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_SCRATCH) || + (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_APU_SCRATCH)) { + kernel_allocation_flags = ALLOC_MEM_FLAGS_VRAM; + goto out; + } + /* + * The current usage for *_HOST allocation flags are for GTT memory + * Need to verify if we're node zero or we want to allocate bo on + * public domain for P2P buffers. 
+ */ + if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST) { + kernel_allocation_flags = ALLOC_MEM_FLAGS_GTT; + goto out; + } + /* Allocate userptr BO */ + if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { + kernel_allocation_flags = ALLOC_MEM_FLAGS_USERPTR; + goto out; + } + +out: + if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_AQL_QUEUE_MEM) + kernel_allocation_flags |= ALLOC_MEM_FLAGS_AQL_QUEUE_MEM; + /* Current HW doesn't support non paged memory */ + kernel_allocation_flags |= ALLOC_MEM_FLAGS_NONPAGED; + /* + * Set by default execute access as this buffer might be allocated + * for CP's ring buffer + */ + kernel_allocation_flags |= ALLOC_MEM_FLAGS_EXECUTE_ACCESS; + kernel_allocation_flags |= ALLOC_MEM_FLAGS_NO_SUBSTITUTE; + + pr_debug("amdkfd: user allocation flags 0x%x kernel allocation flags: 0x%x\n", + userspace_flags, kernel_allocation_flags); + + return kernel_allocation_flags; +} + +static int kfd_ioctl_alloc_memory_of_gpu_new(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_alloc_memory_of_gpu_new_args *args = data; + struct kfd_process_device *pdd; + void *mem; + struct kfd_dev *dev; + int idr_handle; + long err; + uint64_t offset; + + if (args->size == 0) + return -EINVAL; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) + return -EINVAL; + + down_write(&p->lock); + pdd = kfd_bind_process_to_device(dev, p); + up_write(&p->lock); + if (IS_ERR(pdd) < 0) + return PTR_ERR(pdd); + + offset = args->mmap_offset; + err = dev->kfd2kgd->alloc_memory_of_gpu( + dev->kgd, args->va_addr, args->size, + pdd->vm, (struct kgd_mem **) &mem, &offset, + NULL, pdd, + kfd_convert_user_mem_alloction_flags(dev, args->flags)); + + if (err != 0) + return err; + + down_write(&p->lock); + idr_handle = kfd_process_device_create_obj_handle(pdd, mem, + args->va_addr, args->size); + up_write(&p->lock); + if (idr_handle < 0) { + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, + (struct kgd_mem *) mem); + return -EFAULT; + } + + args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); + if ((args->flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) != 0 && + !kfd_is_large_bar(dev)) { + args->mmap_offset = 0; + } else { + args->mmap_offset = KFD_MMAP_TYPE_MAP_BO; + args->mmap_offset |= KFD_MMAP_GPU_ID(args->gpu_id); + args->mmap_offset <<= PAGE_SHIFT; + args->mmap_offset |= offset; + } + + return 0; +} + +static int kfd_ioctl_free_memory_of_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_free_memory_of_gpu_args *args = data; + struct kfd_process_device *pdd; + struct kfd_bo *buf_obj; + struct kfd_dev *dev; + int ret; + + dev = kfd_device_by_id(GET_GPU_ID(args->handle)); + if (dev == NULL) + return -EINVAL; + + down_write(&p->lock); + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + ret = -EINVAL; + goto err_unlock; + } + + buf_obj = kfd_process_device_find_bo(pdd, + GET_IDR_HANDLE(args->handle)); + if (buf_obj == NULL) { + ret = -EINVAL; + goto err_unlock; + } + run_rdma_free_callback(buf_obj); + + up_write(&p->lock); + + ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, buf_obj->mem); + + /* If freeing the buffer failed, leave the handle in place for + * clean-up during process tear-down. 
*/ + if (ret == 0) { + down_write(&p->lock); + kfd_process_device_remove_obj_handle( + pdd, GET_IDR_HANDLE(args->handle)); + up_write(&p->lock); + } + + return ret; + +err_unlock: + up_write(&p->lock); + return ret; +} + +int kfd_map_memory_to_gpu(struct kfd_dev *dev, void *mem, + struct kfd_process *p, struct kfd_process_device *pdd) +{ + int err; + + BUG_ON(!dev); + BUG_ON(!pdd); + + err = dev->kfd2kgd->map_memory_to_gpu( + dev->kgd, (struct kgd_mem *) mem, pdd->vm); + + if (err != 0) + return err; + + radeon_flush_tlb(dev, p->pasid); + + err = dev->dqm->ops.set_page_directory_base(dev->dqm, &pdd->qpd); + if (err != 0) { + dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, + (struct kgd_mem *) mem, pdd->vm); + return err; + } + + return 0; +} + +static int kfd_ioctl_map_memory_to_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_map_memory_to_gpu_new_args *args = data; + struct kfd_process_device *pdd, *peer_pdd; + void *mem; + struct kfd_dev *dev, *peer; + long err = 0; + int i, num_dev; + uint32_t *devices_arr = NULL; + int bo_size; + + dev = kfd_device_by_id(GET_GPU_ID(args->handle)); + if (dev == NULL) + return -EINVAL; + + if (args->device_ids_array_size > 0 && + (args->device_ids_array_size < sizeof(uint32_t))) { + pr_err("amdkfd: err node IDs array size %u\n", + args->device_ids_array_size); + return -EFAULT; + } + + if (args->device_ids_array_size > 0) { + devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); + if (!devices_arr) + return -ENOMEM; + + err = copy_from_user(devices_arr, + (void __user *)args->device_ids_array, + args->device_ids_array_size); + if (err != 0) { + err = -EFAULT; + goto copy_from_user_failed; + } + } + + down_write(&p->lock); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd) < 0) { + err = PTR_ERR(pdd); + goto bind_process_to_device_failed; + } + + mem = kfd_process_device_translate_handle(pdd, + GET_IDR_HANDLE(args->handle)); + up_write(&p->lock); + + if (mem == NULL) { + err = PTR_ERR(mem); + goto get_mem_obj_from_handle_failed; + } + + if (args->device_ids_array_size > 0) { + num_dev = args->device_ids_array_size / sizeof(uint32_t); + for (i = 0 ; i < num_dev; i++) { + peer = kfd_device_by_id(devices_arr[i]); + if (!peer) { + pr_err("amdkfd: didn't found kfd-dev for 0x%x\n", + devices_arr[i]); + err = -EFAULT; + goto get_mem_obj_from_handle_failed; + } + down_write(&p->lock); + peer_pdd = kfd_bind_process_to_device(peer, p); + up_write(&p->lock); + if (!peer_pdd) { + err = -EFAULT; + goto get_mem_obj_from_handle_failed; + } + err = kfd_map_memory_to_gpu(peer, mem, p, peer_pdd); + if (err != 0) + pr_err("amdkfd: failed to map\n"); + } + } else { + err = kfd_map_memory_to_gpu(dev, mem, p, pdd); + if (err != 0) + pr_err("amdkfd: failed to map\n"); + } + + bo_size = dev->kfd2kgd->return_bo_size(dev->kgd, mem); + down_write(&p->lock); + pdd->mapped_size += bo_size; + up_write(&p->lock); + + if (args->device_ids_array_size > 0 && devices_arr) + kfree(devices_arr); + + return err; + +bind_process_to_device_failed: + up_write(&p->lock); +get_mem_obj_from_handle_failed: +copy_from_user_failed: + kfree(devices_arr); + return err; +} + +static int kfd_ioctl_map_memory_to_gpu_wrapper(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_map_memory_to_gpu_args *args = data; + struct kfd_ioctl_map_memory_to_gpu_new_args new_args; + + new_args.handle = args->handle; + new_args.device_ids_array = NULL; + new_args.device_ids_array_size = 0; + + return kfd_ioctl_map_memory_to_gpu(filep, p, 
&new_args); +} + +static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_unmap_memory_from_gpu_new_args *args = data; + struct kfd_process_device *pdd, *peer_pdd; + void *mem; + struct kfd_dev *dev, *peer; + long err = 0; + uint32_t *devices_arr = NULL, num_dev, i; + int bo_size; + + dev = kfd_device_by_id(GET_GPU_ID(args->handle)); + if (dev == NULL) + return -EINVAL; + + if (args->device_ids_array_size > 0 && + (args->device_ids_array_size < sizeof(uint32_t))) { + pr_err("amdkfd: err node IDs array size %u\n", + args->device_ids_array_size); + return -EFAULT; + } + + if (args->device_ids_array_size > 0) { + devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); + if (!devices_arr) + return -ENOMEM; + + err = copy_from_user(devices_arr, + (void __user *)args->device_ids_array, + args->device_ids_array_size); + if (err != 0) { + err = -EFAULT; + goto copy_from_user_failed; + } + } + + down_write(&p->lock); + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + err = PTR_ERR(pdd); + goto bind_process_to_device_failed; + } + + mem = kfd_process_device_translate_handle(pdd, + GET_IDR_HANDLE(args->handle)); + up_write(&p->lock); + + if (mem == NULL) { + err = PTR_ERR(mem); + goto get_mem_obj_from_handle_failed; + } + + if (args->device_ids_array_size > 0) { + num_dev = args->device_ids_array_size / sizeof(uint32_t); + for (i = 0 ; i < num_dev; i++) { + peer = kfd_device_by_id(devices_arr[i]); + if (!peer) { + err = -EFAULT; + goto get_mem_obj_from_handle_failed; + } + down_write(&p->lock); + peer_pdd = kfd_get_process_device_data(peer, p); + up_write(&p->lock); + if (!peer_pdd) { + err = -EFAULT; + goto get_mem_obj_from_handle_failed; + } + peer->kfd2kgd->unmap_memory_to_gpu(peer->kgd, + mem, peer_pdd->vm); + radeon_flush_tlb(peer, p->pasid); + } + } else { + dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, mem, pdd->vm); + radeon_flush_tlb(dev, p->pasid); + } + + bo_size = dev->kfd2kgd->return_bo_size(dev->kgd, mem); + down_write(&p->lock); + pdd->mapped_size -= bo_size; + up_write(&p->lock); + + return 0; + +bind_process_to_device_failed: + up_write(&p->lock); +get_mem_obj_from_handle_failed: +copy_from_user_failed: + kfree(devices_arr); + return err; +} + +static int kfd_ioctl_unmap_memory_from_gpu_wrapper(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; + struct kfd_ioctl_unmap_memory_from_gpu_new_args new_args; + + new_args.handle = args->handle; + new_args.device_ids_array = NULL; + new_args.device_ids_array_size = 0; + + return kfd_ioctl_unmap_memory_from_gpu(filep, p, &new_args); +} + +static int kfd_ioctl_open_graphic_handle(struct file *filep, + struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_open_graphic_handle_args *args = data; + struct kfd_dev *dev; + struct kfd_process_device *pdd; + void *mem; + int idr_handle; + long err; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) + return -EINVAL; + + if (dev->device_info->asic_family != CHIP_KAVERI) { + pr_debug("kfd_ioctl_open_graphic_handle only supported on KV\n"); + return -EINVAL; + } + + down_write(&p->lock); + pdd = kfd_bind_process_to_device(dev, p); + up_write(&p->lock); + if (IS_ERR(pdd) < 0) + return PTR_ERR(pdd); + + err = dev->kfd2kgd->open_graphic_handle(dev->kgd, + args->va_addr, + (struct kgd_vm *) pdd->vm, + args->graphic_device_fd, + args->graphic_handle, + (struct kgd_mem **) &mem); + + if (err != 
0) + return err; + + down_write(&p->lock); + /*TODO: When open_graphic_handle is implemented, we need to create + * the corresponding interval tree. We need to know the size of + * the buffer through open_graphic_handle(). We use 1 for now.*/ + idr_handle = kfd_process_device_create_obj_handle(pdd, mem, + args->va_addr, 1); + up_write(&p->lock); + if (idr_handle < 0) { + /* FIXME: destroy_process_gpumem doesn't seem to be + * implemented anywhere */ + dev->kfd2kgd->destroy_process_gpumem(dev->kgd, mem); + return -EFAULT; + } + + args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); + + return 0; +} + +static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_set_process_dgpu_aperture_args *args = data; + struct kfd_dev *dev; + struct kfd_process_device *pdd; + long err; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) + return -EINVAL; + + down_write(&p->lock); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd) < 0) { + err = PTR_ERR(pdd); + goto exit; + } + + err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base, + args->dgpu_limit); + +exit: + up_write(&p->lock); + return err; +} + +static int kfd_ioctl_get_dmabuf_info(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_dmabuf_info_args *args = data; + struct kfd_dev *dev = NULL; + struct kgd_dev *dma_buf_kgd; + void *metadata_buffer = NULL; + uint32_t flags; + unsigned i; + int r; + + /* Find a KFD GPU device that supports the get_dmabuf_info query */ + for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) + if (dev && dev->kfd2kgd->get_dmabuf_info) + break; + if (!dev) + return -EINVAL; + + if (args->metadata_ptr) { + metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL); + if (!metadata_buffer) + return -ENOMEM; + } + + /* Get dmabuf info from KGD */ + r = dev->kfd2kgd->get_dmabuf_info(dev->kgd, args->dmabuf_fd, + &dma_buf_kgd, &args->size, + metadata_buffer, args->metadata_size, + &args->metadata_size, &flags); + if (r) + goto exit; + + /* Reverse-lookup gpu_id from kgd pointer */ + dev = kfd_device_by_kgd(dma_buf_kgd); + if (!dev) { + r = -EINVAL; + goto exit; + } + args->gpu_id = kfd_get_gpu_id(dev); + + /* Translate flags */ + if (flags & ALLOC_MEM_FLAGS_VRAM) { + args->flags = KFD_IS_DGPU(dev->device_info->asic_family) ? 
+ KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE : + KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE; + } else + args->flags = KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST; + + /* Copy metadata buffer to user mode */ + if (metadata_buffer) { + r = copy_to_user((void __user *)args->metadata_ptr, + metadata_buffer, args->metadata_size); + if (r != 0) + r = -EFAULT; + } + +exit: + kfree(metadata_buffer); + + return r; +} + +static int kfd_ioctl_import_dmabuf(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_import_dmabuf_args *args = data; + struct kfd_dev *dev; + struct kfd_process_device *pdd; + void *mem; + uint64_t size; + int idr_handle; + int r; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev || !dev->kfd2kgd->import_dmabuf) + return -EINVAL; + + down_write(&p->lock); + pdd = kfd_bind_process_to_device(dev, p); + up_write(&p->lock); + if (IS_ERR(pdd) < 0) + return PTR_ERR(pdd); + + r = dev->kfd2kgd->import_dmabuf(dev->kgd, args->dmabuf_fd, + args->va_addr, pdd->vm, + (struct kgd_mem **)&mem, &size); + if (r) + return r; + + down_write(&p->lock); + idr_handle = kfd_process_device_create_obj_handle(pdd, mem, + args->va_addr, size); + up_write(&p->lock); + if (idr_handle < 0) { + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, + (struct kgd_mem *)mem); + return -EFAULT; + } + + args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); + + return 0; +} #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0, .name = #ioctl} @@ -899,10 +1829,65 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, kfd_ioctl_dbg_wave_control, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, + kfd_ioctl_alloc_memory_of_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU, + kfd_ioctl_free_memory_of_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU, + kfd_ioctl_map_memory_to_gpu_wrapper, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, + kfd_ioctl_unmap_memory_from_gpu_wrapper, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_OPEN_GRAPHIC_HANDLE, + kfd_ioctl_open_graphic_handle, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH, + kfd_ioctl_alloc_scratch_memory, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK, + kfd_ioctl_set_cu_mask, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE, + kfd_ioctl_set_process_dgpu_aperture, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, + kfd_ioctl_set_trap_handler, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU_NEW, + kfd_ioctl_alloc_memory_of_gpu_new, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW, + kfd_ioctl_map_memory_to_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, + kfd_ioctl_unmap_memory_from_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, + kfd_ioctl_get_process_apertures_new, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_EVICT_MEMORY, + kfd_evict, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO, + kfd_ioctl_get_dmabuf_info, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, + kfd_ioctl_import_dmabuf, 0) }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) +static int kfd_evict(struct file *filep, struct kfd_process *p, void *data) +{ + struct kfd_ioctl_eviction_args *args = data; + + return evict_size(p, args->size, args->type); + +} static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct kfd_process *process; @@ -994,20 +1979,37 @@ err_i1: static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) { 
struct kfd_process *process; + struct kfd_dev *kfd; + unsigned long vm_pgoff; + int retval; process = kfd_get_process(current); if (IS_ERR(process)) return PTR_ERR(process); - if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) == - KFD_MMAP_DOORBELL_MASK) { - vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK; + vm_pgoff = vma->vm_pgoff; + vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff); + + switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { + case KFD_MMAP_TYPE_DOORBELL: return kfd_doorbell_mmap(process, vma); - } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) == - KFD_MMAP_EVENTS_MASK) { - vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; + + case KFD_MMAP_TYPE_EVENTS: return kfd_event_mmap(process, vma); + + case KFD_MMAP_TYPE_MAP_BO: + kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); + if (!kfd) + return -EFAULT; + retval = kfd->kfd2kgd->mmap_bo(kfd->kgd, vma); + return retval; + + case KFD_MMAP_TYPE_RESERVED_MEM: + return kfd_reserved_mem_mmap(process, vma); + } return -EFAULT; } + + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c new file mode 100644 index 000000000000..b3d4a506b0e6 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -0,0 +1,1163 @@ +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/mm.h> +#include <linux/amd-iommu.h> +#include <linux/pci.h> +#include "kfd_crat.h" +#include "kfd_priv.h" +#include "kfd_topology.h" + +/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. + * GPU processor ID are expressed with Bit[31]=1. + * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs + * used in the CRAT. */ +static uint32_t gpu_processor_id_low = 0x80001000; + +/* Return the next available gpu_processor_id and increment it for next GPU + * @total_cu_count - Total CUs present in the GPU including ones masked off + */ +static inline unsigned int get_and_inc_gpu_processor_id( + unsigned int total_cu_count) +{ + int current_id = gpu_processor_id_low; + + gpu_processor_id_low += total_cu_count; + return current_id; +} + +/* Static table to describe GPU Cache information */ +struct kfd_gpu_cache_info { + uint32_t cache_size; + uint32_t cache_level; + uint32_t flags; + /* Indicates how many Compute Units share this cache + * Value = 1 indicates the cache is not shared */ + uint32_t num_cu_shared; +}; + +static struct kfd_gpu_cache_info kaveri_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + + }, + { + /* Scalar L1 Instruction Cache (in SQC module) per bank */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + { + /* Scalar L1 Data Cache (in SQC module) per bank */ + .cache_size = 8, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + + /* TODO: Add L2 Cache information */ +}; + + +static struct kfd_gpu_cache_info carrizo_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + }, + { + /* Scalar L1 Instruction Cache (in SQC module) per bank */ + .cache_size = 8, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + 
CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 4, + }, + { + /* Scalar L1 Data Cache (in SQC module) per bank. */ + .cache_size = 4, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 4, + }, + + /* TODO: Add L2 Cache information */ +}; + +/* NOTE: In future if more information is added to struct kfd_gpu_cache_info + * the following ASICs may need a separate table. */ +#define tonga_cache_info carrizo_cache_info +#define fiji_cache_info carrizo_cache_info + +static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + BUG_ON(!dev); + BUG_ON(!cu); + + dev->node_props.cpu_cores_count = cu->num_cpu_cores; + dev->node_props.cpu_core_id_base = cu->processor_id_low; + if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + + pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, + cu->processor_id_low); +} + +static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + BUG_ON(!dev); + BUG_ON(!cu); + + dev->node_props.simd_id_base = cu->processor_id_low; + dev->node_props.simd_count = cu->num_simd_cores; + dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; + dev->node_props.max_waves_per_simd = cu->max_waves_simd; + dev->node_props.wave_front_size = cu->wave_front_size; + dev->node_props.array_count = cu->array_count; + dev->node_props.cu_per_simd_array = cu->num_cu_per_array; + dev->node_props.simd_per_cu = cu->num_simd_per_cu; + dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; + if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) + dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; + pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); +} + +/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, + struct list_head *device_list) +{ + struct kfd_topology_device *dev; + + BUG_ON(!cu); + + pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", + cu->proximity_domain, cu->hsa_capability); + list_for_each_entry(dev, device_list, list) { + if (cu->proximity_domain == dev->proximity_domain) { + if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) + kfd_populated_cu_info_cpu(dev, cu); + + if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) + kfd_populated_cu_info_gpu(dev, cu); + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, + struct list_head *device_list) +{ + struct kfd_mem_properties *props; + struct kfd_topology_device *dev; + + BUG_ON(!mem); + + pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", + mem->proximity_domain); + list_for_each_entry(dev, device_list, list) { + if (mem->proximity_domain == dev->proximity_domain) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + /* + * We're on GPU node + */ + if (dev->node_props.cpu_cores_count == 0) { + /* APU */ + if (mem->visibility_type == 0) + props->heap_type = + HSA_MEM_HEAP_TYPE_FB_PRIVATE; + /* dGPU */ + else + props->heap_type = mem->visibility_type; + } + else + props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; + + if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) + 
props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; + if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) + props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; + + props->size_in_bytes = + ((uint64_t)mem->length_high << 32) + + mem->length_low; + props->width = mem->width; + + dev->node_props.mem_banks_count++; + list_add_tail(&props->list, &dev->mem_props); + + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, + struct list_head *device_list) +{ + struct kfd_cache_properties *props; + struct kfd_topology_device *dev; + uint32_t id; + uint32_t total_num_of_cu; + + BUG_ON(!cache); + + id = cache->processor_id_low; + + list_for_each_entry(dev, device_list, list) { + total_num_of_cu = (dev->node_props.array_count * + dev->node_props.cu_per_simd_array); + + /* Cache infomration in CRAT doesn't have proximity_domain + * information as it is associated with a CPU core or GPU + * Compute Unit. So map the cache using CPU core Id or SIMD + * (GPU) ID. + * TODO: This works because currently we can safely assume that + * Compute Units are parsed before caches are parsed. In future + * remove this dependency + */ + if ((id >= dev->node_props.cpu_core_id_base && + id <= dev->node_props.cpu_core_id_base + + dev->node_props.cpu_cores_count) || + (id >= dev->node_props.simd_id_base && + id < dev->node_props.simd_id_base + + total_num_of_cu)) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + props->processor_id_low = id; + props->cache_level = cache->cache_level; + props->cache_size = cache->cache_size; + props->cacheline_size = cache->cache_line_size; + props->cachelines_per_tag = cache->lines_per_tag; + props->cache_assoc = cache->associativity; + props->cache_latency = cache->cache_latency; + memcpy(props->sibling_map, cache->sibling_map, + sizeof(props->sibling_map)); + + if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) + props->cache_type |= HSA_CACHE_TYPE_DATA; + if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) + props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; + if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) + props->cache_type |= HSA_CACHE_TYPE_CPU; + if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) + props->cache_type |= HSA_CACHE_TYPE_HSACU; + + dev->cache_count++; + dev->node_props.caches_count++; + list_add_tail(&props->list, &dev->cache_props); + + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, + struct list_head *device_list) +{ + struct kfd_iolink_properties *props; + struct kfd_topology_device *dev; + uint32_t i = 0; + uint32_t id_from; + uint32_t id_to; + + BUG_ON(!iolink); + + id_from = iolink->proximity_domain_from; + id_to = iolink->proximity_domain_to; + + pr_debug("Found IO link entry in CRAT table with id_from=%d\n", id_from); + list_for_each_entry(dev, device_list, list) { + if (id_from == dev->proximity_domain) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + props->node_from = id_from; + props->node_to = id_to; + props->ver_maj = iolink->version_major; + props->ver_min = iolink->version_minor; + props->iolink_type = iolink->io_interface_type; + + /* + * weight factor (derived from CDIR), currently always 1 + */ + props->weight = 1; + + props->min_latency = 
iolink->minimum_latency; + props->max_latency = iolink->maximum_latency; + props->min_bandwidth = iolink->minimum_bandwidth_mbs; + props->max_bandwidth = iolink->maximum_bandwidth_mbs; + props->rec_transfer_size = + iolink->recommended_transfer_size; + + dev->io_link_count++; + dev->node_props.io_links_count++; + list_add_tail(&props->list, &dev->io_link_props); + + break; + } + i++; + } + + return 0; +} + +/* kfd_parse_subtype - parse subtypes and attach it to correct topology device + * present in the device_list + * @sub_type_hdr - subtype section of crat_image + * @device_list - list of topology devices present in this crat_image + */ +static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, + struct list_head *device_list) +{ + struct crat_subtype_computeunit *cu; + struct crat_subtype_memory *mem; + struct crat_subtype_cache *cache; + struct crat_subtype_iolink *iolink; + int ret = 0; + + BUG_ON(!sub_type_hdr); + + switch (sub_type_hdr->type) { + case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + ret = kfd_parse_subtype_cu(cu, device_list); + break; + case CRAT_SUBTYPE_MEMORY_AFFINITY: + mem = (struct crat_subtype_memory *)sub_type_hdr; + ret = kfd_parse_subtype_mem(mem, device_list); + break; + case CRAT_SUBTYPE_CACHE_AFFINITY: + cache = (struct crat_subtype_cache *)sub_type_hdr; + ret = kfd_parse_subtype_cache(cache, device_list); + break; + case CRAT_SUBTYPE_TLB_AFFINITY: + /* + * For now, nothing to do here + */ + pr_debug("Found TLB entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: + /* + * For now, nothing to do here + */ + pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_IOLINK_AFFINITY: + iolink = (struct crat_subtype_iolink *)sub_type_hdr; + ret = kfd_parse_subtype_iolink(iolink, device_list); + break; + default: + pr_warn("Unknown subtype (%d) in CRAT\n", + sub_type_hdr->type); + } + + return ret; +} + +/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT + * create a kfd_topology_device and add in to device_list. 
Also parse + * CRAT subtypes and attach it to appropriate kfd_topology_device + * @crat_image - input image containing CRAT + * @device_list - [OUT] list of kfd_topology_device generated after parsing + * crat_image + * @proximity_domain - Proximity domain of the first device in the table + * Return - 0 if successful else -ve value + */ +int kfd_parse_crat_table(void *crat_image, + struct list_head *device_list, + uint32_t proximity_domain) +{ + struct kfd_topology_device *top_dev = NULL; + struct crat_subtype_generic *sub_type_hdr; + uint16_t node_id; + int ret; + struct crat_header *crat_table = (struct crat_header *)crat_image; + uint16_t num_nodes; + uint32_t image_len; + uint32_t last_header_type, last_header_length; + + if (!crat_image) + return -EINVAL; + + if (!list_empty(device_list)) { + pr_warn("Error device list should be empty\n"); + } + + num_nodes = crat_table->num_domains; + image_len = crat_table->length; + + pr_info("Parsing CRAT table with %d nodes\n", num_nodes); + + for (node_id = 0; node_id < num_nodes; node_id++) { + top_dev = kfd_create_topology_device(device_list); + if (!top_dev) + break; + top_dev->proximity_domain = proximity_domain++; + } + + if (!top_dev) + return -ENOMEM; + + memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH); + memcpy(top_dev->oem_table_id, crat_table->oem_table_id, CRAT_OEMTABLEID_LENGTH); + top_dev->oem_revision = crat_table->oem_revision; + + last_header_type = last_header_length = 0; + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < + ((char *)crat_image) + image_len) { + pr_debug("kfd parsing crat sub type header %p enabled: %s type: 0x%x length %d\n", + sub_type_hdr, + (sub_type_hdr->flags & + CRAT_SUBTYPE_FLAGS_ENABLED) + ? "true" : "false", + sub_type_hdr->type, + sub_type_hdr->length); + + if (sub_type_hdr->length == 0) { + pr_err("amdkfd: Parsing wrong CRAT's sub header last header type: %d last header len %d\n", + last_header_type, last_header_type); + pr_err("amdkfd: Current header type %d length %d\n", + sub_type_hdr->type, sub_type_hdr->length); + break; + } + + if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { + ret = kfd_parse_subtype(sub_type_hdr, device_list); + if (ret != 0) + return ret; + } + + last_header_type = sub_type_hdr->type; + last_header_length = sub_type_hdr->length; + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + } + + return 0; +} + +/* Helper function. See kfd_fill_gpu_cache_info for parameter description */ +static int fill_in_pcache(struct crat_subtype_cache *pcache, + struct kfd_gpu_cache_info *pcache_info, + struct kfd_cu_info *cu_info, + int mem_available, + int cu_bitmask, + int cache_type, unsigned int cu_processor_id, + int cu_block) +{ + unsigned int cu_sibling_map_mask; + int first_active_cu; + + /* First check if enough memory is available */ + if (mem_available - sizeof(struct crat_subtype_cache) < 0) + return -ENOMEM; + + cu_sibling_map_mask = cu_bitmask; + cu_sibling_map_mask >>= cu_block; + cu_sibling_map_mask &= + ((1 << pcache_info[cache_type].num_cu_shared) - 1); + first_active_cu = ffs(cu_sibling_map_mask); + + /* CU could be inactive. In case of shared cache find the first active + * CU. and incase of non-shared cache check if the CU is inactive. 
If + * inactive active skip it*/ + if (first_active_cu) { + memset(pcache, 0, sizeof(struct crat_subtype_cache)); + pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY; + pcache->length = sizeof(struct crat_subtype_cache); + pcache->flags = pcache_info[cache_type].flags; + pcache->processor_id_low = cu_processor_id + + (first_active_cu - 1); + pcache->cache_level = pcache_info[cache_type].cache_level; + pcache->cache_size = pcache_info[cache_type].cache_size; + + /* Sibling map is w.r.t processor_id_low, so shift out + * inactive CU */ + cu_sibling_map_mask = + cu_sibling_map_mask >> (first_active_cu - 1); + + pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF); + pcache->sibling_map[1] = + (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); + pcache->sibling_map[2] = + (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); + pcache->sibling_map[3] = + (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); + return 0; + } + return 1; +} + +/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info tables + * @kdev - [IN] GPU device + * @gpu_processor_id - [IN] GPU processor ID to which these caches associate + * @available_size - [IN] Amount of memory available in pcache + * @cu_info - [IN] Compute Unit info obtained from KGD + * @pcache - [OUT] memory into which cache data is to be filled in. + * @size_filled - [OUT] amount of data used up in pcache. + * @num_of_entries - [OUT] number of caches added + */ +static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, + int gpu_processor_id, + int available_size, + struct kfd_cu_info *cu_info, + struct crat_subtype_cache *pcache, + int *size_filled, + int *num_of_entries) +{ + struct kfd_gpu_cache_info *pcache_info; + int num_of_cache_types = 0; + int i, j, k; + int ct = 0; + int mem_available = available_size; + unsigned int cu_processor_id; + int ret; + + switch (kdev->device_info->asic_family) { + case CHIP_KAVERI: + pcache_info = kaveri_cache_info; + num_of_cache_types = ARRAY_SIZE(kaveri_cache_info); + break; + case CHIP_CARRIZO: + pcache_info = carrizo_cache_info; + num_of_cache_types = ARRAY_SIZE(carrizo_cache_info); + break; + case CHIP_TONGA: + pcache_info = tonga_cache_info; + num_of_cache_types = ARRAY_SIZE(tonga_cache_info); + break; + case CHIP_FIJI: + pcache_info = fiji_cache_info; + num_of_cache_types = ARRAY_SIZE(fiji_cache_info); + break; + default: + return -EINVAL; + } + + *size_filled = 0; + *num_of_entries = 0; + + /* For each type of cache listed in the kfd_gpu_cache_info table, + * go through all available Compute Units. 
+ * The [i,j,k] loop walks the shader engines (i), shader arrays (j) and CUs (k).
+ * If kfd_gpu_cache_info.num_cu_shared == 1, every CU is visited; otherwise
+ * only the first CU of each group that shares the cache is considered.
+ * For example, with num_cu_shared = 4 and 8 CUs per shader array, k takes
+ * the values 0 and 4, producing two cache entries per shader array for
+ * that cache type.
+ */
+
+ for (ct = 0; ct < num_of_cache_types; ct++) {
+ cu_processor_id = gpu_processor_id;
+ for (i = 0; i < cu_info->num_shader_engines; i++) {
+ for (j = 0; j < cu_info->num_shader_arrays_per_engine;
+ j++) {
+ for (k = 0; k < cu_info->num_cu_per_sh;
+ k += pcache_info[ct].num_cu_shared) {
+
+ ret = fill_in_pcache(pcache,
+ pcache_info,
+ cu_info,
+ mem_available,
+ cu_info->cu_bitmap[i][j],
+ ct,
+ cu_processor_id,
+ k);
+
+ if (ret < 0)
+ break;
+
+ if (!ret) {
+ pcache++;
+ (*num_of_entries)++;
+ mem_available -=
+ sizeof(*pcache);
+ (*size_filled) +=
+ sizeof(*pcache);
+ }
+
+ /* Move to next CU block */
+ cu_processor_id +=
+ pcache_info[ct].num_cu_shared;
+ }
+ }
+ }
+ }
+
+ pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);
+
+ return 0;
+}
+
+/*
+ * kfd_create_crat_image_acpi - Allocates memory for CRAT image and
+ * copies CRAT from ACPI (if available).
+ *
+ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
+ *
+ * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then
+ * *crat_image will be NULL
+ * @size: [OUT] size of crat_image
+ *
+ * Return 0 if successful else return -ve value
+ */
+int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
+{
+ struct acpi_table_header *crat_table;
+ acpi_status status;
+ void *pcrat_image;
+
+ if (!crat_image)
+ return -EINVAL;
+
+ *crat_image = NULL;
+
+ /*
+ * Fetch the CRAT table from ACPI
+ */
+ status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
+ if (status == AE_NOT_FOUND) {
+ pr_warn("CRAT table not found\n");
+ return -ENODATA;
+ } else if (ACPI_FAILURE(status)) {
+ const char *err = acpi_format_exception(status);
+ pr_err("CRAT table error: %s\n", err);
+ return -EINVAL;
+ }
+
+ pcrat_image = kmalloc(crat_table->length, GFP_KERNEL);
+ if (!pcrat_image) {
+ pr_err("No memory for allocating CRAT image\n");
+ return -ENOMEM;
+ }
+
+ memcpy(pcrat_image, crat_table, crat_table->length);
+
+ *crat_image = pcrat_image;
+ *size = crat_table->length;
+
+ return 0;
+}
+
+/* Memory required to create Virtual CRAT.
+ * Since there is no easy way to predict the amount of memory required, the
+ * following amounts are allocated for CPU and GPU Virtual CRAT. This is
+ * expected to cover all known conditions. But to be safe, an additional check
+ * is put in the code to ensure we don't overwrite.
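+ * As a rough, illustrative estimate (the node count is assumed here, the
+ * structure sizes come from kfd_crat.h): a CPU image for 8 NUMA nodes needs
+ * about sizeof(struct crat_header) +
+ * 8 * (sizeof(struct crat_subtype_computeunit) +
+ * sizeof(struct crat_subtype_memory)),
+ * i.e. well under a kilobyte, so one page for the CPU image and three pages
+ * for the GPU image (which additionally carries cache and IO link subtypes)
+ * leave a comfortable margin.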
+ */ +#define VCRAT_SIZE_FOR_CPU PAGE_SIZE +#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) + +/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node + * + * @numa_node_id: CPU NUMA node id + * @avail_size: Available size in the memory + * @sub_type_hdr: Memory into which compute info will be filled in + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, + int proximity_domain, + struct crat_subtype_computeunit *sub_type_hdr) +{ + const struct cpumask *cpumask; + + *avail_size -= sizeof(struct crat_subtype_computeunit); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + cpumask = cpumask_of_node(numa_node_id); + + /* Fill in CU data */ + sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; + sub_type_hdr->proximity_domain = proximity_domain; + sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); + if (sub_type_hdr->processor_id_low == -1) + return -EINVAL; + + sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); + + return 0; +} + +/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node + * + * @numa_node_id: CPU NUMA node id + * @avail_size: Available size in the memory + * @sub_type_hdr: Memory into which compute info will be filled in + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, + int proximity_domain, + struct crat_subtype_memory *sub_type_hdr) +{ + uint64_t mem_in_bytes = 0; + pg_data_t *pgdat; + int zone_type; + + *avail_size -= sizeof(struct crat_subtype_computeunit); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_memory); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in Memory Subunit data */ + + /* Unlike si_meminfo, si_meminfo_node is not exported. So + * the following lines are duplicated from si_meminfo_node + * function */ + pgdat = NODE_DATA(numa_node_id); + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; + mem_in_bytes <<= PAGE_SHIFT; + + sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); + sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); + sub_type_hdr->proximity_domain = proximity_domain; + + return 0; +} + +/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU + * + * @pcrat_image: Fill in VCRAT for CPU + * @size: [IN] allocated size of crat_image. + * [OUT] actual size of data filled in crat_image + */ +static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) +{ + struct crat_header *crat_table = (struct crat_header *)pcrat_image; + struct acpi_table_header *acpi_table; + acpi_status status; + struct crat_subtype_generic *sub_type_hdr; + int avail_size = *size; + int numa_node_id; + int ret = 0; + + if (pcrat_image == NULL || avail_size < VCRAT_SIZE_FOR_CPU) + return -EINVAL; + + /* Fill in CRAT Header. + * Modify length and total_entries as subunits are added. 
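+ * The resulting image is laid out as one crat_header followed, for each
+ * online NUMA node, by a crat_subtype_computeunit and a crat_subtype_memory,
+ * with length, total_entries and num_domains updated as each subunit is
+ * appended (see the loop below).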
+ */ + avail_size -= sizeof(struct crat_header); + if (avail_size < 0) + return -ENOMEM; + + memset(crat_table, 0, sizeof(struct crat_header)); + memcpy(&crat_table->signature, CRAT_SIGNATURE, sizeof(crat_table->signature)); + crat_table->length = sizeof(struct crat_header); + + status = acpi_get_table("DSDT", 0, &acpi_table); + if (status == AE_NOT_FOUND) + pr_warn("DSDT table not found for OEM information\n"); + else { + crat_table->oem_revision = acpi_table->revision; + memcpy(crat_table->oem_id, acpi_table->oem_id, CRAT_OEMID_LENGTH); + memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, CRAT_OEMTABLEID_LENGTH); + } + crat_table->total_entries = 0; + crat_table->num_domains = 0; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + + for_each_online_node(numa_node_id) { + /* Fill in Subtype: Compute Unit */ + ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, + crat_table->num_domains, + (struct crat_subtype_computeunit *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + /* Fill in Subtype: Memory */ + ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, + crat_table->num_domains, + (struct crat_subtype_memory *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + crat_table->num_domains++; + } + + /* TODO: Add cache Subtype for CPU. + * Currently, CPU cache information is available in function + * detect_cache_attributes(cpu) defined in the file + * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not exported + * and to get the same information the code needs to be duplicated. 
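+ * Until that is done, CPU nodes generated from this virtual CRAT carry no
+ * cache subtypes, so their topology is expected to report zero caches.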
+ */ + + *size = crat_table->length; + pr_info("Virtual CRAT table created for CPU\n"); + + return 0; +} + +static int kfd_fill_gpu_memory_affinity(int *avail_size, + struct kfd_dev *kdev, uint8_t type, uint64_t size, + struct crat_subtype_memory *sub_type_hdr, + uint32_t proximity_domain, + const struct kfd_local_mem_info *local_mem_info) +{ + *avail_size -= sizeof(struct crat_subtype_memory); + if (*avail_size < 0) + return -ENOMEM; + + memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); + sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_memory); + sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; + + sub_type_hdr->proximity_domain = proximity_domain; + + pr_debug("amdkfd: fill gpu memory affinity - type 0x%x size 0x%llx\n", + type, size); + + sub_type_hdr->length_low = lower_32_bits(size); + sub_type_hdr->length_high = upper_32_bits(size); + + sub_type_hdr->width = local_mem_info->vram_width; + sub_type_hdr->visibility_type = type; + + return 0; +} + +/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU + * to its NUMA node + * + * @avail_size: Available size in the memory + * @kdev - [IN] GPU device + * @sub_type_hdr: Memory into which io link info will be filled in + * @proximity_domain - proximity domain of the GPU node + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_gpu_direct_io_link(int *avail_size, + struct kfd_dev *kdev, + struct crat_subtype_iolink *sub_type_hdr, + uint32_t proximity_domain) +{ + int proximity_domain_to; + *avail_size -= sizeof(struct crat_subtype_iolink); + if (*avail_size < 0) + return -ENOMEM; + + memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_iolink); + sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in IOLINK subtype. + * TODO: Fill-in other fields of iolink subtype */ + sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; + sub_type_hdr->proximity_domain_from = proximity_domain; + proximity_domain_to = + kfd_get_proximity_domain(kdev->pdev->bus); + if (proximity_domain_to == -1) + return -EINVAL; + + sub_type_hdr->proximity_domain_to = proximity_domain_to; + return 0; +} + +/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU + * + * @pcrat_image: Fill in VCRAT for GPU + * @size: [IN] allocated size of crat_image. + * [OUT] actual size of data filled in crat_image + */ +static int kfd_create_vcrat_image_gpu(void *pcrat_image, + size_t *size, struct kfd_dev *kdev, + uint32_t proximity_domain) +{ + struct crat_header *crat_table = (struct crat_header *)pcrat_image; + struct crat_subtype_generic *sub_type_hdr; + struct crat_subtype_computeunit *cu; + struct kfd_cu_info cu_info; + struct amd_iommu_device_info iommu_info; + int avail_size = *size; + uint32_t total_num_of_cu; + int num_of_cache_entries = 0; + int cache_mem_filled = 0; + int ret = 0; + const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | + AMD_IOMMU_DEVICE_FLAG_PRI_SUP | + AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + struct kfd_local_mem_info local_mem_info; + + if (pcrat_image == NULL || avail_size < VCRAT_SIZE_FOR_GPU) + return -EINVAL; + + /* Fill the CRAT Header. + * Modify length and total_entries as subunits are added. 
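+ * The finished GPU image consists of one crat_header, a single
+ * crat_subtype_computeunit describing the CUs, one crat_subtype_memory
+ * (public FB on large-BAR systems, otherwise one private heap covering the
+ * whole FB), the per-ASIC cache subtypes and a single PCI Express IO link
+ * from the GPU to its NUMA node.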
+ */ + avail_size -= sizeof(struct crat_header); + if (avail_size < 0) + return -ENOMEM; + + memset(crat_table, 0, sizeof(struct crat_header)); + + memcpy(&crat_table->signature, CRAT_SIGNATURE, sizeof(crat_table->signature)); + crat_table->length = sizeof(struct crat_header); /* Change length as we add more subtypes*/ + crat_table->num_domains = 1; + crat_table->total_entries = 0; + + /* Fill in Subtype: Compute Unit + * First fill in the sub type header and then sub type data + */ + avail_size -= sizeof(struct crat_subtype_computeunit); + if (avail_size < 0) + return -ENOMEM; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); + + sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill CU subtype data */ + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; + cu->proximity_domain = proximity_domain; + + kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); + cu->num_simd_per_cu = cu_info.simd_per_cu; + cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; + cu->max_waves_simd = cu_info.max_waves_per_simd; + + cu->wave_front_size = cu_info.wave_front_size; + cu->array_count = cu_info.num_shader_arrays_per_engine * + cu_info.num_shader_engines; + total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); + cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); + cu->num_cu_per_array = cu_info.num_cu_per_sh; + cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; + cu->num_banks = cu_info.num_shader_engines; + cu->lds_size_in_kb = cu_info.lds_size; + + cu->hsa_capability = 0; + + /* Check if this node supports IOMMU. During parsing this flag will + * translate to HSA_CAP_ATS_PRESENT */ + iommu_info.flags = 0; + if (0 == amd_iommu_device_info(kdev->pdev, &iommu_info)) { + if ((iommu_info.flags & required_iommu_flags) == required_iommu_flags) + cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; + } + + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + /* Fill in Subtype: Memory. Only on systems with large BAR (no + * private FB), report memory as public. On other systems + * report the total FB size (public+private) as a single + * private heap. */ + kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + if (local_mem_info.local_mem_size_private == 0) + ret = kfd_fill_gpu_memory_affinity(&avail_size, + kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, + local_mem_info.local_mem_size_public, + (struct crat_subtype_memory *)sub_type_hdr, + proximity_domain, + &local_mem_info); + else + ret = kfd_fill_gpu_memory_affinity(&avail_size, + kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, + local_mem_info.local_mem_size_public + + local_mem_info.local_mem_size_private, + (struct crat_subtype_memory *)sub_type_hdr, + proximity_domain, + &local_mem_info); + if (ret < 0) + return ret; + + crat_table->length += sizeof(struct crat_subtype_memory); + crat_table->total_entries++; + + /* TODO: Fill in cache information. 
This information is NOT readily + * available in KGD */ + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low, + avail_size, + &cu_info, + (struct crat_subtype_cache *)sub_type_hdr, + &cache_mem_filled, + &num_of_cache_entries); + + if (ret < 0) + return ret; + + crat_table->length += cache_mem_filled; + crat_table->total_entries += num_of_cache_entries; + avail_size -= cache_mem_filled; + + /* Fill in Subtype: IO_LINKS + * Only direct links are added here which is Link from GPU to + * to its NUMA node. Indirect links are added by userspace. + */ + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + cache_mem_filled); + ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev, + (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain); + + if (ret < 0) + return ret; + + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + *size = crat_table->length; + pr_info("Virtual CRAT table created for GPU\n"); + + return ret; +} + +/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and + * creates a Virtual CRAT (VCRAT) image + * + * NOTE: Call kfd_destroy_crat_image to free CRAT image memory + * + * @crat_image: VCRAT image created because ACPI does not have a + * CRAT for this device + * @size: [OUT] size of virtual crat_image + * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device + * COMPUTE_UNIT_GPU - Create VCRAT for GPU + * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU + * -- this option is not currently implemented. The assumption + * is that all AMD APUs will have CRAT + * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU + * + * Return 0 if successful else return -ve value +*/ +int kfd_create_crat_image_virtual(void **crat_image, size_t *size, + int flags, struct kfd_dev *kdev, uint32_t proximity_domain) +{ + void *pcrat_image; + int ret = 0; + + if (!crat_image) + return -EINVAL; + + *crat_image = NULL; + + /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and + * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover + * all the current conditions. A check is put not to overwrite beyond + * allocated size + */ + switch (flags) { + case COMPUTE_UNIT_CPU: + pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL); + if (!pcrat_image) + return -ENOMEM; + *size = VCRAT_SIZE_FOR_CPU; + ret = kfd_create_vcrat_image_cpu(pcrat_image, size); + break; + case COMPUTE_UNIT_GPU: + if (!kdev) + return -EINVAL; + pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); + if (!pcrat_image) + return -ENOMEM; + *size = VCRAT_SIZE_FOR_GPU; + ret = kfd_create_vcrat_image_gpu(pcrat_image, size, + kdev, proximity_domain); + break; + case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) : + /*TODO:*/ + ret = -EINVAL; + pr_err("VCRAT not implemented for APU\n"); + break; + default: + ret = -EINVAL; + } + + if (ret == 0) + *crat_image = pcrat_image; + + return ret; +} + + +/* kfd_destroy_crat_image + * + * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..) 
+ * + */ +void kfd_destroy_crat_image(void *crat_image) +{ + if (crat_image) + kfree(crat_image); + return; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h index a374fa3d3ee6..9af3745646df 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -24,6 +24,7 @@ #define KFD_CRAT_H_INCLUDED #include <linux/types.h> +#include "kfd_priv.h" #pragma pack(1) @@ -44,6 +45,10 @@ #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) +/* Compute Unit flags */ +#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ +#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ + struct crat_header { uint32_t signature; uint32_t length; @@ -105,7 +110,7 @@ struct crat_subtype_computeunit { uint8_t wave_front_size; uint8_t num_banks; uint16_t micro_engine_id; - uint8_t num_arrays; + uint8_t array_count; uint8_t num_cu_per_array; uint8_t num_simd_per_cu; uint8_t max_slots_scatch_cu; @@ -127,13 +132,14 @@ struct crat_subtype_memory { uint8_t length; uint16_t reserved; uint32_t flags; - uint32_t promixity_domain; + uint32_t proximity_domain; uint32_t base_addr_low; uint32_t base_addr_high; uint32_t length_low; uint32_t length_high; uint32_t width; - uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; + uint8_t visibility_type; /* for virtual (dGPU) CRAT */ + uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; }; /* @@ -222,9 +228,12 @@ struct crat_subtype_ccompute { /* * HSA IO Link Affinity structure and definitions */ -#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001 -#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002 -#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc +#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) +#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) +#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) +#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 /* * IO interface types @@ -232,8 +241,16 @@ struct crat_subtype_ccompute { #define CRAT_IOLINK_TYPE_UNDEFINED 0 #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 #define CRAT_IOLINK_TYPE_PCIEXPRESS 2 -#define CRAT_IOLINK_TYPE_OTHER 3 -#define CRAT_IOLINK_TYPE_MAX 255 +#define CRAT_IOLINK_TYPE_AMBA 3 +#define CRAT_IOLINK_TYPE_MIPI 4 +#define CRAT_IOLINK_TYPE_QPI_1_1 5 +#define CRAT_IOLINK_TYPE_RESERVED1 6 +#define CRAT_IOLINK_TYPE_RESERVED2 7 +#define CRAT_IOLINK_TYPE_RAPID_IO 8 +#define CRAT_IOLINK_TYPE_INFINIBAND 9 +#define CRAT_IOLINK_TYPE_RESERVED3 10 +#define CRAT_IOLINK_TYPE_OTHER 11 +#define CRAT_IOLINK_TYPE_MAX 255 #define CRAT_IOLINK_RESERVED_LENGTH 24 @@ -291,4 +308,11 @@ struct cdit_header { #pragma pack() +int kfd_create_crat_image_acpi(void **crat_image, size_t *size); +void kfd_destroy_crat_image(void *crat_image); +int kfd_parse_crat_table(void *crat_image, + struct list_head *device_list, + uint32_t proximity_domain); +int kfd_create_crat_image_virtual(void **crat_image, size_t *size, + int flags, struct kfd_dev *kdev, uint32_t proximity_domain); #endif /* KFD_CRAT_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c index d5e19b5fbbfb..4f2311e703c5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c @@ -42,8 +42,6 @@ static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) { - BUG_ON(!dev || !dev->kfd2kgd); - dev->kfd2kgd->address_watch_disable(dev->kgd); } @@ -51,129 +49,118 @@ static int 
dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, unsigned int pasid, uint64_t vmid0_address, uint32_t *packet_buff, size_t size_in_bytes) { + int status = 0; + unsigned int *ib_packet_buff = NULL; struct pm4__release_mem *rm_packet; struct pm4__indirect_buffer_pasid *ib_packet; + struct kernel_queue *kq = dbgdev->kq; + size_t pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + sizeof(struct pm4__indirect_buffer_pasid); struct kfd_mem_obj *mem_obj; - size_t pq_packets_size_in_bytes; + + uint64_t *rm_state = NULL; + union ULARGE_INTEGER *largep; union ULARGE_INTEGER addr; - struct kernel_queue *kq; - uint64_t *rm_state; - unsigned int *ib_packet_buff; - int status; - - BUG_ON(!dbgdev || !dbgdev->kq || !packet_buff || !size_in_bytes); - - kq = dbgdev->kq; - - pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + - sizeof(struct pm4__indirect_buffer_pasid); - - /* - * We acquire a buffer from DIQ - * The receive packet buff will be sitting on the Indirect Buffer - * and in the PQ we put the IB packet + sync packet(s). - */ - status = kq->ops.acquire_packet_buffer(kq, - pq_packets_size_in_bytes / sizeof(uint32_t), - &ib_packet_buff); - if (status != 0) { - pr_err("amdkfd: acquire_packet_buffer failed\n"); - return status; - } - memset(ib_packet_buff, 0, pq_packets_size_in_bytes); + do { + if ((kq == NULL) || (packet_buff == NULL) || (size_in_bytes == 0)) { + pr_debug("Error! kfd: In func %s >> Illegal packet parameters\n", __func__); + status = -EINVAL; + break; + } + /* todo - enter proper locking to be multithreaded safe */ - ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); + /* We acquire a buffer from DIQ + * The receive packet buff will be sitting on the Indirect Buffer + * and in the PQ we put the IB packet + sync packet(s). + */ + status = kq->ops.acquire_packet_buffer(kq, pq_packets_size_in_bytes / sizeof(uint32_t), &ib_packet_buff); + if (status != 0) { + pr_debug("Error! kfd: In func %s >> acquire_packet_buffer failed\n", __func__); + break; + } - ib_packet->header.count = 3; - ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; - ib_packet->header.type = PM4_TYPE_3; + memset(ib_packet_buff, 0, pq_packets_size_in_bytes); - largep = (union ULARGE_INTEGER *) &vmid0_address; + ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); - ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; - ib_packet->bitfields3.ib_base_hi = largep->u.high_part; + ib_packet->header.count = 3; + ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; + ib_packet->header.type = PM4_TYPE_3; - ib_packet->control = (1 << 23) | (1 << 31) | - ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); + largep = (union ULARGE_INTEGER *) &vmid0_address; - ib_packet->bitfields5.pasid = pasid; + ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; + ib_packet->bitfields3.ib_base_hi = largep->u.high_part; - /* - * for now we use release mem for GPU-CPU synchronization - * Consider WaitRegMem + WriteData as a better alternative - * we get a GART allocations ( gpu/cpu mapping), - * for the sync variable, and wait until: - * (a) Sync with HW - * (b) Sync var is written by CP to mem. 
- */ - rm_packet = (struct pm4__release_mem *) (ib_packet_buff + - (sizeof(struct pm4__indirect_buffer_pasid) / - sizeof(unsigned int))); + ib_packet->control = (1 << 23) | (1 << 31) | + ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); - status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), - &mem_obj); + ib_packet->bitfields5.pasid = pasid; - if (status != 0) { - pr_err("amdkfd: Failed to allocate GART memory\n"); - kq->ops.rollback_packet(kq); - return status; - } + /* + * for now we use release mem for GPU-CPU synchronization + * Consider WaitRegMem + WriteData as a better alternative + * we get a GART allocations ( gpu/cpu mapping), + * for the sync variable, and wait until: + * (a) Sync with HW + * (b) Sync var is written by CP to mem. + */ + rm_packet = (struct pm4__release_mem *) (ib_packet_buff + + (sizeof(struct pm4__indirect_buffer_pasid) / sizeof(unsigned int))); - rm_state = (uint64_t *) mem_obj->cpu_ptr; + status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), + &mem_obj); - *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; + if (status == 0) { - rm_packet->header.opcode = IT_RELEASE_MEM; - rm_packet->header.type = PM4_TYPE_3; - rm_packet->header.count = sizeof(struct pm4__release_mem) / - sizeof(unsigned int) - 2; + rm_state = (uint64_t *) mem_obj->cpu_ptr; - rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; - rm_packet->bitfields2.event_index = - event_index___release_mem__end_of_pipe; + *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; - rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; - rm_packet->bitfields2.atc = 0; - rm_packet->bitfields2.tc_wb_action_ena = 1; + rm_packet->header.opcode = IT_RELEASE_MEM; + rm_packet->header.type = PM4_TYPE_3; + rm_packet->header.count = sizeof(struct pm4__release_mem) / sizeof(unsigned int) - 2; - addr.quad_part = mem_obj->gpu_addr; + rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + rm_packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; + rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; + rm_packet->bitfields2.atc = 0; + rm_packet->bitfields2.tc_wb_action_ena = 1; - rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; - rm_packet->address_hi = addr.u.high_part; + addr.quad_part = mem_obj->gpu_addr; - rm_packet->bitfields3.data_sel = - data_sel___release_mem__send_64_bit_data; + rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; + rm_packet->address_hi = addr.u.high_part; - rm_packet->bitfields3.int_sel = - int_sel___release_mem__send_data_after_write_confirm; + rm_packet->bitfields3.data_sel = data_sel___release_mem__send_64_bit_data; + rm_packet->bitfields3.int_sel = int_sel___release_mem__send_data_after_write_confirm; + rm_packet->bitfields3.dst_sel = dst_sel___release_mem__memory_controller; - rm_packet->bitfields3.dst_sel = - dst_sel___release_mem__memory_controller; + rm_packet->data_lo = QUEUESTATE__ACTIVE; - rm_packet->data_lo = QUEUESTATE__ACTIVE; + kq->ops.submit_packet(kq); - kq->ops.submit_packet(kq); + /* Wait till CP writes sync code: */ - /* Wait till CP writes sync code: */ - status = amdkfd_fence_wait_timeout( - (unsigned int *) rm_state, - QUEUESTATE__ACTIVE, 1500); + status = amdkfd_fence_wait_timeout( + (unsigned int *) rm_state, + QUEUESTATE__ACTIVE, 1500); + + } else { + pr_debug("Error! 
kfd: In func %s >> failed to allocate GART memory\n", __func__); + } + } while (false); - kfd_gtt_sa_free(dbgdev->dev, mem_obj); + if (rm_state != NULL) + kfd_gtt_sa_free(dbgdev->dev, mem_obj); return status; } static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) { - BUG_ON(!dbgdev); - - /* - * no action is needed in this case, - * just make sure diq will not be used - */ + /* no action is needed in this case, just make sure diq will not be used */ dbgdev->kq = NULL; @@ -182,57 +169,68 @@ static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) { + + int status = 0; + struct kernel_queue *kq = NULL; struct queue_properties properties; unsigned int qid; - struct kernel_queue *kq = NULL; - int status; + struct process_queue_manager *pqm = dbgdev->pqm; - BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->dev); + do { - status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, - &properties, 0, KFD_QUEUE_TYPE_DIQ, - &qid); + if (!pqm) { + pr_debug("Error! kfd: In func %s >> No PQM\n", __func__); + status = -EFAULT; + break; + } - if (status) { - pr_err("amdkfd: Failed to create DIQ\n"); - return status; - } + properties.type = KFD_QUEUE_TYPE_DIQ; - pr_debug("DIQ Created with queue id: %d\n", qid); + status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, + &properties, &qid); - kq = pqm_get_kernel_queue(dbgdev->pqm, qid); + if (status != 0) { + pr_debug("Error! kfd: In func %s >> Create Queue failed\n", __func__); + break; + } - if (kq == NULL) { - pr_err("amdkfd: Error getting DIQ\n"); - pqm_destroy_queue(dbgdev->pqm, qid); - return -EFAULT; - } + pr_debug("kfd: DIQ Created with queue id: %d\n", qid); + + kq = pqm_get_kernel_queue(dbgdev->pqm, qid); + + if (kq == NULL) { + pr_debug("Error! kfd: In func %s >> Error getting Kernel Queue\n", __func__); + status = -ENOMEM; + break; + } + + dbgdev->kq = kq; - dbgdev->kq = kq; + } while (false); return status; } static int dbgdev_unregister_nodiq(struct kfd_dbgdev *dbgdev) { - BUG_ON(!dbgdev || !dbgdev->dev); - /* disable watch address */ + dbgdev_address_watch_disable_nodiq(dbgdev->dev); return 0; } static int dbgdev_unregister_diq(struct kfd_dbgdev *dbgdev) { - /* todo - disable address watch */ - int status; - - BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->kq); - - status = pqm_destroy_queue(dbgdev->pqm, - dbgdev->kq->queue->properties.queue_id); - dbgdev->kq = NULL; - + /* todo - if needed, kill wavefronts and disable watch */ + int status = 0; + if ((dbgdev == NULL) || (dbgdev->pqm == NULL) || (dbgdev->kq == NULL)) { + pr_debug("kfd Err:In func %s >> can't destroy diq\n", __func__); + status = -EFAULT; + } else { + pqm_destroy_queue(dbgdev->pqm, + dbgdev->kq->queue->properties.queue_id); + dbgdev->kq = NULL; + } return status; } @@ -241,341 +239,350 @@ static void dbgdev_address_watch_set_registers( union TCP_WATCH_ADDR_H_BITS *addrHi, union TCP_WATCH_ADDR_L_BITS *addrLo, union TCP_WATCH_CNTL_BITS *cntl, - unsigned int index, unsigned int vmid) + unsigned int index, unsigned int vmid, + unsigned int asic_family) { union ULARGE_INTEGER addr; - BUG_ON(!adw_info || !addrHi || !addrLo || !cntl); - addr.quad_part = 0; addrHi->u32All = 0; addrLo->u32All = 0; cntl->u32All = 0; if (adw_info->watch_mask != NULL) - cntl->bitfields.mask = - (uint32_t) (adw_info->watch_mask[index] & - ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); + cntl->bitfields.mask = (uint32_t) (adw_info->watch_mask[index] & ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); else cntl->bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; 
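/*
 * Sketch of the split performed below (the exact widths are set by the
 * ADDRESS_WATCH_REG_* constants, which are not part of this hunk): the
 * 64-bit watch address is divided into a high part, masked with
 * ADDRESS_WATCH_REG_ADDHIGH_MASK into addrHi->bitfields.addr, and a low
 * part, shifted right by ADDRESS_WATCH_REG_ADDLOW_SHIFT into
 * addrLo->bitfields.addr, to match the TCP_WATCH_ADDR_H/L register layout.
 */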
addr.quad_part = (unsigned long long) adw_info->watch_address[index]; - addrHi->bitfields.addr = addr.u.high_part & - ADDRESS_WATCH_REG_ADDHIGH_MASK; + addrHi->bitfields.addr = addr.u.high_part & ADDRESS_WATCH_REG_ADDHIGH_MASK; addrLo->bitfields.addr = (addr.u.low_part >> ADDRESS_WATCH_REG_ADDLOW_SHIFT); cntl->bitfields.mode = adw_info->watch_mode[index]; cntl->bitfields.vmid = (uint32_t) vmid; - /* for now assume it is an ATC address */ - cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; - + /* for APU assume it is an ATC address. */ + if (KFD_IS_DGPU(asic_family) == false) + cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask); - pr_debug("\t\t%20s %08x\n", "set reg add high :", - addrHi->bitfields.addr); - pr_debug("\t\t%20s %08x\n", "set reg add low :", - addrLo->bitfields.addr); + pr_debug("\t\t%20s %08x\n", "set reg add high :", addrHi->bitfields.addr); + pr_debug("\t\t%20s %08x\n", "set reg add low :", addrLo->bitfields.addr); + } static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, struct dbg_address_watch_info *adw_info) { + + int status = 0; + union TCP_WATCH_ADDR_H_BITS addrHi; union TCP_WATCH_ADDR_L_BITS addrLo; union TCP_WATCH_CNTL_BITS cntl; - struct kfd_process_device *pdd; + + unsigned int vmid; unsigned int i; - BUG_ON(!dbgdev || !dbgdev->dev || !adw_info); + struct kfd_process_device *pdd; - /* taking the vmid for that process on the safe way using pdd */ - pdd = kfd_get_process_device_data(dbgdev->dev, - adw_info->process); - if (!pdd) { - pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); - return -EFAULT; - } + do { + /* taking the vmid for that process on the safe way using pdd */ + pdd = kfd_get_process_device_data(dbgdev->dev, + adw_info->process); + if (!pdd) { + pr_debug("Error! 
kfd: In func %s >> no PDD available\n", __func__); + status = -EFAULT; + break; + } - addrHi.u32All = 0; - addrLo.u32All = 0; - cntl.u32All = 0; + addrHi.u32All = 0; + addrLo.u32All = 0; + cntl.u32All = 0; - if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || - (adw_info->num_watch_points == 0)) { - pr_err("amdkfd: num_watch_points is invalid\n"); - return -EINVAL; - } + vmid = pdd->qpd.vmid; - if ((adw_info->watch_mode == NULL) || - (adw_info->watch_address == NULL)) { - pr_err("amdkfd: adw_info fields are not valid\n"); - return -EINVAL; - } + if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) + || (adw_info->num_watch_points == 0)) { + status = -EINVAL; + break; + } - for (i = 0 ; i < adw_info->num_watch_points ; i++) { - dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, - &cntl, i, pdd->qpd.vmid); - - pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); - pr_debug("\t\t%20s %08x\n", "register index :", i); - pr_debug("\t\t%20s %08x\n", "vmid is :", pdd->qpd.vmid); - pr_debug("\t\t%20s %08x\n", "Address Low is :", - addrLo.bitfields.addr); - pr_debug("\t\t%20s %08x\n", "Address high is :", - addrHi.bitfields.addr); - pr_debug("\t\t%20s %08x\n", "Address high is :", - addrHi.bitfields.addr); - pr_debug("\t\t%20s %08x\n", "Control Mask is :", - cntl.bitfields.mask); - pr_debug("\t\t%20s %08x\n", "Control Mode is :", - cntl.bitfields.mode); - pr_debug("\t\t%20s %08x\n", "Control Vmid is :", - cntl.bitfields.vmid); - pr_debug("\t\t%20s %08x\n", "Control atc is :", - cntl.bitfields.atc); - pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); - - pdd->dev->kfd2kgd->address_watch_execute( - dbgdev->dev->kgd, - i, - cntl.u32All, - addrHi.u32All, - addrLo.u32All); - } + if ((adw_info->watch_mode == NULL) || (adw_info->watch_address == NULL)) { + status = -EINVAL; + break; + } - return 0; + for (i = 0; i < adw_info->num_watch_points; i++) { + + dbgdev_address_watch_set_registers( + adw_info, + &addrHi, + &addrLo, + &cntl, + i, + vmid, + dbgdev->dev->device_info->asic_family + ); + + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + pr_debug("\t\t%20s %08x\n", "register index :", i); + pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); + pr_debug("\t\t%20s %08x\n", "Address Low is :", addrLo.bitfields.addr); + pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); + pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); + pr_debug("\t\t%20s %08x\n", "Control Mask is :", cntl.bitfields.mask); + pr_debug("\t\t%20s %08x\n", "Control Mode is :", cntl.bitfields.mode); + pr_debug("\t\t%20s %08x\n", "Control Vmid is :", cntl.bitfields.vmid); + pr_debug("\t\t%20s %08x\n", "Control atc is :", cntl.bitfields.atc); + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + + pdd->dev->kfd2kgd->address_watch_execute( + dbgdev->dev->kgd, + i, + cntl.u32All, + addrHi.u32All, + addrLo.u32All); + } + + } while (false); + + return status; } static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, struct dbg_address_watch_info *adw_info) { - struct pm4__set_config_reg *packets_vec; + + int status = 0; + unsigned int i = 0; union TCP_WATCH_ADDR_H_BITS addrHi; union TCP_WATCH_ADDR_L_BITS addrLo; union TCP_WATCH_CNTL_BITS cntl; - struct kfd_mem_obj *mem_obj; - unsigned int aw_reg_add_dword; - uint32_t *packet_buff_uint; - unsigned int i; - int status; - size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; + /* we do not control the vmid in DIQ mode, just a place holder */ unsigned int vmid = 0; - BUG_ON(!dbgdev 
|| !dbgdev->dev || !adw_info); + struct kfd_mem_obj *mem_obj; + uint32_t *packet_buff_uint = NULL; + + struct pm4__set_config_reg *packets_vec = NULL; + + size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; + + unsigned int aw_reg_add_dword; addrHi.u32All = 0; addrLo.u32All = 0; cntl.u32All = 0; - if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || - (adw_info->num_watch_points == 0)) { - pr_err("amdkfd: num_watch_points is invalid\n"); - return -EINVAL; - } + do { - if ((NULL == adw_info->watch_mode) || - (NULL == adw_info->watch_address)) { - pr_err("amdkfd: adw_info fields are not valid\n"); - return -EINVAL; - } + if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || (adw_info->num_watch_points == 0)) { + status = -EINVAL; + break; + } + + if ((NULL == adw_info->watch_mode) || (NULL == adw_info->watch_address)) { + status = -EINVAL; + break; + } - status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); + status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); - if (status != 0) { - pr_err("amdkfd: Failed to allocate GART memory\n"); - return status; - } + if (status != 0) + break; - packet_buff_uint = mem_obj->cpu_ptr; - - memset(packet_buff_uint, 0, ib_size); - - packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); - - packets_vec[0].header.count = 1; - packets_vec[0].header.opcode = IT_SET_CONFIG_REG; - packets_vec[0].header.type = PM4_TYPE_3; - packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; - packets_vec[0].bitfields2.insert_vmid = 1; - packets_vec[1].ordinal1 = packets_vec[0].ordinal1; - packets_vec[1].bitfields2.insert_vmid = 0; - packets_vec[2].ordinal1 = packets_vec[0].ordinal1; - packets_vec[2].bitfields2.insert_vmid = 0; - packets_vec[3].ordinal1 = packets_vec[0].ordinal1; - packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; - packets_vec[3].bitfields2.insert_vmid = 1; - - for (i = 0; i < adw_info->num_watch_points; i++) { - dbgdev_address_watch_set_registers(adw_info, - &addrHi, - &addrLo, - &cntl, - i, - vmid); - - pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); - pr_debug("\t\t%20s %08x\n", "register index :", i); - pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); - pr_debug("\t\t%20s %p\n", "Add ptr is :", - adw_info->watch_address); - pr_debug("\t\t%20s %08llx\n", "Add is :", - adw_info->watch_address[i]); - pr_debug("\t\t%20s %08x\n", "Address Low is :", - addrLo.bitfields.addr); - pr_debug("\t\t%20s %08x\n", "Address high is :", - addrHi.bitfields.addr); - pr_debug("\t\t%20s %08x\n", "Control Mask is :", - cntl.bitfields.mask); - pr_debug("\t\t%20s %08x\n", "Control Mode is :", - cntl.bitfields.mode); - pr_debug("\t\t%20s %08x\n", "Control Vmid is :", - cntl.bitfields.vmid); - pr_debug("\t\t%20s %08x\n", "Control atc is :", - cntl.bitfields.atc); - pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); - - aw_reg_add_dword = - dbgdev->dev->kfd2kgd->address_watch_get_offset( - dbgdev->dev->kgd, - i, - ADDRESS_WATCH_REG_CNTL); + packet_buff_uint = mem_obj->cpu_ptr; + + memset(packet_buff_uint, 0, ib_size); - aw_reg_add_dword /= sizeof(uint32_t); + packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); - packets_vec[0].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; + packets_vec[0].header.count = 1; + packets_vec[0].header.opcode = IT_SET_CONFIG_REG; + packets_vec[0].header.type = PM4_TYPE_3; + packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; + packets_vec[0].bitfields2.insert_vmid = 1; + packets_vec[1].ordinal1 = 
packets_vec[0].ordinal1; + packets_vec[1].bitfields2.insert_vmid = 0; + packets_vec[2].ordinal1 = packets_vec[0].ordinal1; + packets_vec[2].bitfields2.insert_vmid = 0; + packets_vec[3].ordinal1 = packets_vec[0].ordinal1; + packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; + packets_vec[3].bitfields2.insert_vmid = 1; - packets_vec[0].reg_data[0] = cntl.u32All; + for (i = 0; i < adw_info->num_watch_points; i++) { - aw_reg_add_dword = - dbgdev->dev->kfd2kgd->address_watch_get_offset( - dbgdev->dev->kgd, + dbgdev_address_watch_set_registers( + adw_info, + &addrHi, + &addrLo, + &cntl, i, - ADDRESS_WATCH_REG_ADDR_HI); + vmid, + dbgdev->dev->device_info->asic_family + ); + + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + pr_debug("\t\t%20s %08x\n", "register index :", i); + pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); + pr_debug("\t\t%20s %p\n", "Add ptr is :", adw_info->watch_address); + pr_debug("\t\t%20s %08llx\n", "Add is :", adw_info->watch_address[i]); + pr_debug("\t\t%20s %08x\n", "Address Low is :", addrLo.bitfields.addr); + pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr); + pr_debug("\t\t%20s %08x\n", "Control Mask is :", cntl.bitfields.mask); + pr_debug("\t\t%20s %08x\n", "Control Mode is :", cntl.bitfields.mode); + pr_debug("\t\t%20s %08x\n", "Control Vmid is :", cntl.bitfields.vmid); + pr_debug("\t\t%20s %08x\n", "Control atc is :", cntl.bitfields.atc); + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + + aw_reg_add_dword = + dbgdev->dev->kfd2kgd + ->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_CNTL); + + packets_vec[0].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; + packets_vec[0].reg_data[0] = cntl.u32All; - aw_reg_add_dword /= sizeof(uint32_t); + aw_reg_add_dword = + dbgdev->dev->kfd2kgd + ->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_ADDR_HI); - packets_vec[1].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; - packets_vec[1].reg_data[0] = addrHi.u32All; - aw_reg_add_dword = - dbgdev->dev->kfd2kgd->address_watch_get_offset( - dbgdev->dev->kgd, - i, - ADDRESS_WATCH_REG_ADDR_LO); + packets_vec[1].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; + packets_vec[1].reg_data[0] = addrHi.u32All; - aw_reg_add_dword /= sizeof(uint32_t); + aw_reg_add_dword = + dbgdev->dev->kfd2kgd + ->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_ADDR_LO); - packets_vec[2].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; - packets_vec[2].reg_data[0] = addrLo.u32All; - /* enable watch flag if address is not zero*/ - if (adw_info->watch_address[i] > 0) - cntl.bitfields.valid = 1; - else - cntl.bitfields.valid = 0; + packets_vec[2].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; + packets_vec[2].reg_data[0] = addrLo.u32All; - aw_reg_add_dword = - dbgdev->dev->kfd2kgd->address_watch_get_offset( - dbgdev->dev->kgd, - i, - ADDRESS_WATCH_REG_CNTL); + /* enable watch flag if address is not zero*/ + if (adw_info->watch_address[i] > 0) + cntl.bitfields.valid = 1; + else + cntl.bitfields.valid = 0; - aw_reg_add_dword /= sizeof(uint32_t); + aw_reg_add_dword = + dbgdev->dev->kfd2kgd + ->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_CNTL); - packets_vec[3].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; - packets_vec[3].reg_data[0] = cntl.u32All; - status = dbgdev_diq_submit_ib( - dbgdev, - adw_info->process->pasid, - mem_obj->gpu_addr, - packet_buff_uint, - 
ib_size); + packets_vec[3].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE; + packets_vec[3].reg_data[0] = cntl.u32All; + + status = dbgdev_diq_submit_ib( + dbgdev, + adw_info->process->pasid, + mem_obj->gpu_addr, + packet_buff_uint, + ib_size); + + if (status != 0) { + pr_debug("Error! kfd: In func %s >> failed to submit DIQ packet\n", __func__); + break; + } - if (status != 0) { - pr_err("amdkfd: Failed to submit IB to DIQ\n"); - break; } - } - kfd_gtt_sa_free(dbgdev->dev, mem_obj); + } while (false); + if (packet_buff_uint != NULL) + kfd_gtt_sa_free(dbgdev->dev, mem_obj); + return status; + } static int dbgdev_wave_control_set_registers( struct dbg_wave_control_info *wac_info, union SQ_CMD_BITS *in_reg_sq_cmd, - union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) + union GRBM_GFX_INDEX_BITS *in_reg_gfx_index, + unsigned int asic_family) { int status = 0; union SQ_CMD_BITS reg_sq_cmd; union GRBM_GFX_INDEX_BITS reg_gfx_index; - struct HsaDbgWaveMsgAMDGen2 *pMsg; - - BUG_ON(!wac_info || !in_reg_sq_cmd || !in_reg_gfx_index); reg_sq_cmd.u32All = 0; + reg_gfx_index.u32All = 0; - pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2; switch (wac_info->mode) { - /* Send command to single wave */ - case HSA_DBG_WAVEMODE_SINGLE: - /* - * Limit access to the process waves only, - * by setting vmid check - */ + case HSA_DBG_WAVEMODE_SINGLE: /* Send command to single wave */ + /*limit access to the process waves only,by setting vmid check */ reg_sq_cmd.bits.check_vmid = 1; - reg_sq_cmd.bits.simd_id = pMsg->ui32.SIMD; - reg_sq_cmd.bits.wave_id = pMsg->ui32.WaveId; + reg_sq_cmd.bits.simd_id = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.SIMD; + reg_sq_cmd.bits.wave_id = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.WaveId; reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_SINGLE; - reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; - reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; - reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; + reg_gfx_index.bits.sh_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderArray; + reg_gfx_index.bits.se_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderEngine; + reg_gfx_index.bits.instance_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.HSACU; break; - /* Send command to all waves with matching VMID */ - case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: + case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: /* Send command to all waves with matching VMID */ + reg_gfx_index.bits.sh_broadcast_writes = 1; reg_gfx_index.bits.se_broadcast_writes = 1; reg_gfx_index.bits.instance_broadcast_writes = 1; reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; - break; - /* Send command to all CU waves with matching VMID */ - case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: + case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: /* Send command to all CU waves with matching VMID */ reg_sq_cmd.bits.check_vmid = 1; reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; - reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; - reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; - reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; + reg_gfx_index.bits.sh_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderArray; + reg_gfx_index.bits.se_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderEngine; + reg_gfx_index.bits.instance_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.HSACU; break; default: - return -EINVAL; + status = -EINVAL; + break; } switch (wac_info->operand) { case HSA_DBG_WAVEOP_HALT: - 
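
For readers tracing the address-watch path above: each watch point is programmed through SET_CONFIG_REG packets whose reg_offset field is the register's dword offset minus the start of the CONFIG register space (CONFIG_REG_BASE, 0x2000 dwords), and only the closing CNTL write asks the CP to patch in the VMID. A minimal sketch of that offset math, using a simplified stand-in for the driver's pm4__set_config_reg layout:

#include <stdint.h>

#define CONFIG_REG_BASE	0x2000	/* start of the CONFIG register space, in dwords */

/* Hypothetical, trimmed-down packet: only the fields exercised above. */
struct cfg_reg_packet {
	uint32_t reg_offset;	/* dword offset relative to CONFIG_REG_BASE */
	uint32_t insert_vmid;	/* 1 = CP patches the VMID into the written value */
	uint32_t reg_data;	/* register value to write */
};

/* Fill one packet for a watch-point register given its absolute dword offset,
 * e.g. the value returned by the address_watch_get_offset() hook. */
static void fill_cfg_reg_packet(struct cfg_reg_packet *pkt,
				uint32_t reg_dword_offset,
				uint32_t value, int patch_vmid)
{
	pkt->reg_offset  = reg_dword_offset - CONFIG_REG_BASE;
	pkt->insert_vmid = patch_vmid ? 1 : 0;
	pkt->reg_data    = value;
}

In the hunk above the ADDR_HI/ADDR_LO writes correspond to patch_vmid = 0, while the final CNTL write uses patch_vmid = 1 together with vmid_shift = ADDRESS_WATCH_CNTL_OFFSET.
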
reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; + if (asic_family == CHIP_KAVERI) { + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; + pr_debug("kfd:dbgdev: halting KV\n"); + } else { + reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; + reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_HALT; + pr_debug("kfd:dbgdev: halting CZ\n"); + } break; case HSA_DBG_WAVEOP_RESUME: - reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; + if (asic_family == CHIP_KAVERI) { + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; + pr_debug("kfd:dbgdev: resuming KV\n"); + } else { + reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; + reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_RESUME; + pr_debug("kfd:dbgdev: resuming CZ\n"); + } break; case HSA_DBG_WAVEOP_KILL: @@ -601,128 +608,114 @@ static int dbgdev_wave_control_set_registers( } if (status == 0) { - *in_reg_sq_cmd = reg_sq_cmd; + *in_reg_sq_cmd = reg_sq_cmd; *in_reg_gfx_index = reg_gfx_index; } - return status; + } static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, struct dbg_wave_control_info *wac_info) { - int status; + int status = 0; union SQ_CMD_BITS reg_sq_cmd; union GRBM_GFX_INDEX_BITS reg_gfx_index; struct kfd_mem_obj *mem_obj; - uint32_t *packet_buff_uint; - struct pm4__set_config_reg *packets_vec; + uint32_t *packet_buff_uint = NULL; + struct pm4__set_config_reg *packets_vec = NULL; size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; - BUG_ON(!dbgdev || !wac_info); - reg_sq_cmd.u32All = 0; + do { - status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, - ®_gfx_index); - if (status) { - pr_err("amdkfd: Failed to set wave control registers\n"); - return status; - } - - /* we do not control the VMID in DIQ,so reset it to a known value */ - reg_sq_cmd.bits.vm_id = 0; - - pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); - - pr_debug("\t\t mode is: %u\n", wac_info->mode); - pr_debug("\t\t operand is: %u\n", wac_info->operand); - pr_debug("\t\t trap id is: %u\n", wac_info->trapId); - pr_debug("\t\t msg value is: %u\n", - wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); - pr_debug("\t\t vmid is: N/A\n"); - - pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); - pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); - pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); - pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); - pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); - pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); - pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); - - pr_debug("\t\t ibw is : %u\n", - reg_gfx_index.bitfields.instance_broadcast_writes); - pr_debug("\t\t ii is : %u\n", - reg_gfx_index.bitfields.instance_index); - pr_debug("\t\t sebw is : %u\n", - reg_gfx_index.bitfields.se_broadcast_writes); - pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); - pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); - pr_debug("\t\t sbw is : %u\n", - reg_gfx_index.bitfields.sh_broadcast_writes); - - pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); - - status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); - - if (status != 0) { - pr_err("amdkfd: Failed to allocate GART memory\n"); - return status; - } - - packet_buff_uint = mem_obj->cpu_ptr; + status = dbgdev_wave_control_set_registers(wac_info, + ®_sq_cmd, + ®_gfx_index, + dbgdev->dev->device_info->asic_family); - memset(packet_buff_uint, 0, ib_size); + /* we do not control the VMID in DIQ,so reset it to a known value 
*/ + reg_sq_cmd.bits.vm_id = 0; + if (status != 0) + break; + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + pr_debug("\t\t mode is: %u\n", wac_info->mode); + pr_debug("\t\t operand is: %u\n", wac_info->operand); + pr_debug("\t\t trap id is: %u\n", wac_info->trapId); + pr_debug("\t\t msg value is: %u\n", wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); + pr_debug("\t\t vmid is: N/A\n"); + + pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); + pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); + pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); + pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); + pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); + pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); + pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); + + pr_debug("\t\t ibw is : %u\n", reg_gfx_index.bitfields.instance_broadcast_writes); + pr_debug("\t\t ii is : %u\n", reg_gfx_index.bitfields.instance_index); + pr_debug("\t\t sebw is : %u\n", reg_gfx_index.bitfields.se_broadcast_writes); + pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); + pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); + pr_debug("\t\t sbw is : %u\n", reg_gfx_index.bitfields.sh_broadcast_writes); + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); + + if (status != 0) + break; - packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; - packets_vec[0].header.count = 1; - packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; - packets_vec[0].header.type = PM4_TYPE_3; - packets_vec[0].bitfields2.reg_offset = - GRBM_GFX_INDEX / (sizeof(uint32_t)) - - USERCONFIG_REG_BASE; + packet_buff_uint = mem_obj->cpu_ptr; - packets_vec[0].bitfields2.insert_vmid = 0; - packets_vec[0].reg_data[0] = reg_gfx_index.u32All; + memset(packet_buff_uint, 0, ib_size); - packets_vec[1].header.count = 1; - packets_vec[1].header.opcode = IT_SET_CONFIG_REG; - packets_vec[1].header.type = PM4_TYPE_3; - packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - - AMD_CONFIG_REG_BASE; + packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; + packets_vec[0].header.count = 1; + packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; + packets_vec[0].header.type = PM4_TYPE_3; + packets_vec[0].bitfields2.reg_offset = GRBM_GFX_INDEX / (sizeof(uint32_t)) - USERCONFIG_REG_BASE; + packets_vec[0].bitfields2.insert_vmid = 0; + packets_vec[0].reg_data[0] = reg_gfx_index.u32All; - packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; - packets_vec[1].bitfields2.insert_vmid = 1; - packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; + packets_vec[1].header.count = 1; + packets_vec[1].header.opcode = IT_SET_CONFIG_REG; + packets_vec[1].header.type = PM4_TYPE_3; + packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - CONFIG_REG_BASE; + packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; + packets_vec[1].bitfields2.insert_vmid = 1; + packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; - /* Restore the GRBM_GFX_INDEX register */ + /* Restore the GRBM_GFX_INDEX register */ - reg_gfx_index.u32All = 0; - reg_gfx_index.bits.sh_broadcast_writes = 1; - reg_gfx_index.bits.instance_broadcast_writes = 1; - reg_gfx_index.bits.se_broadcast_writes = 1; + reg_gfx_index.u32All = 0; + reg_gfx_index.bits.sh_broadcast_writes = 1; + reg_gfx_index.bits.instance_broadcast_writes = 1; + 
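
The HALT and RESUME cases above encode the same request differently per ASIC: Kaveri keeps the legacy command values in the cmd field, while Carrizo issues a single SETHALT command and carries halt-versus-resume in the data field of the new bits_sethalt view. A compressed sketch of that split, using a trimmed version of the SQ_CMD_BITS union (only the fields touched here, command and data values as defined in kfd_dbgdev.h):

#include <stdbool.h>
#include <stdint.h>

union sq_cmd_sketch {			/* trimmed view of union SQ_CMD_BITS */
	struct {
		uint32_t cmd:3;		/* legacy SQ_IND_CMD_CMD_* value */
		uint32_t rsvd:29;
	} bits;
	struct {
		uint32_t cmd:3;		/* SQ_IND_CMD_NEW_SETHALT */
		uint32_t rsvd:5;
		uint32_t data:3;	/* SQ_IND_CMD_DATA_HALT / _RESUME */
		uint32_t rsvd2:21;
	} bits_sethalt;
	uint32_t u32All;
};

static void encode_halt_or_resume(union sq_cmd_sketch *sq, bool is_kaveri,
				  bool halt)
{
	if (is_kaveri) {
		/* KV: the command value itself says halt (1) or resume (2) */
		sq->bits.cmd = halt ? 1 : 2;
	} else {
		/* CZ: one SETHALT command (1), direction goes in data */
		sq->bits_sethalt.cmd  = 1;
		sq->bits_sethalt.data = halt ? 1 : 0;
	}
}
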
reg_gfx_index.bits.se_broadcast_writes = 1; - packets_vec[2].ordinal1 = packets_vec[0].ordinal1; - packets_vec[2].bitfields2.reg_offset = - GRBM_GFX_INDEX / (sizeof(uint32_t)) - - USERCONFIG_REG_BASE; + packets_vec[2].ordinal1 = packets_vec[0].ordinal1; + packets_vec[2].bitfields2.reg_offset = GRBM_GFX_INDEX / (sizeof(uint32_t)) - USERCONFIG_REG_BASE; + packets_vec[2].bitfields2.insert_vmid = 0; + packets_vec[2].reg_data[0] = reg_gfx_index.u32All; - packets_vec[2].bitfields2.insert_vmid = 0; - packets_vec[2].reg_data[0] = reg_gfx_index.u32All; + status = dbgdev_diq_submit_ib( + dbgdev, + wac_info->process->pasid, + mem_obj->gpu_addr, + packet_buff_uint, + ib_size); - status = dbgdev_diq_submit_ib( - dbgdev, - wac_info->process->pasid, - mem_obj->gpu_addr, - packet_buff_uint, - ib_size); + if (status != 0) + pr_debug("%s\n", " Critical Error ! Submit diq packet failed "); - if (status != 0) - pr_err("amdkfd: Failed to submit IB to DIQ\n"); + } while (false); - kfd_gtt_sa_free(dbgdev->dev, mem_obj); + if (packet_buff_uint != NULL) + kfd_gtt_sa_free(dbgdev->dev, mem_obj); return status; } @@ -730,66 +723,69 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev, struct dbg_wave_control_info *wac_info) { - int status; + int status = 0; + unsigned int vmid = 0xffff; union SQ_CMD_BITS reg_sq_cmd; union GRBM_GFX_INDEX_BITS reg_gfx_index; - struct kfd_process_device *pdd; - BUG_ON(!dbgdev || !dbgdev->dev || !wac_info); + struct kfd_process_device *pdd = NULL; reg_sq_cmd.u32All = 0; + status = 0; /* taking the VMID for that process on the safe way using PDD */ pdd = kfd_get_process_device_data(dbgdev->dev, wac_info->process); - if (!pdd) { - pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); - return -EFAULT; - } - status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, - ®_gfx_index); - if (status) { - pr_err("amdkfd: Failed to set wave control registers\n"); - return status; + if (pdd) { + status = dbgdev_wave_control_set_registers(wac_info, + ®_sq_cmd, + ®_gfx_index, + dbgdev->dev->device_info->asic_family); + if (status == 0) { + + /* for non DIQ we need to patch the VMID: */ + + vmid = pdd->qpd.vmid; + reg_sq_cmd.bits.vm_id = vmid; + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + pr_debug("\t\t mode is: %u\n", wac_info->mode); + pr_debug("\t\t operand is: %u\n", wac_info->operand); + pr_debug("\t\t trap id is: %u\n", wac_info->trapId); + pr_debug("\t\t msg value is: %u\n", wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); + pr_debug("\t\t vmid is: %u\n", vmid); + + pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); + pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); + pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); + pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); + pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); + pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); + pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); + + pr_debug("\t\t ibw is : %u\n", reg_gfx_index.bitfields.instance_broadcast_writes); + pr_debug("\t\t ii is : %u\n", reg_gfx_index.bitfields.instance_index); + pr_debug("\t\t sebw is : %u\n", reg_gfx_index.bitfields.se_broadcast_writes); + pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); + pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); + pr_debug("\t\t sbw is : %u\n", reg_gfx_index.bitfields.sh_broadcast_writes); 
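
Note how the DIQ wave-control path above brackets the SQ_CMD write: the first GRBM_GFX_INDEX packet narrows (or broadcasts) the target shader engine, shader array and CU, the SQ_CMD packet gets the queue's VMID stamped in via insert_vmid, and a final GRBM_GFX_INDEX packet restores full broadcast so later register writes are not silently filtered. A small sketch of that restore value, with a reduced GRBM_GFX_INDEX layout (field positions illustrative, names as in the driver's union):

#include <stdint.h>

union gfx_index_sketch {	/* reduced illustration of GRBM_GFX_INDEX_BITS */
	struct {
		uint32_t instance_index:8;
		uint32_t sh_index:8;
		uint32_t se_index:8;
		uint32_t rsvd:5;
		uint32_t sh_broadcast_writes:1;
		uint32_t instance_broadcast_writes:1;
		uint32_t se_broadcast_writes:1;
	} bits;
	uint32_t u32All;
};

/* Value written by the third packet: broadcast to every SE/SH/CU again. */
static uint32_t gfx_index_restore_broadcast(void)
{
	union gfx_index_sketch idx = { .u32All = 0 };

	idx.bits.sh_broadcast_writes       = 1;
	idx.bits.instance_broadcast_writes = 1;
	idx.bits.se_broadcast_writes       = 1;
	return idx.u32All;
}
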
+ + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + dbgdev->dev->kfd2kgd + ->wave_control_execute(dbgdev->dev->kgd, + reg_gfx_index.u32All, + reg_sq_cmd.u32All); + } else { + status = -EINVAL; + } + } else { + status = -EFAULT; } - /* for non DIQ we need to patch the VMID: */ + return status; - reg_sq_cmd.bits.vm_id = pdd->qpd.vmid; - - pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); - - pr_debug("\t\t mode is: %u\n", wac_info->mode); - pr_debug("\t\t operand is: %u\n", wac_info->operand); - pr_debug("\t\t trap id is: %u\n", wac_info->trapId); - pr_debug("\t\t msg value is: %u\n", - wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); - pr_debug("\t\t vmid is: %u\n", pdd->qpd.vmid); - - pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); - pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); - pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); - pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); - pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); - pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); - pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); - - pr_debug("\t\t ibw is : %u\n", - reg_gfx_index.bitfields.instance_broadcast_writes); - pr_debug("\t\t ii is : %u\n", - reg_gfx_index.bitfields.instance_index); - pr_debug("\t\t sebw is : %u\n", - reg_gfx_index.bitfields.se_broadcast_writes); - pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); - pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); - pr_debug("\t\t sbw is : %u\n", - reg_gfx_index.bitfields.sh_broadcast_writes); - - pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); - - return dbgdev->dev->kfd2kgd->wave_control_execute(dbgdev->dev->kgd, - reg_gfx_index.u32All, - reg_sq_cmd.u32All); } int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) @@ -800,13 +796,8 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) union GRBM_GFX_INDEX_BITS reg_gfx_index; struct kfd_process_device *pdd; struct dbg_wave_control_info wac_info; - int temp; - int first_vmid_to_scan = 8; - int last_vmid_to_scan = 15; - - first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1; - temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan; - last_vmid_to_scan = first_vmid_to_scan + ffz(temp); + int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; + int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; reg_sq_cmd.u32All = 0; status = 0; @@ -823,7 +814,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid (dev->kgd, vmid)) { - if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid + if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid (dev->kgd, vmid) == p->pasid) { pr_debug("Killing wave fronts of vmid %d and pasid %d\n", vmid, p->pasid); @@ -833,7 +824,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) } if (vmid > last_vmid_to_scan) { - pr_err("amdkfd: didn't found vmid for pasid (%d)\n", p->pasid); + pr_err("amdkfd: didn't find vmid for pasid (%d)\n", p->pasid); return -EFAULT; } @@ -843,7 +834,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) return -EFAULT; status = dbgdev_wave_control_set_registers(&wac_info, ®_sq_cmd, - ®_gfx_index); + ®_gfx_index, dev->device_info->asic_family); if (status != 0) 
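
dbgdev_wave_reset_wavefronts() above now takes its scan range straight from dev->vm_info; later in this patch kgd2kfd_device_init() derives first_vmid_kfd/last_vmid_kfd once from the compute VMID bitmap with ffs() and fls(). A sketch of that derivation plus the PASID scan, with the ATC mapping queries passed in as callbacks standing in for the kfd2kgd hooks:

#include <stdbool.h>

/* Derive the KFD VMID range from a contiguous compute VMID bitmap,
 * e.g. 0x0000ff00 -> first 8, last 15. Assumes a non-empty bitmap.
 * (The kernel code uses ffs(bitmap) - 1 and fls(bitmap) - 1.) */
static void vmid_range_from_bitmap(unsigned int bitmap,
				   unsigned int *first, unsigned int *last)
{
	*first = (unsigned int)__builtin_ffs((int)bitmap) - 1;
	*last  = 31u - (unsigned int)__builtin_clz(bitmap);
}

/* Find the VMID whose ATC mapping currently holds @pasid, or -1 if none.
 * The callbacks stand in for get_atc_vmid_pasid_mapping_valid()/_pasid(). */
static int vmid_for_pasid(unsigned int first, unsigned int last,
			  unsigned int pasid,
			  bool (*mapping_valid)(unsigned int vmid),
			  unsigned int (*mapping_pasid)(unsigned int vmid))
{
	unsigned int vmid;

	for (vmid = first; vmid <= last; vmid++)
		if (mapping_valid(vmid) && mapping_pasid(vmid) == pasid)
			return (int)vmid;

	return -1;
}
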
return -EINVAL; @@ -858,15 +849,12 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) } void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, - enum DBGDEV_TYPE type) + DBGDEV_TYPE type) { - BUG_ON(!pdbgdev || !pdev); - pdbgdev->dev = pdev; pdbgdev->kq = NULL; pdbgdev->type = type; pdbgdev->pqm = NULL; - switch (type) { case DBGDEV_TYPE_NODIQ: pdbgdev->dbgdev_register = dbgdev_register_nodiq; @@ -876,10 +864,12 @@ void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, break; case DBGDEV_TYPE_DIQ: default: + pdbgdev->dbgdev_register = dbgdev_register_diq; pdbgdev->dbgdev_unregister = dbgdev_unregister_diq; pdbgdev->dbgdev_wave_control = dbgdev_wave_control_diq; pdbgdev->dbgdev_address_watch = dbgdev_address_watch_diq; + break; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h index 03424c20920c..82f48ff3bf9a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h @@ -23,6 +23,10 @@ #ifndef KFD_DBGDEV_H_ #define KFD_DBGDEV_H_ +/* + * SQ_IND_CMD_CMD enum + */ + enum { SQ_CMD_VMID_OFFSET = 28, ADDRESS_WATCH_CNTL_OFFSET = 24 @@ -48,9 +52,9 @@ enum { /* CONFIG reg space definition */ enum { - AMD_CONFIG_REG_BASE = 0x2000, /* in dwords */ - AMD_CONFIG_REG_END = 0x2B00, - AMD_CONFIG_REG_SIZE = AMD_CONFIG_REG_END - AMD_CONFIG_REG_BASE + CONFIG_REG_BASE = 0x2000, /* in dwords */ + CONFIG_REG_END = 0x2B00, + CONFIG_REG_SIZE = CONFIG_REG_END - CONFIG_REG_BASE }; /* SH reg space definition */ @@ -60,22 +64,43 @@ enum { SH_REG_SIZE = SH_REG_END - SH_REG_BASE }; +/* SQ_CMD definitions */ + +enum { + SQ_IND_CMD_DATA_RESUME = 0, + SQ_IND_CMD_DATA_HALT = 1 +}; + +enum SQ_IND_CMD_NEW { + SQ_IND_CMD_NEW_NULL = 0x00000000, + SQ_IND_CMD_NEW_SETHALT = 0x00000001, + SQ_IND_CMD_NEW_SAVECTX = 0x00000002, + SQ_IND_CMD_NEW_KILL = 0x00000003, + SQ_IND_CMD_NEW_DEBUG = 0x00000004, + SQ_IND_CMD_NEW_TRAP = 0x00000005, + SQ_IND_CMD_NEW_SET_PRIO = 0x00000006 + +}; + enum SQ_IND_CMD_CMD { SQ_IND_CMD_CMD_NULL = 0x00000000, SQ_IND_CMD_CMD_HALT = 0x00000001, SQ_IND_CMD_CMD_RESUME = 0x00000002, SQ_IND_CMD_CMD_KILL = 0x00000003, SQ_IND_CMD_CMD_DEBUG = 0x00000004, - SQ_IND_CMD_CMD_TRAP = 0x00000005, + SQ_IND_CMD_CMD_TRAP = 0x00000005 }; +/* + * SQ_IND_CMD_MODE enum + */ -enum SQ_IND_CMD_MODE { +typedef enum SQ_IND_CMD_MODE { SQ_IND_CMD_MODE_SINGLE = 0x00000000, SQ_IND_CMD_MODE_BROADCAST = 0x00000001, SQ_IND_CMD_MODE_BROADCAST_QUEUE = 0x00000002, SQ_IND_CMD_MODE_BROADCAST_PIPE = 0x00000003, SQ_IND_CMD_MODE_BROADCAST_ME = 0x00000004, -}; +} SQ_IND_CMD_MODE; union SQ_IND_INDEX_BITS { struct { @@ -106,18 +131,32 @@ union SQ_IND_CMD_BITS { union SQ_CMD_BITS { struct { uint32_t cmd:3; - uint32_t:1; + uint32_t:1; uint32_t mode:3; uint32_t check_vmid:1; uint32_t trap_id:3; - uint32_t:5; + uint32_t:5; uint32_t wave_id:4; uint32_t simd_id:2; - uint32_t:2; + uint32_t:2; uint32_t queue_id:3; - uint32_t:1; + uint32_t:1; uint32_t vm_id:4; } bitfields, bits; + struct { + uint32_t cmd:3; + uint32_t:1; + uint32_t mode:3; + uint32_t check_vmid:1; + uint32_t data:3; + uint32_t:5; + uint32_t wave_id:4; + uint32_t simd_id:2; + uint32_t:2; + uint32_t queue_id:3; + uint32_t:1; + uint32_t vm_id:4; + } bitfields_sethalt, bits_sethalt; uint32_t u32All; signed int i32All; float f32All; @@ -169,7 +208,7 @@ union TCP_WATCH_ADDR_L_BITS { }; enum { - QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */ + QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */ 
QUEUESTATE__ACTIVE_COMPLETION_PENDING, QUEUESTATE__ACTIVE }; @@ -187,7 +226,6 @@ union ULARGE_INTEGER { #define KFD_CIK_VMID_END_OFFSET (KFD_CIK_VMID_START_OFFSET + (8)) -void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, - enum DBGDEV_TYPE type); +void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, DBGDEV_TYPE type); -#endif /* KFD_DBGDEV_H_ */ +#endif /* KFD_DBGDEV_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c index 56d676396342..5d269ea94957 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c @@ -36,42 +36,50 @@ static DEFINE_MUTEX(kfd_dbgmgr_mutex); -struct mutex *kfd_get_dbgmgr_mutex(void) +struct mutex * +get_dbgmgr_mutex(void) { return &kfd_dbgmgr_mutex; } +/*===========================================================================*/ -static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) +static void +kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) { - BUG_ON(!pmgr); - kfree(pmgr->dbgdev); - pmgr->dbgdev = NULL; pmgr->pasid = 0; pmgr->dev = NULL; } -void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) +/*===========================================================================*/ + +void +kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) { if (pmgr != NULL) { kfd_dbgmgr_uninitialize(pmgr); kfree(pmgr); + pmgr = NULL; } } -bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) +/*===========================================================================*/ + +bool +kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) { - enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; + DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; struct kfd_dbgmgr *new_buff; BUG_ON(pdev == NULL); BUG_ON(!pdev->init_complete); new_buff = kfd_alloc_struct(new_buff); - if (!new_buff) { - pr_err("amdkfd: Failed to allocate dbgmgr instance\n"); + if (!new_buff) + { + dev_err(NULL, "Error! kfd: In func %s >> failed to allocate dbgmgr instance\n", __func__); return false; } @@ -79,7 +87,7 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) new_buff->dev = pdev; new_buff->dbgdev = kfd_alloc_struct(new_buff->dbgdev); if (!new_buff->dbgdev) { - pr_err("amdkfd: Failed to allocate dbgdev instance\n"); + dev_err(NULL, "Error! kfd: In func %s >> failed to allocate dbgdev\n", __func__); kfree(new_buff); return false; } @@ -94,75 +102,200 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) return true; } -long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) +/*===========================================================================*/ + +long +kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) { - BUG_ON(!p || !pmgr || !pmgr->dbgdev); + long status = 0; - if (pmgr->pasid != 0) { - pr_debug("H/W debugger is already active using pasid %d\n", - pmgr->pasid); - return -EBUSY; - } + do { + + if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL)) { + dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); + /* Invalid Pointer. */ + status = -EINVAL; + break; + } + if (pmgr->pasid != 0) { + /* HW debugger is already active. 
*/ + status = -EBUSY; + break; + } + + /* remember pasid */ + + pmgr->pasid = p->pasid; + + /* provide the pqm for diq generation */ - /* remember pasid */ - pmgr->pasid = p->pasid; + pmgr->dbgdev->pqm = &p->pqm; - /* provide the pqm for diq generation */ - pmgr->dbgdev->pqm = &p->pqm; + /* activate the actual registering */ + /* todo: you should lock with the process mutex here */ + pmgr->dbgdev->dbgdev_register(pmgr->dbgdev); + /* todo: you should unlock with the process mutex here */ - /* activate the actual registering */ - pmgr->dbgdev->dbgdev_register(pmgr->dbgdev); + } while (false); - return 0; + return status; } -long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) +/* ========================================================================== */ + +long +kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) { - BUG_ON(!p || !pmgr || !pmgr->dbgdev); - /* Is the requests coming from the already registered process? */ - if (pmgr->pasid != p->pasid) { - pr_debug("H/W debugger is not registered by calling pasid %d\n", - p->pasid); - return -EINVAL; - } + long status = 0; - pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev); + do { - pmgr->pasid = 0; + if ((pmgr == NULL) || (pmgr->dev == NULL) + || (pmgr->dbgdev == NULL) || (p == NULL)) { + dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); + /* Invalid Pointer */ + status = -EINVAL; + break; + } + if (pmgr->pasid != p->pasid) { + /* Is the requests coming from the already registered process? */ + status = -EINVAL; + break; + } + + /* todo: you should lock with the process mutex here */ + + pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev); - return 0; + /* todo: you should unlock with the process mutex here */ + + pmgr->pasid = 0; + + } while (false); + + return status; } -long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, - struct dbg_wave_control_info *wac_info) +/* =========================================================================== */ + +long +kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, struct dbg_wave_control_info *wac_info) { - BUG_ON(!pmgr || !pmgr->dbgdev || !wac_info); + long status = 0; - /* Is the requests coming from the already registered process? */ - if (pmgr->pasid != wac_info->process->pasid) { - pr_debug("H/W debugger support was not registered for requester pasid %d\n", - wac_info->process->pasid); - return -EINVAL; - } + dev_info(NULL, "kfd: In func %s\n", __func__); + + do { + + if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL) || (wac_info == NULL) + || (wac_info->process == NULL)) { + /* Invalid Pointer */ + dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); + status = -EINVAL; + break; + } + /* Is the requests coming from the already registered process? 
*/ + if (pmgr->pasid != wac_info->process->pasid) { + /* HW debugger support was not registered for requester process */ + status = -EINVAL; + break; + } + + status = (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info); + + } while (false); + + return status; - return (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info); } -long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, - struct dbg_address_watch_info *adw_info) +/* =========================================================================== */ + +long +kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, struct dbg_address_watch_info *adw_info) { - BUG_ON(!pmgr || !pmgr->dbgdev || !adw_info); + long status = 0; + dev_info(NULL, "kfd: In func %s\n", __func__); - /* Is the requests coming from the already registered process? */ - if (pmgr->pasid != adw_info->process->pasid) { - pr_debug("H/W debugger support was not registered for requester pasid %d\n", - adw_info->process->pasid); - return -EINVAL; - } + do { + + if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL) || (adw_info == NULL) + || (adw_info->process == NULL)) { + /* Invalid Pointer */ + dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); + status = -EINVAL; + break; + } + /* Is the requests coming from the already registered process? */ + if (pmgr->pasid != adw_info->process->pasid) { + /* HW debugger support was not registered for requester process */ + status = -EINVAL; + break; + } + + status = (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev, adw_info); + + } while (false); + + return status; - return (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev, - adw_info); } + +/* =========================================================================== */ +/* + * Handle abnormal process termination + * if we are in the midst of a debug session, we should kill all pending waves + * of the debugged process and unregister the process from the Debugger. + */ +long +kfd_dbgmgr_abnormal_termination(struct kfd_dbgmgr *pmgr, struct kfd_process *process) +{ + long status = 0; + struct dbg_wave_control_info wac_info; + + dev_info(NULL, "kfd: In func %s\n", __func__); + + do { + + if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL)) { + /* Invalid Pointer */ + dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__); + status = -EINVAL; + break; + } + /* first, we kill all the wavefronts of this process */ + + wac_info.process = process; + wac_info.mode = HSA_DBG_WAVEMODE_BROADCAST_PROCESS; + wac_info.operand = HSA_DBG_WAVEOP_KILL; + wac_info.trapId = 0x0; /* not used for the KILL */ + wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = 0; /* not used for kill */ + wac_info.dbgWave_msg.MemoryVA = NULL; /* not used for kill */ + + status = (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, &wac_info); + + if (status != 0) { + dev_info(NULL, "Error! kfd: In func %s: wave control failed, status is: %ld\n", __func__, status); + break; + } + if (pmgr->pasid == wac_info.process->pasid) { + /* if terminated process was registered for debug, then unregister it */ + status = kfd_dbgmgr_unregister(pmgr, process); + pmgr->pasid = 0; + } + if (status != 0) + dev_info(NULL, + "Error! 
kfd: In func %s: unregister failed, status is: %ld debugger can not be reused\n", + __func__, status); + + } while (false); + + return status; + +} + + +/*///////////////////////////////////////////////////////////////////////////////////////// */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h index 257a745ad0b5..2b6484ee8d16 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h @@ -26,252 +26,242 @@ #include "kfd_priv.h" -/* must align with hsakmttypes definition */ +/* + * SQ_IND_CMD_CMD enum + */ + + +/* must align with hsakmttypes definition. */ #pragma pack(push, 4) -enum HSA_DBG_WAVEOP { - HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ - HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ - HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ - HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter - debug mode */ - HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take - a trap */ +typedef enum _HSA_DBG_WAVEOP { + HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ + HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ + HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ + HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter debug mode */ + HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take a trap */ HSA_DBG_NUM_WAVEOP = 5, HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF -}; +} HSA_DBG_WAVEOP; -enum HSA_DBG_WAVEMODE { - /* send command to a single wave */ - HSA_DBG_WAVEMODE_SINGLE = 0, - /* - * Broadcast to all wavefronts of all processes is not - * supported for HSA user mode - */ - - /* send to waves within current process */ - HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, - /* send to waves within current process on CU */ - HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, +typedef enum _HSA_DBG_WAVEMODE { + HSA_DBG_WAVEMODE_SINGLE = 0, /* send command to a single wave */ + /* Broadcast to all wavefronts of all processes is not supported for HSA user mode */ + HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, /* send to waves within current process */ + HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, /* send to waves within current process on CU */ HSA_DBG_NUM_WAVEMODE = 3, HSA_DBG_MAX_WAVEMODE = 0xFFFFFFFF -}; +} HSA_DBG_WAVEMODE; -enum HSA_DBG_WAVEMSG_TYPE { +typedef enum _HSA_DBG_WAVEMSG_TYPE { HSA_DBG_WAVEMSG_AUTO = 0, HSA_DBG_WAVEMSG_USER = 1, HSA_DBG_WAVEMSG_ERROR = 2, HSA_DBG_NUM_WAVEMSG, HSA_DBG_MAX_WAVEMSG = 0xFFFFFFFF -}; +} HSA_DBG_WAVEMSG_TYPE; -enum HSA_DBG_WATCH_MODE { - HSA_DBG_WATCH_READ = 0, /* Read operations only */ - HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */ - HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */ - HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */ +typedef enum _HSA_DBG_WATCH_MODE { + HSA_DBG_WATCH_READ = 0, /* Read operations only */ + HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */ + HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */ + HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */ HSA_DBG_WATCH_NUM, HSA_DBG_WATCH_SIZE = 0xFFFFFFFF -}; +} HSA_DBG_WATCH_MODE; /* This structure is hardware specific and may change in the future */ -struct HsaDbgWaveMsgAMDGen2 { +typedef struct _HsaDbgWaveMsgAMDGen2 { union { - struct ui32 { - uint32_t UserData:8; /* user data */ - uint32_t ShaderArray:1; /* Shader array */ - uint32_t Priv:1; /* Privileged */ - uint32_t Reserved0:4; /* This field is reserved, - should be 0 */ - uint32_t WaveId:4; /* wave id */ - uint32_t SIMD:2; /* SIMD id */ - uint32_t HSACU:4; /* Compute unit */ - uint32_t 
ShaderEngine:2;/* Shader engine */ - uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ - uint32_t Reserved1:4; /* This field is reserved, - should be 0 */ + struct { + uint32_t UserData:8; /* user data */ + uint32_t ShaderArray:1; /* Shader array */ + uint32_t Priv:1; /* Privileged */ + uint32_t Reserved0:4; /* This field is reserved, should be 0 */ + uint32_t WaveId:4; /* wave id */ + uint32_t SIMD:2; /* SIMD id */ + uint32_t HSACU:4; /* Compute unit */ + uint32_t ShaderEngine:2; /* Shader engine */ + uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ + uint32_t Reserved1:4; /* This field is reserved, should be 0 */ } ui32; uint32_t Value; }; - uint32_t Reserved2; -}; -union HsaDbgWaveMessageAMD { - struct HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2; - /* for future HsaDbgWaveMsgAMDGen3; */ -}; - -struct HsaDbgWaveMessage { - void *MemoryVA; /* ptr to associated host-accessible data */ - union HsaDbgWaveMessageAMD DbgWaveMsg; -}; + uint32_t Reserved2; -/* - * TODO: This definitions to be MOVED to kfd_event, once it is implemented. - * - * HSA sync primitive, Event and HW Exception notification API definitions. - * The API functions allow the runtime to define a so-called sync-primitive, - * a SW object combining a user-mode provided "syncvar" and a scheduler event - * that can be signaled through a defined GPU interrupt. A syncvar is - * a process virtual memory location of a certain size that can be accessed - * by CPU and GPU shader code within the process to set and query the content - * within that memory. The definition of the content is determined by the HSA - * runtime and potentially GPU shader code interfacing with the HSA runtime. - * The syncvar values may be commonly written through an PM4 WRITE_DATA packet - * in the user mode instruction stream. The OS scheduler event is typically - * associated and signaled by an interrupt issued by the GPU, but other HSA - * system interrupt conditions from other HW (e.g. IOMMUv2) may be surfaced - * by the KFD by this mechanism, too. */ - -/* these are the new definitions for events */ -enum HSA_EVENTTYPE { - HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */ - HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ - HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change - (start/stop) */ - HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ - HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ - HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ - HSA_EVENTTYPE_PROFILE_EVENT = 6,/* GPU signal for profiling */ - HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state - (EOP pm4) */ +} HsaDbgWaveMsgAMDGen2; + +typedef union _HsaDbgWaveMessageAMD { + HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2; + /* for future HsaDbgWaveMsgAMDGen3; */ +} HsaDbgWaveMessageAMD; + +typedef struct _HsaDbgWaveMessage { + void *MemoryVA; /* ptr to associated host-accessible data */ + HsaDbgWaveMessageAMD DbgWaveMsg; +} HsaDbgWaveMessage; + +/* TODO: This definitions to be MOVED to kfd_event, once it is implemented. + + HSA sync primitive, Event and HW Exception notification API definitions + The API functions allow the runtime to define a so-called sync-primitive, a SW object + combining a user-mode provided "syncvar" and a scheduler event that can be signaled + through a defined GPU interrupt. A syncvar is a process virtual memory location of + a certain size that can be accessed by CPU and GPU shader code within the process to set + and query the content within that memory. 
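
The Gen2 wave message above packs the reporting wave's coordinates into a single dword; the wave-control paths earlier in this diff read SIMD, WaveId, HSACU, ShaderArray and ShaderEngine straight out of these bitfields. A small decode sketch using the same field order (bitfield packing is compiler and ABI dependent; the driver relies on the layout the hardware and the thunk agree on):

#include <stdint.h>
#include <stdio.h>

struct wave_msg_gen2_view {		/* mirrors the ui32 bitfields above */
	uint32_t UserData:8;
	uint32_t ShaderArray:1;
	uint32_t Priv:1;
	uint32_t Reserved0:4;
	uint32_t WaveId:4;
	uint32_t SIMD:2;
	uint32_t HSACU:4;
	uint32_t ShaderEngine:2;
	uint32_t MessageType:2;
	uint32_t Reserved1:4;
};

union wave_msg_gen2 {
	struct wave_msg_gen2_view ui32;
	uint32_t Value;
};

/* Print where a raw 32-bit wave message came from. */
static void dump_wave_msg(uint32_t raw)
{
	union wave_msg_gen2 m = { .Value = raw };

	printf("SE%u SH%u CU%u SIMD%u wave%u type%u\n",
	       m.ui32.ShaderEngine, m.ui32.ShaderArray, m.ui32.HSACU,
	       m.ui32.SIMD, m.ui32.WaveId, m.ui32.MessageType);
}
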
The definition of the content is determined by + the HSA runtime and potentially GPU shader code interfacing with the HSA runtime. + The syncvar values may be commonly written through an PM4 WRITE_DATA packet in the + user mode instruction stream. The OS scheduler event is typically associated and + signaled by an interrupt issued by the GPU, but other HSA system interrupt conditions + from other HW (e.g. IOMMUv2) may besurfaced by the KFD by this mechanism, too. */ + +/* these are the new definitions for events */ + +typedef enum _HSA_EVENTTYPE { + HSA_EVENTTYPE_SIGNAL = 0, /* /user-mode generated GPU signal */ + HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ + HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change( start/stop ) */ + HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ + HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ + HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ + HSA_EVENTTYPE_PROFILE_EVENT = 6, /* GPU signal for profiling */ + HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state (EOP pm4) */ /* ... */ HSA_EVENTTYPE_MAXID, HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF -}; +} HSA_EVENTTYPE; + +typedef uint32_t HSA_EVENTID; -/* Sub-definitions for various event types: Syncvar */ -struct HsaSyncVar { - union SyncVar { - void *UserData; /* pointer to user mode data */ - uint64_t UserDataPtrValue; /* 64bit compatibility of value */ +/* Subdefinitions for various event types: Syncvar */ + +typedef struct _HsaSyncVar { + union { + void *UserData; /* pointer to user mode data */ + uint64_t UserDataPtrValue; /* 64bit compatibility of value */ } SyncVar; uint64_t SyncVarSize; -}; +} HsaSyncVar; -/* Sub-definitions for various event types: NodeChange */ +/* + Subdefinitions for various event types: NodeChange +*/ -enum HSA_EVENTTYPE_NODECHANGE_FLAGS { +typedef enum _HSA_EVENTTYPE_NODECHANGE_FLAGS { HSA_EVENTTYPE_NODECHANGE_ADD = 0, HSA_EVENTTYPE_NODECHANGE_REMOVE = 1, HSA_EVENTTYPE_NODECHANGE_SIZE = 0xFFFFFFFF -}; +} HSA_EVENTTYPE_NODECHANGE_FLAGS; -struct HsaNodeChange { - /* HSA node added/removed on the platform */ - enum HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; -}; +typedef struct _HsaNodeChange { + HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; /* HSA node added/removed on the platform */ +} HsaNodeChange; + +/* + Sub-definitions for various event types: DeviceStateChange +*/ -/* Sub-definitions for various event types: DeviceStateChange */ -enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS { - /* device started (and available) */ - HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, - /* device stopped (i.e. unavailable) */ - HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, +typedef enum _HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS { + HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, /* device started (and available) */ + HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, /* device stopped (i.e. 
unavailable) */ HSA_EVENTTYPE_DEVICESTATUSCHANGE_SIZE = 0xFFFFFFFF -}; +} HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS; -enum HSA_DEVICE { +typedef enum _HSA_DEVICE { HSA_DEVICE_CPU = 0, HSA_DEVICE_GPU = 1, MAX_HSA_DEVICE = 2 -}; +} HSA_DEVICE; -struct HsaDeviceStateChange { +typedef struct _HsaDeviceStateChange { uint32_t NodeId; /* F-NUMA node that contains the device */ - enum HSA_DEVICE Device; /* device type: GPU or CPU */ - enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */ -}; + HSA_DEVICE Device; /* device type: GPU or CPU */ + HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */ +} HsaDeviceStateChange; -struct HsaEventData { - enum HSA_EVENTTYPE EventType; /* event type */ - union EventData { - /* - * return data associated with HSA_EVENTTYPE_SIGNAL - * and other events - */ - struct HsaSyncVar SyncVar; +typedef struct _HsaEventData { + HSA_EVENTTYPE EventType; /* event type */ + union { + /* return data associated with HSA_EVENTTYPE_SIGNAL and other events */ + HsaSyncVar SyncVar; /* data associated with HSA_EVENTTYPE_NODE_CHANGE */ - struct HsaNodeChange NodeChangeState; + HsaNodeChange NodeChangeState; /* data associated with HSA_EVENTTYPE_DEVICE_STATE_CHANGE */ - struct HsaDeviceStateChange DeviceState; + HsaDeviceStateChange DeviceState; } EventData; - /* the following data entries are internal to the KFD & thunk itself */ + /* the following data entries are internal to the KFD & thunk itself. */ - /* internal thunk store for Event data (OsEventHandle) */ - uint64_t HWData1; - /* internal thunk store for Event data (HWAddress) */ - uint64_t HWData2; - /* internal thunk store for Event data (HWData) */ - uint32_t HWData3; -}; + uint64_t HWData1; /* internal thunk store for Event data (OsEventHandle) */ + uint64_t HWData2; /* internal thunk store for Event data (HWAddress) */ + uint32_t HWData3; /* internal thunk store for Event data (HWData) */ +} HsaEventData; -struct HsaEventDescriptor { - /* event type to allocate */ - enum HSA_EVENTTYPE EventType; - /* H-NUMA node containing GPU device that is event source */ - uint32_t NodeId; - /* pointer to user mode syncvar data, syncvar->UserDataPtrValue - * may be NULL - */ - struct HsaSyncVar SyncVar; -}; +typedef struct _HsaEventDescriptor { + HSA_EVENTTYPE EventType; /* event type to allocate */ + uint32_t NodeId; /* H-NUMA node containing GPU device that is event source */ + HsaSyncVar SyncVar; /* pointer to user mode syncvar data, syncvar->UserDataPtrValue may be NULL */ +} HsaEventDescriptor; + +typedef struct _HsaEvent { + HSA_EVENTID EventId; + HsaEventData EventData; +} HsaEvent; -struct HsaEvent { - uint32_t EventId; - struct HsaEventData EventData; -}; #pragma pack(pop) -enum DBGDEV_TYPE { +typedef enum _DBGDEV_TYPE { DBGDEV_TYPE_ILLEGAL = 0, DBGDEV_TYPE_NODIQ = 1, DBGDEV_TYPE_DIQ = 2, DBGDEV_TYPE_TEST = 3 -}; +} DBGDEV_TYPE; struct dbg_address_watch_info { struct kfd_process *process; - enum HSA_DBG_WATCH_MODE *watch_mode; + HSA_DBG_WATCH_MODE *watch_mode; uint64_t *watch_address; uint64_t *watch_mask; - struct HsaEvent *watch_event; + HsaEvent *watch_event; uint32_t num_watch_points; }; struct dbg_wave_control_info { struct kfd_process *process; uint32_t trapId; - enum HSA_DBG_WAVEOP operand; - enum HSA_DBG_WAVEMODE mode; - struct HsaDbgWaveMessage dbgWave_msg; + HSA_DBG_WAVEOP operand; + HSA_DBG_WAVEMODE mode; + HsaDbgWaveMessage dbgWave_msg; }; struct kfd_dbgdev { /* The device that owns this data. 
*/ + struct kfd_dev *dev; /* kernel queue for DIQ */ + struct kernel_queue *kq; /* a pointer to the pqm of the calling process */ + struct process_queue_manager *pqm; /* type of debug device ( DIQ, non DIQ, etc. ) */ - enum DBGDEV_TYPE type; + + DBGDEV_TYPE type; /* virtualized function pointers to device dbg */ + int (*dbgdev_register)(struct kfd_dbgdev *dbgdev); int (*dbgdev_unregister)(struct kfd_dbgdev *dbgdev); - int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, - struct dbg_address_watch_info *adw_info); - int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, - struct dbg_wave_control_info *wac_info); + int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, struct dbg_address_watch_info *adw_info); + int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, struct dbg_wave_control_info *wac_info); }; @@ -282,13 +272,12 @@ struct kfd_dbgmgr { }; /* prototypes for debug manager functions */ -struct mutex *kfd_get_dbgmgr_mutex(void); +struct mutex *get_dbgmgr_mutex(void); void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr); bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev); long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p); long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p); -long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, - struct dbg_wave_control_info *wac_info); -long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, - struct dbg_address_watch_info *adw_info); -#endif /* KFD_DBGMGR_H_ */ +long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, struct dbg_wave_control_info *wac_info); +long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, struct dbg_address_watch_info *adw_info); +long kfd_dbgmgr_abnormal_termination(struct kfd_dbgmgr *pmgr, struct kfd_process *process); +#endif /* KFD_DBGMGR_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 3f95f7cb4019..20592baeaf95 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -24,9 +24,11 @@ #include <linux/bsearch.h> #include <linux/pci.h> #include <linux/slab.h> +#include <linux/highmem.h> #include "kfd_priv.h" #include "kfd_device_queue_manager.h" #include "kfd_pm4_headers.h" +#include "cwsr_trap_handler_carrizo.h" #define MQD_SIZE_ALIGNED 768 @@ -38,7 +40,8 @@ static const struct kfd_device_info kaveri_device_info = { .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .is_need_iommu_device = true }; static const struct kfd_device_info carrizo_device_info = { @@ -49,14 +52,50 @@ static const struct kfd_device_info carrizo_device_info = { .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .is_need_iommu_device = true }; +static const struct kfd_device_info tonga_device_info = { + .asic_family = CHIP_TONGA, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .is_need_iommu_device = false +}; + +static const struct kfd_device_info fiji_device_info = { + .asic_family = CHIP_FIJI, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = 
&event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .is_need_iommu_device = false +} +; struct kfd_deviceid { unsigned short did; const struct kfd_device_info *device_info; }; +/* + * // +// TONGA/AMETHYST device IDs (performance segment) +// +#define DEVICE_ID_VI_TONGA_P_6920 0x6920 // unfused +#define DEVICE_ID_VI_TONGA_P_6921 0x6921 // Amethyst XT +#define DEVICE_ID_VI_TONGA_P_6928 0x6928 // Tonga GL XT +#define DEVICE_ID_VI_TONGA_P_692B 0x692B // Tonga GL PRO +#define DEVICE_ID_VI_TONGA_P_692F 0x692F // Tonga GL PRO VF +#define DEVICE_ID_VI_TONGA_P_6938 0x6938 // Tonga XT +#define DEVICE_ID_VI_TONGA_P_6939 0x6939 // Tonga PRO + * + */ /* Please keep this sorted by increasing device id. */ static const struct kfd_deviceid supported_devices[] = { { 0x1304, &kaveri_device_info }, /* Kaveri */ @@ -85,13 +124,23 @@ static const struct kfd_deviceid supported_devices[] = { { 0x9874, &carrizo_device_info }, /* Carrizo */ { 0x9875, &carrizo_device_info }, /* Carrizo */ { 0x9876, &carrizo_device_info }, /* Carrizo */ - { 0x9877, &carrizo_device_info } /* Carrizo */ + { 0x9877, &carrizo_device_info }, /* Carrizo */ + { 0x6920, &tonga_device_info }, /* Tonga */ + { 0x6921, &tonga_device_info }, /* Tonga */ + { 0x6928, &tonga_device_info }, /* Tonga */ + { 0x692B, &tonga_device_info }, /* Tonga */ + { 0x692F, &tonga_device_info }, /* Tonga */ + { 0x6938, &tonga_device_info }, /* Tonga */ + { 0x6939, &tonga_device_info }, /* Tonga */ + { 0x7300, &fiji_device_info } /* Fiji */ }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, unsigned int chunk_size); static void kfd_gtt_sa_fini(struct kfd_dev *kfd); +static int kfd_resume(struct kfd_dev *kfd); + static const struct kfd_device_info *lookup_device_info(unsigned short did) { size_t i; @@ -117,6 +166,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, if (!device_info) return NULL; + BUG_ON(!f2g); + kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); if (!kfd) return NULL; @@ -170,15 +221,8 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd) pasid_limit, kfd->doorbell_process_limit - 1); - err = amd_iommu_init_device(kfd->pdev, pasid_limit); - if (err < 0) { - dev_err(kfd_device, "error initializing iommu device\n"); - return false; - } - if (!kfd_set_pasid_limit(pasid_limit)) { dev_err(kfd_device, "error setting pasid limit\n"); - amd_iommu_free_device(kfd->pdev); return false; } @@ -219,13 +263,81 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, return AMD_IOMMU_INV_PRI_RSP_INVALID; } +static int kfd_cwsr_init(struct kfd_dev *kfd) +{ + /* + * Initialize the CWSR required memory for TBA and TMA + * only support CWSR on VI and up with FW version >=625. 
+ */ + if (cwsr_enable && + (kfd->mec_fw_version >= KFD_CWSR_CZ_FW_VER)) { + void *cwsr_addr = NULL; + unsigned int size = sizeof(cwsr_trap_carrizo_hex); + + if (size > PAGE_SIZE) { + pr_err("amdkfd: wrong CWSR ISA size.\n"); + return -EINVAL; + } + kfd->cwsr_size = + ALIGN(size, PAGE_SIZE) + PAGE_SIZE; + kfd->cwsr_pages = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, + get_order(kfd->cwsr_size)); + if (!kfd->cwsr_pages) { + pr_err("amdkfd: error alloc CWSR isa memory.\n"); + return -ENOMEM; + } + /*Only first page used for cwsr ISA code */ + cwsr_addr = kmap(kfd->cwsr_pages); + memset(cwsr_addr, 0, PAGE_SIZE); + memcpy(cwsr_addr, cwsr_trap_carrizo_hex, size); + kunmap(kfd->cwsr_pages); + kfd->tma_offset = ALIGN(size, PAGE_SIZE); + kfd->cwsr_enabled = true; + dev_info(kfd_device, + "Reserved %d pages for cwsr.\n", + (kfd->cwsr_size >> PAGE_SHIFT)); + } + + return 0; +} + +static void kfd_cwsr_fini(struct kfd_dev *kfd) +{ + if (kfd->cwsr_pages) + __free_pages(kfd->cwsr_pages, get_order(kfd->cwsr_size)); +} + bool kgd2kfd_device_init(struct kfd_dev *kfd, const struct kgd2kfd_shared_resources *gpu_resources) { unsigned int size; + unsigned int vmid_bitmap_kfd, vmid_num_kfd; + + kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd, + KGD_ENGINE_MEC1); kfd->shared_resources = *gpu_resources; + vmid_bitmap_kfd = kfd->shared_resources.compute_vmid_bitmap; + kfd->vm_info.first_vmid_kfd = ffs(vmid_bitmap_kfd) - 1; + kfd->vm_info.last_vmid_kfd = fls(vmid_bitmap_kfd) - 1; + vmid_num_kfd = kfd->vm_info.last_vmid_kfd + - kfd->vm_info.first_vmid_kfd + 1; + kfd->vm_info.vmid_num_kfd = vmid_num_kfd; + + /* If MEC firmware is too old, turn off hws multiple process mapping */ + if (kfd->mec_fw_version < KFD_MULTI_PROC_MAPPING_HWS_SUPPORT) + kfd->max_proc_per_quantum = 0; + /* Verify module parameters regarding mapped process number*/ + else if ((hws_max_conc_proc < 0) + || (hws_max_conc_proc > vmid_num_kfd)) { + dev_err(kfd_device, + "hws_max_conc_proc (%d) must be between 0 and %d, use %d instead\n", + hws_max_conc_proc, vmid_num_kfd, vmid_num_kfd); + kfd->max_proc_per_quantum = vmid_num_kfd; + } else + kfd->max_proc_per_quantum = hws_max_conc_proc; + /* calculate max size of mqds needed for queues */ size = max_num_of_queues_per_device * kfd->device_info->mqd_size_aligned; @@ -280,16 +392,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, goto kfd_interrupt_error; } - if (!device_iommu_pasid_init(kfd)) { - dev_err(kfd_device, - "Error initializing iommuv2 for device (%x:%x)\n", - kfd->pdev->vendor, kfd->pdev->device); - goto device_iommu_pasid_error; - } - amd_iommu_set_invalidate_ctx_cb(kfd->pdev, - iommu_pasid_shutdown_callback); - amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); - kfd->dqm = device_queue_manager_init(kfd); if (!kfd->dqm) { dev_err(kfd_device, @@ -298,13 +400,21 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, goto device_queue_manager_error; } - if (kfd->dqm->ops.start(kfd->dqm) != 0) { - dev_err(kfd_device, - "Error starting queuen manager for device (%x:%x)\n", - kfd->pdev->vendor, kfd->pdev->device); - goto dqm_start_error; + if (kfd->device_info->is_need_iommu_device) { + if (!device_iommu_pasid_init(kfd)) { + dev_err(kfd_device, + "Error initializing iommuv2 for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto device_iommu_pasid_error; + } } + if (kfd_cwsr_init(kfd)) + goto device_iommu_pasid_error; + + if (kfd_resume(kfd)) + goto kfd_resume_error; + kfd->dbgmgr = NULL; kfd->init_complete = true; @@ -316,11 +426,11 @@ bool 
kgd2kfd_device_init(struct kfd_dev *kfd, goto out; -dqm_start_error: +kfd_resume_error: + kfd_cwsr_fini(kfd); +device_iommu_pasid_error: device_queue_manager_uninit(kfd->dqm); device_queue_manager_error: - amd_iommu_free_device(kfd->pdev); -device_iommu_pasid_error: kfd_interrupt_exit(kfd); kfd_interrupt_error: kfd_topology_remove_device(kfd); @@ -338,8 +448,9 @@ out: void kgd2kfd_device_exit(struct kfd_dev *kfd) { if (kfd->init_complete) { + kgd2kfd_suspend(kfd); + kfd_cwsr_fini(kfd); device_queue_manager_uninit(kfd->dqm); - amd_iommu_free_device(kfd->pdev); kfd_interrupt_exit(kfd); kfd_topology_remove_device(kfd); kfd_gtt_sa_fini(kfd); @@ -355,32 +466,68 @@ void kgd2kfd_suspend(struct kfd_dev *kfd) if (kfd->init_complete) { kfd->dqm->ops.stop(kfd->dqm); - amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); - amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); - amd_iommu_free_device(kfd->pdev); + if (kfd->device_info->is_need_iommu_device) { + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); + amd_iommu_free_device(kfd->pdev); + } } } -int kgd2kfd_resume(struct kfd_dev *kfd) +int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem) { - unsigned int pasid_limit; - int err; + return evict_bo(dev, mem); +} +int kgd2kfd_restore(struct kfd_dev *kfd) +{ + return restore(kfd); +} + +int kgd2kfd_resume(struct kfd_dev *kfd) +{ BUG_ON(kfd == NULL); - pasid_limit = kfd_get_pasid_limit(); + if (!kfd->init_complete) + return 0; + + return kfd_resume(kfd); + +} + +static int kfd_resume(struct kfd_dev *kfd) +{ + int err = 0; + + if (kfd->device_info->is_need_iommu_device) { + unsigned int pasid_limit = kfd_get_pasid_limit(); - if (kfd->init_complete) { err = amd_iommu_init_device(kfd->pdev, pasid_limit); - if (err < 0) + if (err) return -ENXIO; amd_iommu_set_invalidate_ctx_cb(kfd->pdev, - iommu_pasid_shutdown_callback); - amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); - kfd->dqm->ops.start(kfd->dqm); + iommu_pasid_shutdown_callback); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, + iommu_invalid_ppr_cb); } - return 0; + err = kfd->dqm->ops.start(kfd->dqm); + if (err) { + dev_err(kfd_device, + "Error starting queue manager for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto dqm_start_error; + } + + kfd->kfd2kgd->write_config_static_mem(kfd->kgd, true, 1, 3, 0); + + return err; + +dqm_start_error: + if (kfd->device_info->is_need_iommu_device) + amd_iommu_free_device(kfd->pdev); + + return err; } /* This is called directly from KGD at ISR. */ @@ -399,6 +546,58 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) spin_unlock(&kfd->interrupt_lock); } +int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm) +{ + struct kfd_process *p; + struct kfd_process_device *pdd; + int r; + + BUG_ON(kfd == NULL); + if (!kfd->init_complete) + return 0; + + /* Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function returns a read-locked process. 
*/ + p = kfd_lookup_process_by_mm(mm); + if (!p) + return -ENODEV; + + r = -ENODEV; + pdd = kfd_get_process_device_data(kfd, p); + if (pdd) + r = process_evict_queues(kfd->dqm, &pdd->qpd); + + up_read(&p->lock); + return r; +} + +int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm) +{ + struct kfd_process *p; + struct kfd_process_device *pdd; + int r; + + BUG_ON(kfd == NULL); + if (!kfd->init_complete) + return 0; + + /* Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function returns a read-locked process. */ + p = kfd_lookup_process_by_mm(mm); + if (!p) + return -ENODEV; + + r = -ENODEV; + pdd = kfd_get_process_device_data(kfd, p); + if (pdd) + r = process_restore_queues(kfd->dqm, &pdd->qpd); + + up_read(&p->lock); + return r; +} + static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, unsigned int chunk_size) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f49c551195b3..78033c13d2ed 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -44,9 +44,10 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock); -static int destroy_queues_cpsch(struct device_queue_manager *dqm, - bool preempt_static_queues, bool lock); +static int execute_queues_cpsch(struct device_queue_manager *dqm); +static int unmap_queues_cpsch(struct device_queue_manager *dqm, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset); static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, @@ -100,11 +101,11 @@ static int allocate_vmid(struct device_queue_manager *dqm, if (dqm->vmid_bitmap == 0) return -ENOMEM; - bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM); + bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, + dqm->dev->vm_info.vmid_num_kfd); clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap); - /* Kaveri kfd vmid's starts from vmid 8 */ - allocated_vmid = bit + KFD_VMID_START_OFFSET; + allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd; pr_debug("kfd: vmid allocation %d\n", allocated_vmid); qpd->vmid = allocated_vmid; q->properties.vmid = allocated_vmid; @@ -112,6 +113,11 @@ static int allocate_vmid(struct device_queue_manager *dqm, set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); program_sh_mem_settings(dqm, qpd); + dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd, + allocated_vmid, + qpd->page_table_base); + /*invalidate the VM context after pasid and vmid mapping is set up*/ + radeon_flush_tlb(dqm->dev, qpd->pqm->process->pasid); return 0; } @@ -119,7 +125,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) { - int bit = qpd->vmid - KFD_VMID_START_OFFSET; + int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd; /* Release the vmid mapping */ set_pasid_vmid_mapping(dqm, 0, qpd->vmid); @@ -159,6 +165,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, } *allocated_vmid = qpd->vmid; q->properties.vmid = qpd->vmid; + /* + * Eviction state logic: we only mark active queues as evicted + * to avoid the overhead of restoring inactive queues later + */ + if (qpd->evicted) + 
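
allocate_vmid() above hands out hardware VMIDs from a small bitmap and offsets the bit index by first_vmid_kfd; deallocate_vmid() returns the bit. After allocating, the hunk also maps PASID to VMID, points the VM context at the process page table and flushes the TLB so the new VMID never runs with stale translations. A userspace-flavoured sketch of the allocator itself (the driver uses find_first_bit()/clear_bit() on dqm->vmid_bitmap):

#include <stdint.h>

/* Hypothetical mini-allocator mirroring allocate_vmid()/deallocate_vmid(). */
struct vmid_alloc {
	uint32_t bitmap;		/* bit i set => VMID (first_vmid + i) is free */
	unsigned int first_vmid;	/* dqm->dev->vm_info.first_vmid_kfd */
};

static int vmid_alloc(struct vmid_alloc *a)
{
	int bit;

	if (a->bitmap == 0)
		return -1;			/* -ENOMEM in the driver */
	bit = __builtin_ffs((int)a->bitmap) - 1;	/* find_first_bit() */
	a->bitmap &= ~(1u << bit);		/* clear_bit() */
	return (int)(a->first_vmid + (unsigned int)bit);
}

static void vmid_free(struct vmid_alloc *a, int vmid)
{
	a->bitmap |= 1u << ((unsigned int)vmid - a->first_vmid);	/* set_bit() */
}
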
q->properties.is_evicted = (q->properties.queue_size > 0 && + q->properties.queue_percent > 0 && + q->properties.queue_address != 0); if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) retval = create_compute_queue_nocpsch(dqm, q, qpd); @@ -261,8 +275,12 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, q->pipe, q->queue); + dqm->dev->kfd2kgd->alloc_memory_of_scratch( + dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, - q->queue, (uint32_t __user *) q->properties.write_ptr); + q->queue, (uint32_t __user *) q->properties.write_ptr, + qpd->page_table_base); if (retval != 0) { deallocate_hqd(dqm, q); mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); @@ -342,34 +360,56 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) { int retval; struct mqd_manager *mqd; + struct kfd_process_device *pdd; + bool prev_active = false; BUG_ON(!dqm || !q || !q->mqd); mutex_lock(&dqm->lock); + + pdd = kfd_get_process_device_data(q->device, q->process); + if (!pdd) { + mutex_unlock(&dqm->lock); + return -ENODEV; + } mqd = dqm->ops.get_mqd_manager(dqm, get_mqd_type_from_queue_type(q->properties.type)); if (mqd == NULL) { mutex_unlock(&dqm->lock); return -ENOMEM; } + /* + * Eviction state logic: we only mark active queues as evicted + * to avoid the overhead of restoring inactive queues later + */ + if (pdd->qpd.evicted > 0) + q->properties.is_evicted = (q->properties.queue_size > 0 && + q->properties.queue_percent > 0 && + q->properties.queue_address != 0); + /* save previous activity state for counters */ if (q->properties.is_active) prev_active = true; - /* - * - * check active state vs. the previous state - * and modify counter accordingly - */ + retval = mqd->update_mqd(mqd, q->mqd, &q->properties); + if (sched_policy == KFD_SCHED_POLICY_NO_HWS && + q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, + q->queue, + (uint32_t __user *)q->properties.write_ptr, 0); + /* + * check active state vs. 
the previous state + * and modify counter accordingly + */ if ((q->properties.is_active) && (!prev_active)) dqm->queue_count++; else if ((!q->properties.is_active) && (prev_active)) dqm->queue_count--; if (sched_policy != KFD_SCHED_POLICY_NO_HWS) - retval = execute_queues_cpsch(dqm, false); + retval = execute_queues_cpsch(dqm); mutex_unlock(&dqm->lock); return retval; @@ -395,15 +435,115 @@ static struct mqd_manager *get_mqd_manager_nocpsch( return mqd; } +int process_evict_queues(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct queue *q, *next; + struct mqd_manager *mqd; + int retval = 0; + + BUG_ON(!dqm || !qpd); + + mutex_lock(&dqm->lock); + if (qpd->evicted++ > 0) { /* already evicted, do nothing */ + mutex_unlock(&dqm->lock); + return 0; + } + /* unactivate all active queues on the qpd */ + list_for_each_entry_safe(q, next, &qpd->queues_list, list) { + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd) { /* should not be here */ + BUG(); + continue; + } + /* if the queue is not active anyway, it is not evicted */ + if (q->properties.is_active == true) + q->properties.is_evicted = true; + + retval = mqd->update_mqd(mqd, q->mqd, &q->properties); + if (sched_policy == KFD_SCHED_POLICY_NO_HWS && + q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, + q->queue, + (uint32_t __user *)q->properties.write_ptr, 0); + if (q->properties.is_evicted) + dqm->queue_count--; + } + if (sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = execute_queues_cpsch(dqm); + + mutex_unlock(&dqm->lock); + return retval; + +} + +int process_restore_queues(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct queue *q, *next; + struct mqd_manager *mqd; + int retval = 0; + + + BUG_ON(!dqm || !qpd); + + mutex_lock(&dqm->lock); + if (qpd->evicted == 0) { /* already restored, do nothing */ + mutex_unlock(&dqm->lock); + return 0; + } + + if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ + qpd->evicted--; + mutex_unlock(&dqm->lock); + return 0; + } + + /* activate all active queues on the qpd */ + list_for_each_entry_safe(q, next, &qpd->queues_list, list) { + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd) { /* should not be here */ + BUG(); + continue; + } + if (q->properties.is_evicted) { + q->properties.is_evicted = false; + retval = mqd->update_mqd(mqd, q->mqd, &q->properties); + if (sched_policy == KFD_SCHED_POLICY_NO_HWS && + q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + retval = + mqd->load_mqd( + mqd, + q->mqd, + q->pipe, + q->queue, + (uint32_t __user *)q->properties.write_ptr, + 0); + dqm->queue_count++; + } + } + if (sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = execute_queues_cpsch(dqm); + + if (retval == 0) + qpd->evicted = 0; + mutex_unlock(&dqm->lock); + return retval; + +} + static int register_process_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { + struct kfd_process_device *pdd; struct device_process_node *n; int retval; BUG_ON(!dqm || !qpd); - pr_debug("kfd: In func %s\n", __func__); + pr_debug("In func %s\n", __func__); n = kzalloc(sizeof(struct device_process_node), GFP_KERNEL); if (!n) @@ -414,6 +554,11 @@ static int register_process_nocpsch(struct device_queue_manager *dqm, mutex_lock(&dqm->lock); list_add(&n->list, &dqm->queues); + pdd = qpd_to_pdd(qpd); + qpd->page_table_base = + dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); + 
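The process_evict_queues()/process_restore_queues() pair introduced above is essentially a reference count on eviction: only the first evict takes the queues off the hardware, nested evicts just bump qpd->evicted, and only the last restore re-activates anything. A self-contained sketch of that counting scheme, where evict()/restore() and the on_hw flag stand in for the real MQD unmap/remap work:

#include <assert.h>

struct qpd_sketch {
	int evicted;		/* >0: queues are currently off the hardware */
	int on_hw;		/* stand-in for "queues mapped to HQDs" */
};

static void evict(struct qpd_sketch *q)
{
	if (q->evicted++ > 0)
		return;		/* already evicted, just take a reference */
	q->on_hw = 0;		/* would deactivate/unmap the queues here */
}

static void restore(struct qpd_sketch *q)
{
	assert(q->evicted > 0);
	if (--q->evicted > 0)
		return;		/* another holder still wants us evicted */
	q->on_hw = 1;		/* would reactivate the queues here */
}

int main(void)
{
	struct qpd_sketch q = { 0, 1 };

	evict(&q);		/* first evict takes queues off the hardware */
	evict(&q);		/* nested evict only bumps the count */
	restore(&q);
	assert(!q.on_hw);	/* still evicted: one reference remains */
	restore(&q);
	assert(q.on_hw);	/* last restore brings the queues back */
	return 0;
}
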
pr_debug("Retrieved PD address == 0x%08u\n", qpd->page_table_base); + retval = dqm->ops_asic_specific.register_process(dqm, qpd); dqm->processes_count++; @@ -531,10 +676,8 @@ static void init_interrupts(struct device_queue_manager *dqm) BUG_ON(dqm == NULL); for (i = 0 ; i < get_pipes_num(dqm) ; i++) - dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, - i + get_first_pipe(dqm)); + dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i); } - static int init_scheduler(struct device_queue_manager *dqm) { int retval; @@ -570,7 +713,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) for (i = 0; i < get_pipes_num(dqm); i++) dqm->allocated_queues[i] = (1 << QUEUES_PER_PIPE) - 1; - dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1; + dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; init_scheduler(dqm); @@ -643,8 +786,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, if (retval != 0) return retval; - q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; - q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM; + q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; + q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; pr_debug("kfd: sdma id is: %d\n", q->sdma_id); pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id); @@ -659,7 +802,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, } retval = mqd->load_mqd(mqd, q->mqd, 0, - 0, NULL); + 0, NULL, 0); if (retval != 0) { deallocate_sdma_queue(dqm, q->sdma_id); mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); @@ -684,8 +827,7 @@ static int set_sched_resources(struct device_queue_manager *dqm) queue_num = get_pipes_num_cpsch() * QUEUES_PER_PIPE; queue_mask = (1 << queue_num) - 1; - res.vmid_mask = (1 << VMID_PER_DEVICE) - 1; - res.vmid_mask <<= KFD_VMID_START_OFFSET; + res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap; res.queue_mask = queue_mask << (get_first_pipe(dqm) * QUEUES_PER_PIPE); res.gws_mask = res.oac_mask = res.gds_heap_base = res.gds_heap_size = 0; @@ -712,6 +854,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) dqm->queue_count = dqm->processes_count = 0; dqm->sdma_queue_count = 0; dqm->active_runlist = false; + dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; retval = dqm->ops_asic_specific.initialize(dqm); if (retval != 0) goto fail_init_pipelines; @@ -732,7 +875,7 @@ static int start_cpsch(struct device_queue_manager *dqm) retval = 0; - retval = pm_init(&dqm->packets, dqm); + retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); if (retval != 0) goto fail_packet_manager_init; @@ -759,7 +902,9 @@ static int start_cpsch(struct device_queue_manager *dqm) kfd_bind_process_to_device(dqm->dev, node->qpd->pqm->process); - execute_queues_cpsch(dqm, true); + mutex_lock(&dqm->lock); + execute_queues_cpsch(dqm); + mutex_unlock(&dqm->lock); return 0; fail_allocate_vidmem: @@ -776,7 +921,11 @@ static int stop_cpsch(struct device_queue_manager *dqm) BUG_ON(!dqm); - destroy_queues_cpsch(dqm, true, true); + mutex_lock(&dqm->lock); + + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false); + + mutex_unlock(&dqm->lock); list_for_each_entry(node, &dqm->queues, list) { pdd = qpd_to_pdd(node->qpd); @@ -815,7 +964,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, list_add(&kq->list, &qpd->priv_queue_list); dqm->queue_count++; qpd->is_debug = true; - execute_queues_cpsch(dqm, false); + 
execute_queues_cpsch(dqm); mutex_unlock(&dqm->lock); return 0; @@ -831,11 +980,11 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, mutex_lock(&dqm->lock); /* here we actually preempt the DIQ */ - destroy_queues_cpsch(dqm, true, false); + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false); list_del(&kq->list); dqm->queue_count--; qpd->is_debug = false; - execute_queues_cpsch(dqm, false); + execute_queues_cpsch(dqm); /* * Unconditionally decrement this counter, regardless of the queue's * type. @@ -846,14 +995,6 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, mutex_unlock(&dqm->lock); } -static void select_sdma_engine_id(struct queue *q) -{ - static int sdma_id; - - q->sdma_id = sdma_id; - sdma_id = (sdma_id + 1) % 2; -} - static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd, int *allocate_vmid) { @@ -876,9 +1017,15 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, goto out; } - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - select_sdma_engine_id(q); - + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + retval = allocate_sdma_queue(dqm, &q->sdma_id); + if (retval != 0) + goto out; + q->properties.sdma_queue_id = + q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; + q->properties.sdma_engine_id = + q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; + } mqd = dqm->ops.get_mqd_manager(dqm, get_mqd_type_from_queue_type(q->properties.type)); @@ -886,8 +1033,19 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, mutex_unlock(&dqm->lock); return -ENOMEM; } + /* + * Eviction state logic: we only mark active queues as evicted + * to avoid the overhead of restoring inactive queues later + */ + if (qpd->evicted) + q->properties.is_evicted = (q->properties.queue_size > 0 && + q->properties.queue_percent > 0 && + q->properties.queue_address != 0); dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); + + q->properties.tba_addr = qpd->tba_addr; + q->properties.tma_addr = qpd->tma_addr; retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); if (retval != 0) @@ -896,7 +1054,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, list_add(&q->list, &qpd->queues_list); if (q->properties.is_active) { dqm->queue_count++; - retval = execute_queues_cpsch(dqm, false); + retval = execute_queues_cpsch(dqm); } if (q->properties.type == KFD_QUEUE_TYPE_SDMA) @@ -933,20 +1091,20 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, return 0; } -static int destroy_sdma_queues(struct device_queue_manager *dqm, +static int unmap_sdma_queues(struct device_queue_manager *dqm, unsigned int sdma_engine) { return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, - KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, 0, false, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, sdma_engine); } -static int destroy_queues_cpsch(struct device_queue_manager *dqm, - bool preempt_static_queues, bool lock) +/* dqm->lock mutex has to be locked before calling this function */ +static int unmap_queues_cpsch(struct device_queue_manager *dqm, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset) { int retval; - enum kfd_preempt_type_filter preempt_type; - struct kfd_process_device *pdd; BUG_ON(!dqm); @@ -956,23 +1114,21 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm, mutex_lock(&dqm->lock); if (!dqm->active_runlist) goto out; + if (dqm->active_runlist == false) 
+ return retval; pr_debug("kfd: Before destroying queues, sdma queue count is : %u\n", dqm->sdma_queue_count); if (dqm->sdma_queue_count > 0) { - destroy_sdma_queues(dqm, 0); - destroy_sdma_queues(dqm, 1); + unmap_sdma_queues(dqm, 0); + unmap_sdma_queues(dqm, 1); } - preempt_type = preempt_static_queues ? - KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES : - KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES; - retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, - preempt_type, 0, false, 0); + filter, filter_param, reset, 0); if (retval != 0) - goto out; + return retval; *dqm->fence_addr = KFD_FENCE_INIT; pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr, @@ -981,55 +1137,47 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm, retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); if (retval != 0) { - pdd = kfd_get_process_device_data(dqm->dev, - kfd_get_process(current)); - pdd->reset_wavefronts = true; - goto out; + pr_err("kfd: unmapping queues failed."); + return retval; } + pm_release_ib(&dqm->packets); dqm->active_runlist = false; -out: - if (lock) - mutex_unlock(&dqm->lock); return retval; } -static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock) +/* dqm->lock mutex has to be locked before calling this function */ +static int execute_queues_cpsch(struct device_queue_manager *dqm) { int retval; BUG_ON(!dqm); - if (lock) - mutex_lock(&dqm->lock); - - retval = destroy_queues_cpsch(dqm, false, false); + retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, + 0, false); if (retval != 0) { pr_err("kfd: the cp might be in an unrecoverable state due to an unsuccessful queues preemption"); - goto out; + return retval; } if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { retval = 0; - goto out; + return retval; } if (dqm->active_runlist) { retval = 0; - goto out; + return retval; } retval = pm_send_runlist(&dqm->packets, &dqm->queues); if (retval != 0) { pr_err("kfd: failed to execute runlist"); - goto out; + return retval; } dqm->active_runlist = true; -out: - if (lock) - mutex_unlock(&dqm->lock); return retval; } @@ -1067,14 +1215,16 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, goto failed; } - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } list_del(&q->list); if (q->properties.is_active) dqm->queue_count--; - execute_queues_cpsch(dqm, false); + retval = execute_queues_cpsch(dqm); mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); @@ -1088,7 +1238,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, mutex_unlock(&dqm->lock); - return 0; + return retval; failed: failed_try_destroy_debugged_queue: @@ -1172,6 +1322,172 @@ out: return false; } +static int set_trap_handler(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + uint64_t tba_addr, + uint64_t tma_addr) +{ + uint64_t *tma; + + tma = (uint64_t *)(qpd->cwsr_kaddr + dqm->dev->tma_offset); + tma[0] = tba_addr; + tma[1] = tma_addr; + return 0; +} + + +static int set_page_directory_base(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + uint32_t pd_base; + int retval = 0; + + BUG_ON(!dqm || !qpd); + + mutex_lock(&dqm->lock); + + pdd = qpd_to_pdd(qpd); + + /* Retrieve PD base */ + pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); + + /* If it has not changed, just get out */ + if 
(qpd->page_table_base == pd_base) + goto out; + + /* Update PD Base in QPD */ + qpd->page_table_base = pd_base; + pr_debug("Updated PD address == 0x%08u\n", pd_base); + + /* + * Preempt queues, destroy runlist and create new runlist. Queues + * will have the update PD base address + */ + if (sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = execute_queues_cpsch(dqm); + +out: + mutex_unlock(&dqm->lock); + + return retval; +} + +static int process_termination_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct queue *q, *next; + struct mqd_manager *mqd; + struct device_process_node *cur, *next_dpn; + + mutex_lock(&dqm->lock); + + /* Clear all user mode queues */ + list_for_each_entry_safe(q, next, &qpd->queues_list, list) { + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd) { + mutex_unlock(&dqm->lock); + return -ENOMEM; + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } + + list_del(&q->list); + if (q->properties.is_active) + dqm->queue_count--; + + dqm->total_queue_count--; + mqd->destroy_mqd(mqd, q->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, + q->pipe, q->queue); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + if (list_empty(&qpd->queues_list)) + deallocate_vmid(dqm, qpd, q); + } + + /* Unregister process */ + list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { + if (qpd == cur->qpd) { + list_del(&cur->list); + kfree(cur); + dqm->processes_count--; + break; + } + } + + mutex_unlock(&dqm->lock); + + return 0; +} + + +static int process_termination_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + int retval; + struct queue *q, *next; + struct kernel_queue *kq, *kq_next; + struct mqd_manager *mqd; + struct device_process_node *cur, *next_dpn; + + retval = 0; + + mutex_lock(&dqm->lock); + + /* Clean all kernel queues */ + list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) { + list_del(&kq->list); + dqm->queue_count--; + qpd->is_debug = false; + dqm->total_queue_count--; + } + + /* Clear all user mode queues */ + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } + + if (q->properties.is_active) + dqm->queue_count--; + + dqm->total_queue_count--; + } + + /* Unregister process */ + list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { + if (qpd == cur->qpd) { + list_del(&cur->list); + kfree(cur); + dqm->processes_count--; + break; + } + } + + retval = execute_queues_cpsch(dqm); + + /* lastly, free mqd resources */ + list_for_each_entry_safe(q, next, &qpd->queues_list, list) { + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd) { + mutex_unlock(&dqm->lock); + return -ENOMEM; + } + list_del(&q->list); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + } + + mutex_unlock(&dqm->lock); + return retval; +} + struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) { struct device_queue_manager *dqm; @@ -1202,6 +1518,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; + 
dqm->ops.set_page_directory_base = set_page_directory_base; + dqm->ops.process_termination = process_termination_cpsch; break; case KFD_SCHED_POLICY_NO_HWS: /* initialize dqm for no cp scheduling */ @@ -1216,6 +1535,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.initialize = initialize_nocpsch; dqm->ops.uninitialize = uninitialize_nocpsch; dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; + dqm->ops.set_page_directory_base = set_page_directory_base; + dqm->ops.process_termination = process_termination_nocpsch; break; default: BUG(); @@ -1230,6 +1552,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) case CHIP_KAVERI: device_queue_manager_init_cik(&dqm->ops_asic_specific); break; + + case CHIP_TONGA: + case CHIP_FIJI: + device_queue_manager_init_vi_tonga(&dqm->ops_asic_specific); + break; } if (dqm->ops.initialize(dqm) != 0) { @@ -1247,3 +1574,20 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) dqm->ops.uninitialize(dqm); kfree(dqm); } + +int kfd_process_vm_fault(struct device_queue_manager *dqm, + unsigned int pasid) +{ + struct kfd_process_device *pdd; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + int ret = 0; + + if (!p) + return -EINVAL; + pdd = kfd_get_process_device_data(dqm->dev, p); + if (pdd) + ret = process_evict_queues(dqm, &pdd->qpd); + up_read(&p->lock); + + return ret; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index a625b9137da2..19132d980cce 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -29,12 +29,9 @@ #include "kfd_priv.h" #include "kfd_mqd_manager.h" -#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500) +#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (9000) #define QUEUES_PER_PIPE (8) #define PIPE_PER_ME_CP_SCHEDULING (3) -#define CIK_VMID_NUM (8) -#define KFD_VMID_START_OFFSET (8) -#define VMID_PER_DEVICE CIK_VMID_NUM #define KFD_DQM_FIRST_PIPE (0) #define CIK_SDMA_QUEUES (4) #define CIK_SDMA_QUEUES_PER_ENGINE (2) @@ -81,6 +78,12 @@ struct device_process_node { * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the * memory apertures. * + * @set_page_directory_base: Sets the PD base address (GPU local memory) + * in all the queues of the relevant process running on the specified device. + * It preempts the queues, updates the value and execute the runlist again. + * + * @process_termination: Clears all process queues belongs to that device. 
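Both scheduling policies wire up the same device_queue_manager_ops table, so callers invoke process_termination(), set_trap_handler() and friends without caring whether the CP hardware scheduler (HWS) is in use. A minimal sketch of that function-pointer dispatch, with simplified types and hypothetical bodies rather than the driver's real ones:

#include <stdio.h>

struct dqm;

struct dqm_ops {
	int (*process_termination)(struct dqm *dqm);
};

struct dqm {
	struct dqm_ops ops;
};

static int process_termination_cpsch(struct dqm *dqm)
{
	(void)dqm;
	puts("terminate via the packet manager (HWS)");
	return 0;
}

static int process_termination_nocpsch(struct dqm *dqm)
{
	(void)dqm;
	puts("terminate by destroying HQDs directly (no HWS)");
	return 0;
}

static void dqm_init(struct dqm *dqm, int use_hws)
{
	dqm->ops.process_termination = use_hws ?
		process_termination_cpsch : process_termination_nocpsch;
}

int main(void)
{
	struct dqm dqm;

	dqm_init(&dqm, 1);
	/* policy-independent call site */
	return dqm.ops.process_termination(&dqm);
}
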
+ * */ struct device_queue_manager_ops { @@ -124,6 +127,16 @@ struct device_queue_manager_ops { enum cache_policy alternate_policy, void __user *alternate_aperture_base, uint64_t alternate_aperture_size); + + int (*set_trap_handler)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + uint64_t tba_addr, + uint64_t tma_addr); + + int (*set_page_directory_base)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + int (*process_termination)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); }; struct device_queue_manager_asic_ops { @@ -180,6 +193,8 @@ struct device_queue_manager { void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops); void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops); +void device_queue_manager_init_vi_tonga( + struct device_queue_manager_asic_ops *ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); int init_pipelines(struct device_queue_manager *dqm, @@ -187,6 +202,12 @@ int init_pipelines(struct device_queue_manager *dqm, unsigned int get_first_pipe(struct device_queue_manager *dqm); unsigned int get_pipes_num(struct device_queue_manager *dqm); +int process_evict_queues(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +int process_restore_queues(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + + static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { return (pdd->lds_base >> 16) & 0xFF; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c index c6f435aa803f..fdcd5178a862 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -24,6 +24,7 @@ #include "kfd_device_queue_manager.h" #include "cik_regs.h" #include "oss/oss_2_4_sh_mask.h" +#include "gca/gfx_7_2_sh_mask.h" static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd, @@ -125,6 +126,7 @@ static int register_process_cik(struct device_queue_manager *dqm, } else { temp = get_sh_mem_bases_nybble_64(pdd); qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; } pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c index 7e9cae9d349b..c023e50fe027 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -39,6 +39,31 @@ static int initialize_cpsch_vi(struct device_queue_manager *dqm); static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); +/* + * Tonga device queue manager functions + */ +static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +static int register_process_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm_tonga(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_vi_tonga( + struct device_queue_manager_asic_ops *ops) 
+{ + ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; + ops->register_process = register_process_vi_tonga; + ops->initialize = initialize_cpsch_vi; + ops->init_sdma_vm = init_sdma_vm_tonga; +} + + void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops) { ops->set_cache_memory_policy = set_cache_memory_policy_vi; @@ -104,6 +129,33 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, return true; } +static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + uint32_t default_mtype; + uint32_t ape1_mtype; + + default_mtype = (default_policy == cache_policy_coherent) ? + MTYPE_UC : + MTYPE_NC_NV; + + ape1_mtype = (alternate_policy == cache_policy_coherent) ? + MTYPE_UC : + MTYPE_NC_NV; + + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | + default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | + ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; + + return true; +} + static int register_process_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { @@ -137,6 +189,8 @@ static int register_process_vi(struct device_queue_manager *dqm, qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA64 << SH_MEM_CONFIG__ADDRESS_MODE__SHIFT; + qpd->sh_mem_config |= 1 << + SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; } pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", @@ -145,6 +199,41 @@ static int register_process_vi(struct device_queue_manager *dqm, return 0; } +static int register_process_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + unsigned int temp; + + BUG_ON(!dqm || !qpd); + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | + MTYPE_UC << + SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | + MTYPE_UC << + SH_MEM_CONFIG__APE1_MTYPE__SHIFT; + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + /* On dGPU we're always in GPUVM64 addressing mode with 64-bit + * aperture addresses. 
*/ + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + + pr_debug("kfd: sh_mem_bases nybble: 0x%X and register 0x%X\n", + temp, qpd->sh_mem_bases); + + return 0; +} + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd) { @@ -161,6 +250,23 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, q->properties.sdma_vm_addr = value; } +static void init_sdma_vm_tonga(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + uint32_t value = 0; + + if (q->process->is_32bit_user_mode) + value |= (1 << SDMA0_RLC0_VIRTUAL_ADDR__PTR32__SHIFT) | + get_sh_mem_bases_32(qpd_to_pdd(qpd)); + else + value |= ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; + q->properties.sdma_vm_addr = value; +} + + static int initialize_cpsch_vi(struct device_queue_manager *dqm) { return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index a7d3cb3fead0..d6a7e2af30f5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -142,13 +142,14 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - pr_debug("mapping doorbell page:\n"); - pr_debug(" target user address == 0x%08llX\n", - (unsigned long long) vma->vm_start); - pr_debug(" physical address == 0x%08llX\n", address); - pr_debug(" vm_flags == 0x%04lX\n", vma->vm_flags); - pr_debug(" size == 0x%04lX\n", - doorbell_process_allocation()); + pr_debug("kfd: mapping doorbell page in kfd_doorbell_mmap\n" + " target user address == 0x%08llX\n" + " physical address == 0x%08llX\n" + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", + (unsigned long long) vma->vm_start, address, vma->vm_flags, + doorbell_process_allocation()); + return io_remap_pfn_range(vma, vma->vm_start, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index a6a4b2b1c0d9..335f81e09327 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -32,11 +32,10 @@ #include "kfd_events.h" #include <linux/device.h> -/* - * A task can only be on a single wait_queue at a time, but we need to support +/* A task can only be on a single wait_queue at a time, but we need to support * waiting on multiple events (any/all). - * Instead of each event simply having a wait_queue with sleeping tasks, it - * has a singly-linked list of tasks. + * Instead of each event simply having a wait_queue with sleeping tasks, it has a + * singly-linked list of tasks. * A thread that wants to sleep creates an array of these, one for each event * and adds one to each event's waiter chain. */ @@ -52,12 +51,11 @@ struct kfd_event_waiter { uint32_t input_index; }; -/* - * Over-complicated pooled allocator for event notification slots. +/* Over-complicated pooled allocator for event notification slots. * - * Each signal event needs a 64-bit signal slot where the signaler will write - * a 1 before sending an interrupt.l (This is needed because some interrupts - * do not contain enough spare data bits to identify an event.) 
+ * Each signal event needs a 64-bit signal slot where the signaler will write a 1 + * before sending an interrupt.l (This is needed because some interrupts do not + * contain enough spare data bits to identify an event.) * We get whole pages from vmalloc and map them to the process VA. * Individual signal events are then allocated a slot in a page. */ @@ -65,6 +63,7 @@ struct kfd_event_waiter { struct signal_page { struct list_head event_pages; /* kfd_process.signal_event_pages */ uint64_t *kernel_address; + uint64_t handle; uint64_t __user *user_address; uint32_t page_index; /* Index into the mmap aperture. */ unsigned int free_slots; @@ -74,8 +73,7 @@ struct signal_page { #define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT #define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE) #define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1) -#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \ - SLOT_BITMAP_SIZE * sizeof(long)) +#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + SLOT_BITMAP_SIZE * sizeof(long)) /* * For signal events, the event ID is used as the interrupt user data. @@ -85,23 +83,27 @@ struct signal_page { #define INTERRUPT_DATA_BITS 8 #define SIGNAL_EVENT_ID_SLOT_SHIFT 0 +/* We can only create 8 debug events */ + +#define KFD_DEBUG_EVENT_LIMIT 8 +#define KFD_DEBUG_EVENT_MASK 0x1F +#define KFD_DEBUG_EVENT_SHIFT 5 + static uint64_t *page_slots(struct signal_page *page) { return page->kernel_address; } -static bool allocate_free_slot(struct kfd_process *process, - struct signal_page **out_page, - unsigned int *out_slot_index) +static bool +allocate_free_slot(struct kfd_process *process, + struct signal_page **out_page, + unsigned int *out_slot_index) { struct signal_page *page; list_for_each_entry(page, &process->signal_event_pages, event_pages) { if (page->free_slots > 0) { - unsigned int slot = - find_first_zero_bit(page->used_slot_bitmap, - SLOTS_PER_PAGE); - + unsigned int slot = find_first_zero_bit(page->used_slot_bitmap, SLOTS_PER_PAGE); __set_bit(slot, page->used_slot_bitmap); page->free_slots--; @@ -130,6 +132,8 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) { void *backing_store; struct signal_page *page; + unsigned int slot; + int i; page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); if (!page) @@ -137,17 +141,23 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) page->free_slots = SLOTS_PER_PAGE; - backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, + backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, \ get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); if (!backing_store) goto fail_alloc_signal_store; /* prevent user-mode info leaks */ - memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, - KFD_SIGNAL_EVENT_LIMIT * 8); - + memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, KFD_SIGNAL_EVENT_LIMIT * 8); page->kernel_address = backing_store; + /* Set bits of debug events to prevent allocation */ + for (i = 0 ; i < KFD_DEBUG_EVENT_LIMIT ; i++) { + slot = (i << KFD_DEBUG_EVENT_SHIFT) | + KFD_DEBUG_EVENT_MASK; + __set_bit(slot, page->used_slot_bitmap); + page->free_slots--; + } + if (list_empty(&p->signal_event_pages)) page->page_index = 0; else @@ -169,10 +179,10 @@ fail_alloc_signal_page: return false; } -static bool allocate_event_notification_slot(struct file *devkfd, - struct kfd_process *p, - struct signal_page **page, - unsigned int *signal_slot_index) +static bool +allocate_event_notification_slot(struct file *devkfd, struct kfd_process *p, + struct signal_page **page, + unsigned int 
*signal_slot_index) { bool ret; @@ -186,6 +196,88 @@ static bool allocate_event_notification_slot(struct file *devkfd, return ret; } +static bool +allocate_signal_page_dgpu(struct kfd_process *p, + uint64_t *kernel_address, uint64_t handle) +{ + struct signal_page *my_page; + + my_page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); + if (!my_page) + return false; + + /* prevent user-mode info leaks */ + memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, + KFD_SIGNAL_EVENT_LIMIT * 8); + + my_page->kernel_address = kernel_address; + my_page->handle = handle; + my_page->user_address = NULL; + my_page->free_slots = SLOTS_PER_PAGE; + if (list_empty(&p->signal_event_pages)) + my_page->page_index = 0; + else + my_page->page_index = list_tail_entry(&p->signal_event_pages, + struct signal_page, + event_pages)->page_index + 1; + + pr_debug("allocated new event signal page at %p, for process %p\n", + my_page, p); + pr_debug("page index is %d\n", my_page->page_index); + + list_add(&my_page->event_pages, &p->signal_event_pages); + + return true; +} + +void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle) +{ + struct signal_page *page, *tmp; + + list_for_each_entry_safe(page, tmp, &p->signal_event_pages, + event_pages) { + if (page->handle == handle) { + list_del(&page->event_pages); + kfree(page); + break; + } + } +} + +static bool +allocate_debug_event_notification_slot(struct file *devkfd, + struct kfd_process *p, + struct signal_page **out_page, + unsigned int *out_slot_index) +{ + struct signal_page *page; + unsigned int slot; + bool ret; + + if (list_empty(&p->signal_event_pages)) { + ret = allocate_signal_page(devkfd, p); + if (ret == false) + return ret; + } + + page = list_entry((&p->signal_event_pages)->next, struct signal_page, + event_pages); + slot = (p->debug_event_count << KFD_DEBUG_EVENT_SHIFT) | + KFD_DEBUG_EVENT_MASK; + + pr_debug("page == %p\n", page); + pr_debug("slot == %d\n", slot); + + page_slots(page)[slot] = UNSIGNALED_EVENT_SLOT; + *out_page = page; + *out_slot_index = slot; + + pr_debug("allocated debug event signal slot in page %p, slot %d\n", + page, slot); + + return true; +} + /* Assumes that the process's event_mutex is locked. */ static void release_event_notification_slot(struct signal_page *page, size_t slot_index) @@ -202,10 +294,7 @@ static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, { struct signal_page *page; - /* - * This is safe because we don't delete signal pages until the - * process exits. - */ + /* This is safe because we don't delete signal pages until the process exits. */ list_for_each_entry(page, &p->signal_event_pages, event_pages) if (page->page_index == page_index) return page; @@ -213,10 +302,7 @@ static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, return NULL; } -/* - * Assumes that p->event_mutex is held and of course that p is not going - * away (current or locked). - */ +/* Assumes that p->event_mutex is held and of course that p is not going away (current or locked). */ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) { struct kfd_event *ev; @@ -231,32 +317,27 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) static u32 make_signal_event_id(struct signal_page *page, unsigned int signal_slot_index) { - return page->page_index | - (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT); + return page->page_index | (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT); } -/* - * Produce a kfd event id for a nonsignal event. 
- * These are arbitrary numbers, so we do a sequential search through - * the hash table for an unused number. +/* Produce a kfd event id for a nonsignal event. + * These are arbitrary numbers, so we do a sequential search through the hash table + * for an unused number. */ static u32 make_nonsignal_event_id(struct kfd_process *p) { u32 id; for (id = p->next_nonsignal_event_id; - id < KFD_LAST_NONSIGNAL_EVENT_ID && - lookup_event_by_id(p, id) != NULL; - id++) + id < KFD_LAST_NONSIGNAL_EVENT_ID && lookup_event_by_id(p, id) != NULL; + id++) ; if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { - /* - * What if id == LAST_NONSIGNAL_EVENT_ID - 1? - * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so - * the first loop fails immediately and we proceed with the - * wraparound loop below. + /* What if id == LAST_NONSIGNAL_EVENT_ID - 1? + * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so the first loop + * fails immediately and we proceed with the wraparound loop below. */ p->next_nonsignal_event_id = id + 1; @@ -264,54 +345,68 @@ static u32 make_nonsignal_event_id(struct kfd_process *p) } for (id = KFD_FIRST_NONSIGNAL_EVENT_ID; - id < KFD_LAST_NONSIGNAL_EVENT_ID && - lookup_event_by_id(p, id) != NULL; - id++) + id < KFD_LAST_NONSIGNAL_EVENT_ID && lookup_event_by_id(p, id) != NULL; + id++) ; if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { p->next_nonsignal_event_id = id + 1; return id; + } else { + p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; + return 0; } - - p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; - return 0; } -static struct kfd_event *lookup_event_by_page_slot(struct kfd_process *p, - struct signal_page *page, - unsigned int signal_slot) +static struct kfd_event * +lookup_event_by_page_slot(struct kfd_process *p, + struct signal_page *page, unsigned int signal_slot) { return lookup_event_by_id(p, make_signal_event_id(page, signal_slot)); } -static int create_signal_event(struct file *devkfd, - struct kfd_process *p, - struct kfd_event *ev) +static int +create_signal_event(struct file *devkfd, struct kfd_process *p, struct kfd_event *ev) { - if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { + if ((ev->type == KFD_EVENT_TYPE_SIGNAL) && + (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT)) { pr_warn("amdkfd: Signal event wasn't created because limit was reached\n"); return -ENOMEM; + } else if ((ev->type == KFD_EVENT_TYPE_DEBUG) && + (p->debug_event_count == KFD_DEBUG_EVENT_LIMIT)) { + pr_warn("amdkfd: Debug event wasn't created because limit was reached\n"); + return -ENOMEM; } - if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page, + if (ev->type == KFD_EVENT_TYPE_SIGNAL) { + if (!allocate_event_notification_slot(devkfd, p, + &ev->signal_page, &ev->signal_slot_index)) { - pr_warn("amdkfd: Signal event wasn't created because out of kernel memory\n"); - return -ENOMEM; - } + pr_warn("amdkfd: Signal event wasn't created because out of kernel memory\n"); + return -ENOMEM; + } - p->signal_event_count++; + p->signal_event_count++; - ev->user_signal_address = - &ev->signal_page->user_address[ev->signal_slot_index]; + if ((p->signal_event_count & KFD_DEBUG_EVENT_MASK) == + KFD_DEBUG_EVENT_MASK) + p->signal_event_count++; - ev->event_id = make_signal_event_id(ev->signal_page, - ev->signal_slot_index); + } else if (ev->type == KFD_EVENT_TYPE_DEBUG) { + if (!allocate_debug_event_notification_slot(devkfd, p, + &ev->signal_page, + &ev->signal_slot_index)) { + pr_warn("amdkfd: Debug event wasn't created because out of kernel memory\n"); + return -ENOMEM; + } - 
pr_debug("signal event number %zu created with id %d, address %p\n", - p->signal_event_count, ev->event_id, - ev->user_signal_address); + p->debug_event_count++; + } + + ev->user_signal_address = &ev->signal_page->user_address[ev->signal_slot_index]; + + ev->event_id = make_signal_event_id(ev->signal_page, ev->signal_slot_index); pr_debug("signal event number %zu created with id %d, address %p\n", p->signal_event_count, ev->event_id, @@ -320,12 +415,10 @@ static int create_signal_event(struct file *devkfd, return 0; } -/* - * No non-signal events are supported yet. - * We create them as events that never signal. - * Set event calls from user-mode are failed. - */ -static int create_other_event(struct kfd_process *p, struct kfd_event *ev) +/* No non-signal events are supported yet. + * We create them as events that never signal. Set event calls from user-mode are failed. */ +static int +create_other_event(struct kfd_process *p, struct kfd_event *ev) { ev->event_id = make_nonsignal_event_id(p); if (ev->event_id == 0) @@ -341,20 +434,25 @@ void kfd_event_init_process(struct kfd_process *p) INIT_LIST_HEAD(&p->signal_event_pages); p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; p->signal_event_count = 0; + p->debug_event_count = 0; } static void destroy_event(struct kfd_process *p, struct kfd_event *ev) { if (ev->signal_page != NULL) { - release_event_notification_slot(ev->signal_page, - ev->signal_slot_index); - p->signal_event_count--; + if (ev->type == KFD_EVENT_TYPE_SIGNAL) { + release_event_notification_slot(ev->signal_page, + ev->signal_slot_index); + p->signal_event_count--; + if ((p->signal_event_count & KFD_DEBUG_EVENT_MASK) == + KFD_DEBUG_EVENT_MASK) + p->signal_event_count--; + } else if (ev->type == KFD_EVENT_TYPE_DEBUG) { + p->debug_event_count--; + } } - /* - * Abandon the list of waiters. Individual waiting threads will - * clean up their own data. - */ + /* Abandon the list of waiters. Individual waiting threads will clean up their own data.*/ list_del(&ev->waiters); hash_del(&ev->events); @@ -371,18 +469,17 @@ static void destroy_events(struct kfd_process *p) destroy_event(p, ev); } -/* - * We assume that the process is being destroyed and there is no need to - * unmap the pages or keep bookkeeping data in order. - */ +/* We assume that the process is being destroyed and there is no need to unmap the pages + * or keep bookkeeping data in order. 
*/ static void shutdown_signal_pages(struct kfd_process *p) { struct signal_page *page, *tmp; - list_for_each_entry_safe(page, tmp, &p->signal_event_pages, - event_pages) { - free_pages((unsigned long)page->kernel_address, - get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + list_for_each_entry_safe(page, tmp, &p->signal_event_pages, event_pages) { + if (page->user_address) { + free_pages((unsigned long)page->kernel_address, + get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + } kfree(page); } } @@ -395,8 +492,7 @@ void kfd_event_free_process(struct kfd_process *p) static bool event_can_be_gpu_signaled(const struct kfd_event *ev) { - return ev->type == KFD_EVENT_TYPE_SIGNAL || - ev->type == KFD_EVENT_TYPE_DEBUG; + return ev->type == KFD_EVENT_TYPE_SIGNAL || ev->type == KFD_EVENT_TYPE_DEBUG; } static bool event_can_be_cpu_signaled(const struct kfd_event *ev) @@ -407,11 +503,12 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev) int kfd_event_create(struct file *devkfd, struct kfd_process *p, uint32_t event_type, bool auto_reset, uint32_t node_id, uint32_t *event_id, uint32_t *event_trigger_data, - uint64_t *event_page_offset, uint32_t *event_slot_index) + uint64_t *event_page_offset, uint32_t *event_slot_index, + void *kern_addr) { int ret = 0; - struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); + struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; @@ -421,17 +518,20 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, INIT_LIST_HEAD(&ev->waiters); - *event_page_offset = 0; - mutex_lock(&p->event_mutex); + if (kern_addr && list_empty(&p->signal_event_pages)) + allocate_signal_page_dgpu(p, kern_addr, *event_page_offset); + + *event_page_offset = 0; + switch (event_type) { case KFD_EVENT_TYPE_SIGNAL: case KFD_EVENT_TYPE_DEBUG: ret = create_signal_event(devkfd, p, ev); if (!ret) { *event_page_offset = (ev->signal_page->page_index | - KFD_MMAP_EVENTS_MASK); + KFD_MMAP_TYPE_EVENTS); *event_page_offset <<= PAGE_SHIFT; *event_slot_index = ev->signal_slot_index; } @@ -538,8 +638,7 @@ int kfd_reset_event(struct kfd_process *p, uint32_t event_id) static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev) { - page_slots(ev->signal_page)[ev->signal_slot_index] = - UNSIGNALED_EVENT_SLOT; + page_slots(ev->signal_page)[ev->signal_slot_index] = UNSIGNALED_EVENT_SLOT; } static bool is_slot_signaled(struct signal_page *page, unsigned int index) @@ -547,8 +646,7 @@ static bool is_slot_signaled(struct signal_page *page, unsigned int index) return page_slots(page)[index] != UNSIGNALED_EVENT_SLOT; } -static void set_event_from_interrupt(struct kfd_process *p, - struct kfd_event *ev) +static void set_event_from_interrupt(struct kfd_process *p, struct kfd_event *ev) { if (ev && event_can_be_gpu_signaled(ev)) { acknowledge_signal(p, ev); @@ -561,42 +659,39 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, { struct kfd_event *ev; - /* - * Because we are called from arbitrary context (workqueue) as opposed + /* Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function returns a locked process. - */ + * running so the lookup function returns a read-locked process. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - if (!p) return; /* Presumably process exited. 
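The event_page_offset returned by kfd_event_create() packs a type tag and the signal page index into the high bits of an mmap offset and shifts the result left by PAGE_SHIFT, so user space can hand it straight to mmap(). A sketch of that pack/unpack follows; the tag value below is made up, since the real KFD_MMAP_TYPE_EVENTS definition is outside this hunk:

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT        12
#define MMAP_TYPE_EVENTS  (0x2ULL << 46)	/* hypothetical tag bits */

static uint64_t make_event_mmap_offset(uint64_t page_index)
{
	return (page_index | MMAP_TYPE_EVENTS) << PAGE_SHIFT;
}

static uint64_t mmap_offset_to_page_index(uint64_t offset)
{
	return (offset >> PAGE_SHIFT) & ~MMAP_TYPE_EVENTS;
}

int main(void)
{
	uint64_t off = make_event_mmap_offset(3);

	assert(mmap_offset_to_page_index(off) == 3);
	return 0;
}
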
*/ mutex_lock(&p->event_mutex); - if (valid_id_bits >= INTERRUPT_DATA_BITS) { + if ((valid_id_bits >= INTERRUPT_DATA_BITS) && + ((partial_id & KFD_DEBUG_EVENT_MASK) == + KFD_DEBUG_EVENT_MASK)) { /* Partial ID is a full ID. */ ev = lookup_event_by_id(p, partial_id); set_event_from_interrupt(p, ev); } else { - /* - * Partial ID is in fact partial. For now we completely - * ignore it, but we could use any bits we did receive to - * search faster. - */ + /* Partial ID is in fact partial. For now we completely ignore it, + * but we could use any bits we did receive to search faster. */ struct signal_page *page; unsigned i; - list_for_each_entry(page, &p->signal_event_pages, event_pages) - for (i = 0; i < SLOTS_PER_PAGE; i++) + list_for_each_entry(page, &p->signal_event_pages, event_pages) { + for (i = 0; i < SLOTS_PER_PAGE; i++) { if (is_slot_signaled(page, i)) { - ev = lookup_event_by_page_slot(p, - page, i); + ev = lookup_event_by_page_slot(p, page, i); set_event_from_interrupt(p, ev); } + } + } } mutex_unlock(&p->event_mutex); - mutex_unlock(&p->mutex); + up_read(&p->lock); } static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) @@ -604,20 +699,20 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) struct kfd_event_waiter *event_waiters; uint32_t i; - event_waiters = kmalloc_array(num_events, - sizeof(struct kfd_event_waiter), - GFP_KERNEL); + event_waiters = kmalloc(num_events * sizeof(struct kfd_event_waiter), GFP_KERNEL); - for (i = 0; (event_waiters) && (i < num_events) ; i++) { - INIT_LIST_HEAD(&event_waiters[i].waiters); - event_waiters[i].sleeping_task = current; - event_waiters[i].activated = false; + if (event_waiters) { + for (i = 0; i < num_events; i++) { + INIT_LIST_HEAD(&event_waiters[i].waiters); + event_waiters[i].sleeping_task = current; + event_waiters[i].activated = false; + } } return event_waiters; } -static int init_event_waiter(struct kfd_process *p, +static int init_event_waiter_get_status(struct kfd_process *p, struct kfd_event_waiter *waiter, uint32_t event_id, uint32_t input_index) @@ -632,13 +727,21 @@ static int init_event_waiter(struct kfd_process *p, waiter->activated = ev->signaled; ev->signaled = ev->signaled && !ev->auto_reset; - list_add(&waiter->waiters, &ev->waiters); - return 0; } +static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) +{ + struct kfd_event *ev = waiter->event; + + /* Only add to the wait list if we actually need to + * wait on this event. 
*/ + if (!waiter->activated) + list_add(&waiter->waiters, &ev->waiters); +} + static bool test_event_condition(bool all, uint32_t num_events, - struct kfd_event_waiter *event_waiters) + struct kfd_event_waiter *event_waiters) { uint32_t i; uint32_t activated_count = 0; @@ -663,23 +766,15 @@ static bool copy_signaled_event_data(uint32_t num_events, struct kfd_event_waiter *event_waiters, struct kfd_event_data __user *data) { - struct kfd_hsa_memory_exception_data *src; - struct kfd_hsa_memory_exception_data __user *dst; - struct kfd_event_waiter *waiter; - struct kfd_event *event; uint32_t i; - for (i = 0; i < num_events; i++) { - waiter = &event_waiters[i]; - event = waiter->event; - if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) { - dst = &data[waiter->input_index].memory_exception_data; - src = &event->memory_exception_data; - if (copy_to_user(dst, src, - sizeof(struct kfd_hsa_memory_exception_data))) + for (i = 0; i < num_events; i++) + if (event_waiters[i].activated && + event_waiters[i].event->type == KFD_EVENT_TYPE_MEMORY) + if (copy_to_user(&data[event_waiters[i].input_index].memory_exception_data, + &event_waiters[i].event->memory_exception_data, + sizeof(struct kfd_hsa_memory_exception_data))) return false; - } - } return true; @@ -695,11 +790,9 @@ static long user_timeout_to_jiffies(uint32_t user_timeout_ms) if (user_timeout_ms == KFD_EVENT_TIMEOUT_INFINITE) return MAX_SCHEDULE_TIMEOUT; - /* - * msecs_to_jiffies interprets all values above 2^31-1 as infinite, + /* msecs_to_jiffies interprets all values above 2^31-1 as infinite, * but we consider them finite. - * This hack is wrong, but nobody is likely to notice. - */ + * This hack is wrong, but nobody is likely to notice. */ user_timeout_ms = min_t(uint32_t, user_timeout_ms, 0x7FFFFFFF); return msecs_to_jiffies(user_timeout_ms) + 1; @@ -724,11 +817,16 @@ int kfd_wait_on_events(struct kfd_process *p, (struct kfd_event_data __user *) data; uint32_t i; int ret = 0; + struct kfd_event_waiter *event_waiters = NULL; long timeout = user_timeout_to_jiffies(user_timeout_ms); mutex_lock(&p->event_mutex); + /* Set to something unreasonable - this is really + * just a bool for now. */ + *wait_result = KFD_WAIT_TIMEOUT; + event_waiters = alloc_event_waiters(num_events); if (!event_waiters) { ret = -ENOMEM; @@ -742,14 +840,34 @@ int kfd_wait_on_events(struct kfd_process *p, sizeof(struct kfd_event_data))) goto fail; - ret = init_event_waiter(p, &event_waiters[i], + ret = init_event_waiter_get_status(p, &event_waiters[i], event_data.event_id, i); if (ret) goto fail; } + /* Check condition once. */ + if (test_event_condition(all, num_events, event_waiters)) { + if (copy_signaled_event_data(num_events, + event_waiters, events)) + *wait_result = KFD_WAIT_COMPLETE; + else + *wait_result = KFD_WAIT_ERROR; + free_waiters(num_events, event_waiters); + } else { + /* Add to wait lists if we need to wait. */ + for (i = 0; i < num_events; i++) + init_event_waiter_add_to_waitlist(&event_waiters[i]); + } + mutex_unlock(&p->event_mutex); + /* Return if all waits were already satisfied. */ + if (*wait_result != KFD_WAIT_TIMEOUT) { + __set_current_state(TASK_RUNNING); + return ret; + } + while (true) { if (fatal_signal_pending(current)) { ret = -EINTR; @@ -758,17 +876,17 @@ int kfd_wait_on_events(struct kfd_process *p, if (signal_pending(current)) { /* - * This is wrong when a nonzero, non-infinite timeout - * is specified. 
We need to use - * ERESTARTSYS_RESTARTBLOCK, but struct restart_block - * contains a union with data for each user and it's - * in generic kernel code that I don't want to - * touch yet. + * This is wrong when a nonzero, non-infinite timeout is specified. + * We need to use ERESTARTSYS_RESTARTBLOCK, but struct restart_block + * contains a union with data for each user and it's in generic + * kernel code that I don't want to touch yet. */ ret = -ERESTARTSYS; break; } + set_current_state(TASK_INTERRUPTIBLE); + if (test_event_condition(all, num_events, event_waiters)) { if (copy_signaled_event_data(num_events, event_waiters, events)) @@ -783,7 +901,7 @@ int kfd_wait_on_events(struct kfd_process *p, break; } - timeout = schedule_timeout_interruptible(timeout); + timeout = schedule_timeout(timeout); } __set_current_state(TASK_RUNNING); @@ -823,8 +941,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) page = lookup_signal_page_by_index(p, page_index); if (!page) { /* Probably KFD bug, but mmap is user-accessible. */ - pr_debug("signal page could not be found for page_index %u\n", - page_index); + pr_debug("signal page could not be found for page_index %u\n", page_index); return -EINVAL; } @@ -856,23 +973,29 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) static void lookup_events_by_type_and_signal(struct kfd_process *p, int type, void *event_data) { - struct kfd_hsa_memory_exception_data *ev_data; struct kfd_event *ev; int bkt; bool send_signal = true; - ev_data = (struct kfd_hsa_memory_exception_data *) event_data; - - hash_for_each(p->events, bkt, ev, events) + hash_for_each(p->events, bkt, ev, events) { if (ev->type == type) { send_signal = false; dev_dbg(kfd_device, "Event found: id %X type %d", ev->event_id, ev->type); set_event(ev); - if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data) - ev->memory_exception_data = *ev_data; + if (ev->type == KFD_EVENT_TYPE_MEMORY && event_data) + ev->memory_exception_data = + *(struct kfd_hsa_memory_exception_data *)event_data; } + } + + if (type == KFD_EVENT_TYPE_MEMORY) { + dev_warn(kfd_device, + "Sending SIGSEGV to HSA Process with PID %d ", + p->lead_thread->pid); + send_sig(SIGSEGV, p->lead_thread, 0); + } /* Send SIGTERM no event of type "type" has been found*/ if (send_signal) { @@ -899,7 +1022,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function returns a locked process. + * running so the lookup function returns a read-locked process. 
*/ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -914,24 +1037,24 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, memory_exception_data.gpu_id = dev->id; memory_exception_data.va = address; /* Set failure reason */ - memory_exception_data.failure.NotPresent = 1; - memory_exception_data.failure.NoExecute = 0; - memory_exception_data.failure.ReadOnly = 0; + memory_exception_data.failure.NotPresent = true; + memory_exception_data.failure.NoExecute = false; + memory_exception_data.failure.ReadOnly = false; if (vma) { if (vma->vm_start > address) { - memory_exception_data.failure.NotPresent = 1; - memory_exception_data.failure.NoExecute = 0; - memory_exception_data.failure.ReadOnly = 0; + memory_exception_data.failure.NotPresent = true; + memory_exception_data.failure.NoExecute = false; + memory_exception_data.failure.ReadOnly = false; } else { - memory_exception_data.failure.NotPresent = 0; + memory_exception_data.failure.NotPresent = false; if (is_write_requested && !(vma->vm_flags & VM_WRITE)) - memory_exception_data.failure.ReadOnly = 1; + memory_exception_data.failure.ReadOnly = true; else - memory_exception_data.failure.ReadOnly = 0; + memory_exception_data.failure.ReadOnly = false; if (is_execute_requested && !(vma->vm_flags & VM_EXEC)) - memory_exception_data.failure.NoExecute = 1; + memory_exception_data.failure.NoExecute = true; else - memory_exception_data.failure.NoExecute = 0; + memory_exception_data.failure.NoExecute = false; } } @@ -944,7 +1067,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, &memory_exception_data); mutex_unlock(&p->event_mutex); - mutex_unlock(&p->mutex); + up_read(&p->lock); } void kfd_signal_hw_exception_event(unsigned int pasid) @@ -952,7 +1075,7 @@ void kfd_signal_hw_exception_event(unsigned int pasid) /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function returns a locked process. + * running so the lookup function returns a read-locked process. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -965,5 +1088,42 @@ void kfd_signal_hw_exception_event(unsigned int pasid) lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); mutex_unlock(&p->event_mutex); - mutex_unlock(&p->mutex); + up_read(&p->lock); +} + +void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, + struct kfd_vm_fault_info *info) +{ + struct kfd_event *ev; + int bkt; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_hsa_memory_exception_data memory_exception_data; + + if (!p) + return; /* Presumably process exited. */ + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); + memory_exception_data.gpu_id = dev->id; + /* Set failure reason */ + if (info) { + memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; + memory_exception_data.failure.NotPresent = + info->prot_valid ? true : false; + memory_exception_data.failure.NoExecute = + info->prot_exec ? true : false; + memory_exception_data.failure.ReadOnly = + info->prot_write ? 
true : false; + } + mutex_lock(&p->event_mutex); + + hash_for_each(p->events, bkt, ev, events) { + if (ev->type == KFD_EVENT_TYPE_MEMORY) { + ev->memory_exception_data = memory_exception_data; + set_event(ev); + } + } + + mutex_unlock(&p->event_mutex); + up_read(&p->lock); + } + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h index 28f6838b1f4c..d7987eb80970 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -34,8 +34,7 @@ #define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK #define KFD_LAST_NONSIGNAL_EVENT_ID UINT_MAX -/* - * Written into kfd_signal_slot_t to indicate that the event is not signaled. +/* Written into kfd_signal_slot_t to indicate that the event is not signaled. * Since the event protocol may need to write the event ID into memory, this * must not be a valid event ID. * For the sake of easy memset-ing, this must be a byte pattern. diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 2b655103ba79..587f84714ca0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -33,7 +33,7 @@ #include <linux/time.h> #include "kfd_priv.h" #include <linux/mm.h> -#include <linux/mman.h> +#include <uapi/asm-generic/mman-common.h> #include <asm/processor.h> /* @@ -278,21 +278,36 @@ #define MAKE_GPUVM_APP_BASE(gpu_num) \ (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) -#define MAKE_GPUVM_APP_LIMIT(base) \ - (((uint64_t)(base) & \ - 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL) +#define MAKE_GPUVM_APP_LIMIT(base, size) \ + (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) -#define MAKE_SCRATCH_APP_BASE(gpu_num) \ - (((uint64_t)(gpu_num) << 61) + 0x100000000L) +#define MAKE_SCRATCH_APP_BASE() \ + (((uint64_t)(0x1UL) << 61) + 0x100000000L) #define MAKE_SCRATCH_APP_LIMIT(base) \ (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) -#define MAKE_LDS_APP_BASE(gpu_num) \ - (((uint64_t)(gpu_num) << 61) + 0x0) +#define MAKE_LDS_APP_BASE() \ + (((uint64_t)(0x1UL) << 61) + 0x0) + #define MAKE_LDS_APP_LIMIT(base) \ (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +#define DGPU_VM_BASE_DEFAULT 0x100000 + +int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, + uint64_t base, uint64_t limit) +{ + if (base < (pdd->qpd.cwsr_base + pdd->dev->cwsr_size)) { + pr_err("Set dgpu vm base 0x%llx failed.\n", base); + return -EINVAL; + } + pdd->dgpu_base = base; + pdd->dgpu_limit = limit; + return 0; +} + int kfd_init_apertures(struct kfd_process *process) { uint8_t id = 0; @@ -300,13 +315,16 @@ int kfd_init_apertures(struct kfd_process *process) struct kfd_process_device *pdd; /*Iterating over all devices*/ - while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && - id < NUM_OF_SUPPORTED_GPUS) { + while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { + if (!dev) { + id++; /* Skip non GPU devices */ + continue; + } pdd = kfd_create_process_device_data(dev, process); if (pdd == NULL) { pr_err("Failed to create process device data\n"); - return -1; + goto err; } /* * For 64 bit process aperture will be statically reserved in @@ -322,19 +340,24 @@ int kfd_init_apertures(struct kfd_process *process) * node id couldn't be 0 - the three MSB bits of * aperture shoudn't be 0 */ - pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); + pdd->lds_base = MAKE_LDS_APP_BASE(); pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); - pdd->gpuvm_limit = 
- MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); + pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( + pdd->gpuvm_base, + dev->shared_resources.gpuvm_size); - pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); + pdd->scratch_base = MAKE_SCRATCH_APP_BASE(); pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); + + if (KFD_IS_DGPU(dev->device_info->asic_family)) + pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT; + } dev_dbg(kfd_device, "node id %u\n", id); @@ -350,6 +373,32 @@ int kfd_init_apertures(struct kfd_process *process) } return 0; + +err: + return -1; } +void radeon_flush_tlb(struct kfd_dev *dev, uint32_t pasid) +{ + uint8_t vmid; + int first_vmid_to_scan = 8; + int last_vmid_to_scan = 15; + const struct kfd2kgd_calls *f2g = dev->kfd2kgd; + /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. ATC_VMID15_PASID_MAPPING + * to check which VMID the current process is mapped to + * and flush TLB for this VMID if found*/ + for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { + if (f2g->get_atc_vmid_pasid_mapping_valid( + dev->kgd, vmid)) { + if (f2g->get_atc_vmid_pasid_mapping_pasid( + dev->kgd, vmid) == pasid) { + dev_dbg(kfd_device, + "TLB of vmid %u", vmid); + f2g->write_vmid_invalidate_request( + dev->kgd, vmid); + break; + } + } + } +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index 7f134aa9bfd3..a8cdbc812d00 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -172,8 +172,7 @@ static void interrupt_wq(struct work_struct *work) sizeof(uint32_t))]; while (dequeue_ih_ring_entry(dev, ih_ring_entry)) - dev->device_info->event_interrupt_class->interrupt_wq(dev, - ih_ring_entry); + dev->device_info->event_interrupt_class->interrupt_wq(dev, ih_ring_entry); } bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) @@ -181,8 +180,7 @@ bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) /* integer and bitwise OR so there is no boolean short-circuiting */ unsigned wanted = 0; - wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, - ih_ring_entry); + wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, ih_ring_entry); return wanted != 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 9beae87aadd5..513cfe642c22 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -47,6 +47,9 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, pr_debug("amdkfd: In func %s initializing queue type %d size %d\n", __func__, KFD_QUEUE_TYPE_HIQ, queue_size); + memset(&prop, 0, sizeof(prop)); + memset(&nop, 0, sizeof(nop)); + nop.opcode = IT_NOP; nop.type = PM4_TYPE_3; nop.u32all |= PM4_COUNT_ZERO; @@ -121,7 +124,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, prop.eop_ring_buffer_address = kq->eop_gpu_addr; prop.eop_ring_buffer_size = PAGE_SIZE; - if (init_queue(&kq->queue, prop) != 0) + if (init_queue(&kq->queue, &prop) != 0) goto err_init_queue; kq->queue->device = dev; @@ -140,7 +143,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, kq->queue->pipe = KFD_CIK_HIQ_PIPE; kq->queue->queue = KFD_CIK_HIQ_QUEUE; kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe, - kq->queue->queue, NULL); + kq->queue->queue, NULL, 0); } else { /* allocate fence for DIQ */ @@ -210,20 +213,23 @@ static int acquire_packet_buffer(struct 
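The aperture arithmetic used by kfd_init_apertures() packs an aperture selector into the high bits of a 64-bit GPU virtual address: the GPUVM window of device N starts at (N << 61) + 2^48 and is now limited by the reported gpuvm_size, while the LDS and scratch apertures always use selector 1. A standalone illustration of that math; the GPU number and the 64 GiB size are invented examples:

// Standalone illustration of the aperture macros; gpu_num = 2 and a
// 64 GiB GPUVM size are made-up example values.
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define MAKE_GPUVM_APP_BASE(gpu_num) \
        (((uint64_t)(gpu_num) << 61) + 0x1000000000000ULL)
#define MAKE_GPUVM_APP_LIMIT(base, size) \
        (((uint64_t)(base) & 0xFFFFFF0000000000ULL) + (size) - 1)
#define MAKE_LDS_APP_BASE() (((uint64_t)0x1 << 61) + 0x0)
#define MAKE_LDS_APP_LIMIT(base) \
        (((uint64_t)(base) & 0xFFFFFFFF00000000ULL) | 0xFFFFFFFF)

int main(void)
{
        uint64_t gpuvm_base  = MAKE_GPUVM_APP_BASE(2);
        uint64_t gpuvm_limit = MAKE_GPUVM_APP_LIMIT(gpuvm_base, 64ULL << 30);
        uint64_t lds_base    = MAKE_LDS_APP_BASE();

        printf("gpuvm base   0x%016" PRIx64 "\n", gpuvm_base);
        printf("gpuvm limit  0x%016" PRIx64 "\n", gpuvm_limit);
        printf("lds base     0x%016" PRIx64 "\n", lds_base);
        printf("lds limit    0x%016" PRIx64 "\n", MAKE_LDS_APP_LIMIT(lds_base));
        return 0;
}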
kernel_queue *kq, BUG_ON(!kq || !buffer_ptr); + /* When rptr == wptr, the buffer is empty. + * When rptr == wptr + 1, the buffer is full. + * It is always rptr that advances to the position of wptr, rather than + * the opposite. So we can only use up to queue_size_dwords - 1 dwords. + */ rptr = *kq->rptr_kernel; wptr = *kq->wptr_kernel; queue_address = (unsigned int *)kq->pq_kernel_addr; queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); - pr_debug("rptr: %d\n", rptr); - pr_debug("wptr: %d\n", wptr); - pr_debug("queue_address 0x%p\n", queue_address); + pr_debug("amdkfd: In func %s\n rptr: %d\n wptr: %d\n queue_address 0x%p\n", + __func__, rptr, wptr, queue_address); - available_size = (rptr - 1 - wptr + queue_size_dwords) % + available_size = (rptr + queue_size_dwords - 1 - wptr) % queue_size_dwords; - if (packet_size_in_dwords >= queue_size_dwords || - packet_size_in_dwords >= available_size) { + if (packet_size_in_dwords > available_size) { /* * make sure calling functions know * acquire_packet_buffer() failed @@ -233,6 +239,13 @@ static int acquire_packet_buffer(struct kernel_queue *kq, } if (wptr + packet_size_in_dwords >= queue_size_dwords) { + /* make sure after rolling back to position 0, there is + * still enough space. */ + if (packet_size_in_dwords >= rptr) { + *buffer_ptr = NULL; + return -ENOMEM; + } + /* fill nops, roll back and start at position 0 */ while (wptr > 0) { queue_address[wptr] = kq->nop_packet; wptr = (wptr + 1) % queue_size_dwords; @@ -292,6 +305,8 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, switch (dev->device_info->asic_family) { case CHIP_CARRIZO: + case CHIP_TONGA: + case CHIP_FIJI: kernel_queue_init_vi(&kq->ops_asic_specific); break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 850a5623661f..e9b886d7a041 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -29,10 +29,11 @@ #define KFD_DRIVER_AUTHOR "AMD Inc. 
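The rewritten acquire_packet_buffer() above spells out the ring convention: rptr == wptr means empty and rptr == wptr + 1 means full, so at most queue_size_dwords - 1 dwords are usable, and a packet that would wrap past the end must also fit below rptr once the tail is padded with NOPs. A standalone toy version of the availability check, with made-up pointer values:

// Toy model of the wptr/rptr availability check; the ring size and
// pointer positions are arbitrary examples.
#include <stdio.h>

static int fits(unsigned int rptr, unsigned int wptr,
                unsigned int queue_size_dwords,
                unsigned int packet_size_in_dwords)
{
        unsigned int available = (rptr + queue_size_dwords - 1 - wptr) %
                                 queue_size_dwords;

        if (packet_size_in_dwords > available)
                return 0;                       // not enough room at all
        if (wptr + packet_size_in_dwords >= queue_size_dwords &&
            packet_size_in_dwords >= rptr)
                return 0;                       // would wrap on top of rptr
        return 1;
}

int main(void)
{
        // 16-dword ring, reader at 6, writer at 12: 9 dwords are free,
        // but an 8-dword packet would wrap and collide with rptr.
        printf("8-dword packet fits: %d\n", fits(6, 12, 16, 8));
        printf("4-dword packet fits: %d\n", fits(6, 12, 16, 4));
        return 0;
}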
and others" #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" -#define KFD_DRIVER_DATE "20150421" -#define KFD_DRIVER_MAJOR 0 -#define KFD_DRIVER_MINOR 7 -#define KFD_DRIVER_PATCHLEVEL 2 +#define KFD_DRIVER_DATE "20160129" +#define KFD_DRIVER_MAJOR 1 +#define KFD_DRIVER_MINOR 8 +#define KFD_DRIVER_PATCHLEVEL 1 +#define KFD_DRIVER_RC_LEVEL "" static const struct kgd2kfd_calls kgd2kfd = { .exit = kgd2kfd_exit, @@ -42,6 +43,10 @@ static const struct kgd2kfd_calls kgd2kfd = { .interrupt = kgd2kfd_interrupt, .suspend = kgd2kfd_suspend, .resume = kgd2kfd_resume, + .evict_bo = kgd2kfd_evict_bo, + .restore = kgd2kfd_restore, + .quiesce_mm = kgd2kfd_quiesce_mm, + .resume_mm = kgd2kfd_resume_mm, }; int sched_policy = KFD_SCHED_POLICY_HWS; @@ -49,6 +54,15 @@ module_param(sched_policy, int, 0444); MODULE_PARM_DESC(sched_policy, "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); +int hws_max_conc_proc = 0; +module_param(hws_max_conc_proc, int, 0444); +MODULE_PARM_DESC(hws_max_conc_proc, + "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency (Default), #VMIDs for KFD = Maximum)"); + +int cwsr_enable = 1; +module_param(cwsr_enable, int, 0444); +MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); + int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; module_param(max_num_of_queues_per_device, int, 0444); MODULE_PARM_DESC(max_num_of_queues_per_device, @@ -61,6 +75,11 @@ MODULE_PARM_DESC(send_sigterm, static int amdkfd_init_completed; +int debug_largebar = 0; +module_param(debug_largebar, int, 0444); +MODULE_PARM_DESC(debug_largebar, + "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); + int kgd2kfd_init(unsigned interface_version, const struct kgd2kfd_calls **g2f) { if (!amdkfd_init_completed) @@ -149,4 +168,5 @@ MODULE_DESCRIPTION(KFD_DRIVER_DESC); MODULE_LICENSE("GPL and additional rights"); MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "." __stringify(KFD_DRIVER_MINOR) "." 
- __stringify(KFD_DRIVER_PATCHLEVEL)); + __stringify(KFD_DRIVER_PATCHLEVEL) + KFD_DRIVER_RC_LEVEL); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index b1ef1368c3bb..ef1dc9b4c20e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -31,6 +31,9 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, return mqd_manager_init_cik(type, dev); case CHIP_CARRIZO: return mqd_manager_init_vi(type, dev); + case CHIP_TONGA: + case CHIP_FIJI: + return mqd_manager_init_vi_tonga(type, dev); } return NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index 213a71e0b6c7..eb6019259da0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -67,7 +67,8 @@ struct mqd_manager { int (*load_mqd)(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, - uint32_t __user *wptr); + uint32_t __user *wptr, + uint32_t page_table_base); int (*update_mqd)(struct mqd_manager *mm, void *mqd, struct queue_properties *q); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index d83de985e88c..44dcd9cace4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -29,11 +29,71 @@ #include "cik_structs.h" #include "oss/oss_2_4_sh_mask.h" +#define AQL_ENABLE 1 + static inline struct cik_mqd *get_mqd(void *mqd) { return (struct cik_mqd *)mqd; } +static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +{ + return (struct cik_sdma_rlc_registers *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_mqd *m; + struct kfd_cu_info cu_info; + uint32_t mgmt_se_mask; + uint32_t cu_sh_mask, cu_sh_shift; + uint32_t cu_mask; + int se, sh; + + if (q->cu_mask == 0) + return; + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = 0; + m->compute_static_thread_mgmt_se1 = 0; + m->compute_static_thread_mgmt_se2 = 0; + m->compute_static_thread_mgmt_se3 = 0; + + mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); + cu_mask = q->cu_mask; + for (se = 0; se < cu_info.num_shader_engines && cu_mask; se++) { + mgmt_se_mask = 0; + for (sh = 0; sh < 2 && cu_mask; sh++) { + cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]); + cu_sh_mask = (1 << cu_sh_shift) - 1; + mgmt_se_mask |= (cu_mask & cu_sh_mask) << (sh * 16); + cu_mask >>= cu_sh_shift; + } + switch (se) { + case 0: + m->compute_static_thread_mgmt_se0 = mgmt_se_mask; + break; + case 1: + m->compute_static_thread_mgmt_se1 = mgmt_se_mask; + break; + case 2: + m->compute_static_thread_mgmt_se2 = mgmt_se_mask; + break; + case 3: + m->compute_static_thread_mgmt_se3 = mgmt_se_mask; + break; + default: + break; + } + } + pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); +} + static int init_mqd(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -150,15 +210,16 @@ static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, } static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr) + uint32_t queue_id, uint32_t __user *wptr, + uint32_t page_table_base) { return 
mm->dev->kfd2kgd->hqd_load - (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); + (mm->dev->kgd, mqd, pipe_id, queue_id, wptr, page_table_base); } static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, - uint32_t __user *wptr) + uint32_t __user *wptr, uint32_t page_table_base) { return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); } @@ -195,11 +256,14 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_control |= NO_UPDATE_RPTR; } + update_cu_mask(mm, mqd, q); + m->cp_hqd_active = 0; q->is_active = false; if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0) { + q->queue_percent > 0 && + !q->is_evicted) { m->cp_hqd_active = 1; q->is_active = true; } @@ -215,8 +279,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, BUG_ON(!mm || !mqd || !q); m = get_sdma_mqd(mqd); - m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) << - SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; @@ -237,7 +301,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, q->is_active = false; if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0) { + q->queue_percent > 0 && + !q->is_evicted) { m->sdma_rlc_rb_cntl |= 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; @@ -386,7 +451,8 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, q->is_active = false; if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0) { + q->queue_percent > 0 && + !q->is_evicted) { m->cp_hqd_active = 1; q->is_active = true; } @@ -394,16 +460,6 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, return 0; } -struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) -{ - struct cik_sdma_rlc_registers *m; - - BUG_ON(!mqd); - - m = (struct cik_sdma_rlc_registers *)mqd; - - return m; -} struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_dev *dev) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index fa32c32fa1c2..b5fb78379e88 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -26,9 +26,9 @@ #include "kfd_priv.h" #include "kfd_mqd_manager.h" #include "vi_structs.h" -#include "gca/gfx_8_0_sh_mask.h" -#include "gca/gfx_8_0_enum.h" - +#include "asic_reg/gca/gfx_8_0_sh_mask.h" +#include "asic_reg/gca/gfx_8_0_enum.h" +#include "oss/oss_3_0_sh_mask.h" #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 static inline struct vi_mqd *get_mqd(void *mqd) @@ -36,6 +36,64 @@ static inline struct vi_mqd *get_mqd(void *mqd) return (struct vi_mqd *)mqd; } +static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct vi_sdma_mqd *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct vi_mqd *m; + struct kfd_cu_info cu_info; + uint32_t mgmt_se_mask; + uint32_t cu_sh_mask, cu_sh_shift; + uint32_t cu_mask; + int se, sh; + + if (q->cu_mask == 0) + return; + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = 0; + m->compute_static_thread_mgmt_se1 = 0; + m->compute_static_thread_mgmt_se2 = 0; + m->compute_static_thread_mgmt_se3 = 0; + + mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); + cu_mask = q->cu_mask; + for 
(se = 0; se < cu_info.num_shader_engines && cu_mask; se++) { + mgmt_se_mask = 0; + for (sh = 0; sh < 2 && cu_mask; sh++) { + cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]); + cu_sh_mask = (1 << cu_sh_shift) - 1; + mgmt_se_mask |= (cu_mask & cu_sh_mask) << (sh * 16); + cu_mask >>= cu_sh_shift; + } + switch (se) { + case 0: + m->compute_static_thread_mgmt_se0 = mgmt_se_mask; + break; + case 1: + m->compute_static_thread_mgmt_se1 = mgmt_se_mask; + break; + case 2: + m->compute_static_thread_mgmt_se2 = mgmt_se_mask; + break; + case 3: + m->compute_static_thread_mgmt_se3 = mgmt_se_mask; + break; + default: + break; + } + } + pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); +} + static int init_mqd(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -82,6 +140,25 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_iq_rptr = 1; + if (q->tba_addr) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->compute_pgm_rsrc2 |= + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); + m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); + m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); + m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); + } + *mqd = m; if (gart_addr != NULL) *gart_addr = addr; @@ -92,10 +169,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, - uint32_t __user *wptr) + uint32_t __user *wptr, uint32_t page_table_base) { return mm->dev->kfd2kgd->hqd_load - (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); + (mm->dev->kgd, mqd, pipe_id, queue_id, wptr, page_table_base); } static int __update_mqd(struct mqd_manager *mm, void *mqd, @@ -153,12 +230,19 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; } + if (q->tba_addr) + m->cp_hqd_ctx_save_control = + atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | + mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; + + update_cu_mask(mm, mqd, q); m->cp_hqd_active = 0; q->is_active = false; if (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0) { + q->queue_percent > 0 && + !q->is_evicted) { m->cp_hqd_active = 1; q->is_active = true; } @@ -173,6 +257,12 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, return __update_mqd(mm, mqd, q, MTYPE_CC, 1); } +static int update_mqd_tonga(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + return __update_mqd(mm, mqd, q, MTYPE_UC, 0); +} + static int destroy_mqd(struct mqd_manager *mm, void *mqd, enum kfd_preempt_type type, unsigned int timeout, uint32_t pipe_id, @@ -231,6 +321,111 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, return retval; } +static int init_mqd_sdma(struct mqd_manager 
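The update_cu_mask() helpers added for CIK and VI above slice the flat cu_mask supplied by userspace across shader engines: each shader array consumes hweight32(cu_bitmap[se][sh]) bits of the mask, the second array's slice is shifted to bit 16 of that engine's register, and the walk stops once the mask is exhausted. A runnable toy version with an invented layout of 2 engines and 4 CUs per array:

// Toy model of update_cu_mask(): 2 shader engines, 2 shader arrays per
// engine, 4 CUs per array -- all invented for illustration.
#include <stdio.h>
#include <stdint.h>

static unsigned int popcount32(uint32_t v)      // stand-in for hweight32()
{
        unsigned int n = 0;

        for (; v; v &= v - 1)
                n++;
        return n;
}

int main(void)
{
        uint32_t cu_bitmap[2][2] = { { 0xF, 0xF }, { 0xF, 0xF } };
        uint32_t cu_mask = 0xABCD;              // example user-supplied mask
        uint32_t se_reg[2] = { 0, 0 };
        int se, sh;

        for (se = 0; se < 2 && cu_mask; se++) {
                for (sh = 0; sh < 2 && cu_mask; sh++) {
                        unsigned int bits = popcount32(cu_bitmap[se][sh]);
                        uint32_t sh_mask = (1u << bits) - 1;

                        se_reg[se] |= (cu_mask & sh_mask) << (sh * 16);
                        cu_mask >>= bits;
                }
        }
        // Prints 0x000c000d and 0x000a000b for this example mask.
        printf("se0 0x%08x\nse1 0x%08x\n", se_reg[0], se_reg[1]);
        return 0;
}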
*mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct vi_sdma_mqd *m; + + + BUG_ON(!mm || !mqd || !mqd_mem_obj); + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct vi_sdma_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct vi_sdma_mqd)); + + *mqd = m; + if (gart_addr != NULL) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + BUG_ON(!mm || !mqd); + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t __user *wptr, uint32_t page_table_base) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); +} + +static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct vi_sdma_mqd *m; + BUG_ON(!mm || !mqd || !q); + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_doorbell = q->doorbell_off << + SDMA0_RLC0_DOORBELL__OFFSET__SHIFT | + 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT; + + m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted) { + m->sdmax_rlcx_rb_cntl |= + 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; + + q->is_active = true; + } + + return 0; +} + +/* + * * preempt type here is ignored because there is only one way + * * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + + + struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, struct kfd_dev *dev) { @@ -266,6 +461,12 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->is_occupied = is_occupied; break; case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; break; default: kfree(mqd); @@ -274,3 +475,17 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, return mqd; } + +struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + mqd = mqd_manager_init_vi(type, dev); + if (!mqd) + return NULL; + if ((type == KFD_MQD_TYPE_CP) || 
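Both SDMA MQD update paths now program the ring-size field as ffs(queue_size / sizeof(unsigned int)) - 1 rather than the old ffs() value: for a power-of-two ring that is exactly log2 of the size in dwords, which is what the RB_SIZE field expects. A quick standalone check with arbitrary queue sizes:

// The RB_SIZE field takes log2(ring size in dwords); for a power of
// two, ffs(n) - 1 == log2(n). The byte sizes below are examples.
#include <stdio.h>
#include <strings.h>    // ffs()

int main(void)
{
        unsigned int sizes[] = { 4096, 65536, 1048576 };        // bytes
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                unsigned int dwords = sizes[i] / sizeof(unsigned int);

                printf("%8u bytes -> %7u dwords -> RB_SIZE %d\n",
                       sizes[i], dwords, ffs(dwords) - 1);
        }
        return 0;
}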
(type == KFD_MQD_TYPE_COMPUTE)) + mqd->update_mqd = update_mqd_tonga; + return mqd; +} + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index ca8c09326b31..c5356ebde005 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -57,26 +57,37 @@ static void pm_calc_rlib_size(struct packet_manager *pm, { unsigned int process_count, queue_count; unsigned int map_queue_size; + unsigned int max_proc_per_quantum = 1; - BUG_ON(!pm || !rlib_size || !over_subscription); + struct kfd_dev *dev = pm->dqm->dev; + + BUG_ON(!pm || !rlib_size || !over_subscription || !dev); process_count = pm->dqm->processes_count; queue_count = pm->dqm->queue_count; - /* check if there is over subscription*/ + /* check if there is over subscription + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). + */ + *over_subscription = false; - if ((process_count > 1) || + + if (dev->max_proc_per_quantum > 1) + max_proc_per_quantum = dev->max_proc_per_quantum; + + if ((process_count > max_proc_per_quantum) || queue_count > PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE) { *over_subscription = true; pr_debug("kfd: over subscribed runlist\n"); } - map_queue_size = - (pm->dqm->dev->device_info->asic_family == CHIP_CARRIZO) ? + map_queue_size = KFD_IS_VI(pm->dqm->dev->device_info->asic_family) ? sizeof(struct pm4_mes_map_queues) : sizeof(struct pm4_map_queues); /* calculate run list ib allocation size */ - *rlib_size = process_count * sizeof(struct pm4_map_process) + + *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + queue_count * map_queue_size; /* @@ -103,11 +114,14 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); + mutex_lock(&pm->lock); + retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, &pm->ib_buffer_obj); if (retval != 0) { pr_err("kfd: failed to allocate runlist IB\n"); + mutex_unlock(&pm->lock); return retval; } @@ -116,6 +130,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, memset(*rl_buffer, 0, *rl_buffer_size); pm->allocated = true; + + mutex_unlock(&pm->lock); return retval; } @@ -123,9 +139,24 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, uint64_t ib, size_t ib_size_in_dwords, bool chain) { struct pm4_runlist *packet; + int concurrent_proc_cnt = 0; + struct kfd_dev *kfd = pm->dqm->dev; BUG_ON(!pm || !buffer || !ib); + /* Determine the number of processes to map together to HW: + * it can not exceed the number of VMIDs available to the + * scheduler, and it is determined by the smaller of the number + * of processes in the runlist and kfd module parameter + * hws_max_conc_proc. + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). + */ + concurrent_proc_cnt = min(pm->dqm->processes_count, + kfd->max_proc_per_quantum); + + packet = (struct pm4_runlist *)buffer; memset(buffer, 0, sizeof(struct pm4_runlist)); @@ -136,6 +167,7 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, packet->bitfields4.chain = chain ? 
1 : 0; packet->bitfields4.offload_polling = 0; packet->bitfields4.valid = 1; + packet->bitfields4.process_cnt = concurrent_proc_cnt; packet->ordinal2 = lower_32_bits(ib); packet->bitfields3.ib_base_hi = upper_32_bits(ib); @@ -182,6 +214,90 @@ static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, return 0; } +static int pm_create_map_process_scratch_kv(struct packet_manager *pm, + uint32_t *buffer, struct qcm_process_device *qpd) +{ + struct pm4_map_process_scratch_kv *packet; + struct queue *cur; + uint32_t num_queues; + + BUG_ON(!pm || !buffer || !qpd); + + packet = (struct pm4_map_process_scratch_kv *)buffer; + + pr_debug("kfd: In func %s\n", __func__); + + memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); + + packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_map_process_scratch_kv)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields3.page_table_base = qpd->page_table_base; + packet->bitfields14.gds_size = qpd->gds_size; + packet->bitfields14.num_gws = qpd->num_gws; + packet->bitfields14.num_oac = qpd->num_oac; + num_queues = 0; + list_for_each_entry(cur, &qpd->queues_list, list) + num_queues++; + packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : num_queues; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; + packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; + + packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + return 0; +} + +static int pm_create_map_process_scratch(struct packet_manager *pm, + uint32_t *buffer, struct qcm_process_device *qpd) +{ + struct pm4_map_process_scratch *packet; + struct queue *cur; + uint32_t num_queues; + + BUG_ON(!pm || !buffer || !qpd); + + packet = (struct pm4_map_process_scratch *)buffer; + + pr_debug("kfd: In func %s\n", __func__); + + memset(buffer, 0, sizeof(struct pm4_map_process_scratch)); + + packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_map_process_scratch)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields3.page_table_base = qpd->page_table_base; + packet->bitfields10.gds_size = qpd->gds_size; + packet->bitfields10.num_gws = qpd->num_gws; + packet->bitfields10.num_oac = qpd->num_oac; + num_queues = 0; + list_for_each_entry(cur, &qpd->queues_list, list) + num_queues++; + packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : num_queues; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; + packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; + + packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + return 0; +} + static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, struct queue *q, bool is_static) { @@ -219,7 +335,7 @@ static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer, queue_type__mes_map_queues__debug_interface_queue_vi; break; case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = + packet->bitfields2.engine_sel = q->properties.sdma_engine_id + engine_sel__mes_map_queues__sdma0_vi; use_static = false; /* no static queues under SDMA */ break; @@ -279,7 +395,7 @@ static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, engine_sel__mes_map_queues__compute; break; case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = + packet->bitfields2.engine_sel = q->properties.sdma_engine_id + engine_sel__mes_map_queues__sdma0; use_static = false; /* no static queues under SDMA */ break; @@ -348,12 +464,12 @@ static int pm_create_runlist_ib(struct packet_manager *pm, return -ENOMEM; } - retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); + retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); if (retval != 0) return retval; proccesses_mapped++; - inc_wptr(&rl_wptr, sizeof(struct pm4_map_process), + inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(), alloc_size_bytes); list_for_each_entry(kq, &qpd->priv_queue_list, list) { @@ -363,8 +479,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, pr_debug("kfd: static_queue, mapping kernel q %d, is debug status %d\n", kq->queue->queue, qpd->is_debug); - if (pm->dqm->dev->device_info->asic_family == - CHIP_CARRIZO) + if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) retval = pm_create_map_queue_vi(pm, &rl_buffer[rl_wptr], kq->queue, @@ -389,8 +504,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, pr_debug("kfd: static_queue, mapping user queue %d, is debug status %d\n", q->queue, qpd->is_debug); - if (pm->dqm->dev->device_info->asic_family == - CHIP_CARRIZO) + if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family)) retval = pm_create_map_queue_vi(pm, &rl_buffer[rl_wptr], q, @@ -423,7 +537,23 @@ static int pm_create_runlist_ib(struct packet_manager *pm, return 0; } -int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) +static int get_map_process_packet_size(void) +{ + return sizeof(struct pm4_map_process); +} + +static int get_map_process_packet_size_scratch_kv(void) +{ + return sizeof(struct pm4_map_process_scratch_kv); +} + +static int get_map_process_packet_size_scratch(void) +{ + return sizeof(struct pm4_map_process_scratch); +} + +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, + uint16_t fw_ver) { BUG_ON(!dqm); @@ -434,8 +564,37 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) mutex_destroy(&pm->lock); return -ENOMEM; } + pm->pmf = kzalloc(sizeof(struct packet_manager_firmware), GFP_KERNEL); pm->allocated = false; + switch (pm->dqm->dev->device_info->asic_family) { + case CHIP_KAVERI: + if (fw_ver >= KFD_SCRATCH_KV_FW_VER) { + pm->pmf->map_process = pm_create_map_process_scratch_kv; + pm->pmf->get_map_process_packet_size = + 
get_map_process_packet_size_scratch_kv; + } else { + pm->pmf->map_process = pm_create_map_process; + pm->pmf->get_map_process_packet_size = + get_map_process_packet_size; + } + break; + case CHIP_CARRIZO: + case CHIP_TONGA: + case CHIP_FIJI: + if (fw_ver >= KFD_SCRATCH_CZ_FW_VER) { + pm->pmf->map_process = pm_create_map_process_scratch; + pm->pmf->get_map_process_packet_size = + get_map_process_packet_size_scratch; + } else { + pm->pmf->map_process = pm_create_map_process; + pm->pmf->get_map_process_packet_size = + get_map_process_packet_size; + } + break; + + } + return 0; } @@ -445,6 +604,7 @@ void pm_uninit(struct packet_manager *pm) mutex_destroy(&pm->lock); kernel_queue_uninit(pm->priv_queue); + kfree(pm->pmf); } int pm_send_set_resources(struct packet_manager *pm, @@ -577,7 +737,7 @@ fail_acquire_packet_buffer: } int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, - enum kfd_preempt_type_filter mode, + enum kfd_unmap_queues_filter filter, uint32_t filter_param, bool reset, unsigned int sdma_engine) { @@ -597,8 +757,8 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, packet = (struct pm4_unmap_queues *)buffer; memset(buffer, 0, sizeof(struct pm4_unmap_queues)); - pr_debug("kfd: static_queue: unmapping queues: mode is %d , reset is %d , type is %d\n", - mode, reset, type); + pr_debug("kfd: static_queue: unmapping queues: filter is %d , reset is %d , type is %d\n", + filter, reset, type); packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES, sizeof(struct pm4_unmap_queues)); switch (type) { @@ -623,26 +783,26 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, packet->bitfields2.action = action__mes_unmap_queues__preempt_queues; - switch (mode) { - case KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: + switch (filter) { + case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: packet->bitfields2.queue_sel = queue_sel__mes_unmap_queues__perform_request_on_specified_queues; packet->bitfields2.num_queues = 1; packet->bitfields3b.doorbell_offset0 = filter_param; break; - case KFD_PREEMPT_TYPE_FILTER_BY_PASID: + case KFD_UNMAP_QUEUES_FILTER_BY_PASID: packet->bitfields2.queue_sel = queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; packet->bitfields3a.pasid = filter_param; break; - case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES: + case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: packet->bitfields2.queue_sel = queue_sel__mes_unmap_queues__perform_request_on_all_active_queues; break; - case KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES: + case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: /* in this case, we do not preempt static queues */ - packet->bitfields2.queue_sel = - queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only; break; default: BUG(); @@ -670,3 +830,4 @@ void pm_release_ib(struct packet_manager *pm) } mutex_unlock(&pm->lock); } + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h index 5b393f3e34a9..e7570ccdc5ad 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h @@ -127,7 +127,8 @@ struct pm4_runlist { uint32_t offload_polling:1; uint32_t reserved3:1; uint32_t valid:1; - uint32_t reserved4:8; + uint32_t process_cnt:4; + uint32_t reserved4:4; } bitfields4; uint32_t ordinal4; }; @@ -186,6 +187,123 @@ struct pm4_map_process { }; #endif +/*--------------------MES_MAP_PROCESS_SCRATCH-------------------- */ + 
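pm_init() now picks the MAP_PROCESS builder and its packet size once, from the MEC firmware version, and stores both in pm->pmf, so pm_calc_rlib_size() and pm_create_runlist_ib() can size and fill the runlist without caring which packet layout the firmware expects. A standalone sketch of that ops-table pattern; the packet sizes and the sample firmware version are stand-ins, and only the 413 threshold mirrors KFD_SCRATCH_KV_FW_VER:

// Standalone sketch of the packet_manager_firmware dispatch; the sizes
// returned below are stand-ins, not the real PM4 struct sizes.
#include <stdio.h>
#include <stdint.h>

struct toy_pmf {
        const char *name;
        int (*packet_size)(void);
};

static int size_legacy(void)  { return 64; }    // stand-in for pm4_map_process
static int size_scratch(void) { return 72; }    // stand-in for the scratch variant

static const struct toy_pmf *select_pmf(uint16_t fw_ver, uint16_t scratch_fw_ver)
{
        static const struct toy_pmf legacy  = { "map_process",         size_legacy };
        static const struct toy_pmf scratch = { "map_process_scratch", size_scratch };

        return fw_ver >= scratch_fw_ver ? &scratch : &legacy;
}

int main(void)
{
        // 413 mirrors KFD_SCRATCH_KV_FW_VER; 500 is just a sample MEC version.
        const struct toy_pmf *pmf = select_pmf(500, 413);
        int processes = 3;

        printf("using %s: %d bytes per process, %d bytes of runlist\n",
               pmf->name, pmf->packet_size(),
               processes * pmf->packet_size());
        return 0;
}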
+#ifndef PM4_MES_MAP_PROCESS_SCRATCH_DEFINED +#define PM4_MES_MAP_PROCESS_SCRATCH_DEFINED + +struct pm4_map_process_scratch { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t page_table_base:28; + uint32_t reserved3:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t reserved; + + uint32_t sh_mem_bases; + uint32_t sh_mem_config; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + + uint32_t sh_hidden_private_base_vmid; + + uint32_t reserved2; + uint32_t reserved3; + + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved4:2; + uint32_t num_oac:4; + uint32_t reserved5:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields10; + uint32_t ordinal10; + }; + + uint32_t completion_signal_lo; + uint32_t completion_signal_hi; + +}; +#endif + +#ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH +#define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH + +struct pm4_map_process_scratch_kv { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t page_table_base:28; + uint32_t reserved2:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t reserved3; + uint32_t sh_mem_bases; + uint32_t sh_mem_config; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + uint32_t sh_hidden_private_base_vmid; + uint32_t reserved4; + uint32_t reserved5; + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved6:2; + uint32_t num_oac:4; + uint32_t reserved7:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields14; + uint32_t ordinal14; + }; + + uint32_t completion_signal_lo32; +uint32_t completion_signal_hi32; +}; +#endif + /*--------------------MES_MAP_QUEUES--------------------*/ #ifndef PM4_MES_MAP_QUEUES_DEFINED diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 80113c335966..92bba461e1e0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -30,13 +30,45 @@ #include <linux/atomic.h> #include <linux/workqueue.h> #include <linux/spinlock.h> +#include <linux/idr.h> #include <linux/kfd_ioctl.h> +#include <linux/pid.h> +#include <linux/interval_tree.h> #include <kgd_kfd_interface.h> +#include "amd_rdma.h" + #define KFD_SYSFS_FILE_MODE 0444 -#define KFD_MMAP_DOORBELL_MASK 0x8000000000000 -#define KFD_MMAP_EVENTS_MASK 0x4000000000000 +/* GPU ID hash width in bits */ +#define KFD_GPU_ID_HASH_WIDTH 16 + +/* Use upper bits of mmap offset to store KFD driver specific information. + * BITS[63:62] - Encode MMAP type + * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to + * BITS[45:40] - Reserved. Not Used. + * BITS[39:0] - MMAP offset value. Used by TTM. + * + * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. 
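The comment above describes how KFD now multiplexes the mmap offset: bits 63:62 select the mapping type, bits 61:46 carry the gpu_id, bits 39:0 are the plain offset, and everything is expressed in pages because vm_pgoff is page-based. A runnable round-trip of that packing; PAGE_SHIFT 12 and the gpu_id are example values:

// Round-trip of the KFD mmap-offset packing; PAGE_SHIFT 12 and
// gpu_id 0x1a2b are example values.
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define PAGE_SHIFT              12
#define KFD_GPU_ID_HASH_WIDTH   16

#define KFD_MMAP_TYPE_SHIFT     (62 - PAGE_SHIFT)
#define KFD_MMAP_TYPE_MASK      (0x3ULL << KFD_MMAP_TYPE_SHIFT)
#define KFD_MMAP_TYPE_EVENTS    (0x2ULL << KFD_MMAP_TYPE_SHIFT)

#define KFD_MMAP_GPU_ID_SHIFT   (46 - PAGE_SHIFT)
#define KFD_MMAP_GPU_ID_MASK    (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \
                                 << KFD_MMAP_GPU_ID_SHIFT)
#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)(gpu_id)) << KFD_MMAP_GPU_ID_SHIFT) \
                                 & KFD_MMAP_GPU_ID_MASK)
#define KFD_MMAP_GPU_ID_GET(off) (((off) & KFD_MMAP_GPU_ID_MASK) \
                                  >> KFD_MMAP_GPU_ID_SHIFT)

int main(void)
{
        uint64_t pgoff = KFD_MMAP_TYPE_EVENTS | KFD_MMAP_GPU_ID(0x1a2b);

        printf("vm_pgoff       0x%016" PRIx64 "\n", pgoff);
        printf("type == events %d\n",
               (pgoff & KFD_MMAP_TYPE_MASK) == KFD_MMAP_TYPE_EVENTS);
        printf("gpu_id         0x%" PRIx64 "\n",
               (uint64_t)KFD_MMAP_GPU_ID_GET(pgoff));
        return 0;
}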
Hence, these + * defines are w.r.t to PAGE_SIZE + */ +#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) +#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_MAP_BO (0x1ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_RESERVED_MEM (0x0ULL << KFD_MMAP_TYPE_SHIFT) + +#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) +#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ + << KFD_MMAP_GPU_ID_SHIFT) +#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ + & KFD_MMAP_GPU_ID_MASK) +#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ + >> KFD_MMAP_GPU_ID_SHIFT) + +#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT) +#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) /* * When working with cp scheduler we should assign the HIQ manually or via @@ -48,8 +80,6 @@ #define KFD_CIK_HIQ_PIPE 4 #define KFD_CIK_HIQ_QUEUE 0 -/* GPU ID hash width in bits */ -#define KFD_GPU_ID_HASH_WIDTH 16 /* Macro for allocating structures */ #define kfd_alloc_struct(ptr_to_struct) \ @@ -74,12 +104,26 @@ extern int max_num_of_queues_per_device; /* Kernel module parameter to specify the scheduling policy */ extern int sched_policy; +extern int cwsr_enable; + +/* + * Kernel module parameter to specify the maximum process + * number per HW scheduler + */ +extern int hws_max_conc_proc; + /* * Kernel module parameter to specify whether to send sigterm to HSA process on * unhandled exception */ extern int send_sigterm; +/* + * This kernel module is used to simulate large bar machine on non-large bar + * enabled machines. + */ +extern int debug_largebar; + /** * enum kfd_sched_policy * @@ -114,14 +158,17 @@ enum cache_policy { enum asic_family_type { CHIP_KAVERI = 0, - CHIP_CARRIZO + CHIP_CARRIZO, + CHIP_TONGA, + CHIP_FIJI }; +#define KFD_IS_VI(chip) ((chip) >= CHIP_CARRIZO && (chip) <= CHIP_FIJI) +#define KFD_IS_DGPU(chip) ((chip) >= CHIP_TONGA && (chip) <= CHIP_FIJI) + struct kfd_event_interrupt_class { - bool (*interrupt_isr)(struct kfd_dev *dev, - const uint32_t *ih_ring_entry); - void (*interrupt_wq)(struct kfd_dev *dev, - const uint32_t *ih_ring_entry); + bool (*interrupt_isr)(struct kfd_dev *dev, const uint32_t *ih_ring_entry); + void (*interrupt_wq)(struct kfd_dev *dev, const uint32_t *ih_ring_entry); }; struct kfd_device_info { @@ -132,6 +179,7 @@ struct kfd_device_info { size_t ih_ring_entry_size; uint8_t num_of_watch_points; uint16_t mqd_size_aligned; + bool is_need_iommu_device; }; struct kfd_mem_obj { @@ -141,6 +189,12 @@ struct kfd_mem_obj { uint32_t *cpu_ptr; }; +struct kfd_vmid_info { + uint32_t first_vmid_kfd; + uint32_t last_vmid_kfd; + uint32_t vmid_num_kfd; +}; + struct kfd_dev { struct kgd_dev *kgd; @@ -165,11 +219,12 @@ struct kfd_dev { */ struct kgd2kfd_shared_resources shared_resources; + struct kfd_vmid_info vm_info; const struct kfd2kgd_calls *kfd2kgd; struct mutex doorbell_mutex; - DECLARE_BITMAP(doorbell_available_index, - KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + unsigned long doorbell_available_index[DIV_ROUND_UP( + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; void *gtt_mem; uint64_t gtt_start_gpu_addr; @@ -179,6 +234,11 @@ struct kfd_dev { unsigned int gtt_sa_chunk_size; unsigned int gtt_sa_num_of_chunks; + /* QCM Device instance */ + struct device_queue_manager *dqm; + + bool init_complete; + /* Interrupts */ void *interrupt_ring; size_t 
interrupt_ring_size; @@ -187,10 +247,6 @@ struct kfd_dev { struct work_struct interrupt_work; spinlock_t interrupt_lock; - /* QCM Device instance */ - struct device_queue_manager *dqm; - - bool init_complete; /* * Interrupts of interest to KFD are copied * from the HW ring into a SW ring. @@ -198,7 +254,26 @@ struct kfd_dev { bool interrupts_active; /* Debug manager */ - struct kfd_dbgmgr *dbgmgr; + struct kfd_dbgmgr *dbgmgr; + + /* MEC firmware version*/ + uint16_t mec_fw_version; + + /* Maximum process number mapped to HW scheduler */ + unsigned int max_proc_per_quantum; + + /* cwsr */ + bool cwsr_enabled; + struct page *cwsr_pages; + uint32_t cwsr_size; + uint32_t tma_offset; /*Offset for TMA from the start of cwsr_mem*/ +}; + +struct kfd_bo { + void *mem; + struct interval_tree_node it; + struct kfd_dev *dev; + struct list_head cb_data_head; }; /* KGD2KFD callbacks */ @@ -221,22 +296,22 @@ void kfd_chardev_exit(void); struct device *kfd_chardev(void); /** - * enum kfd_preempt_type_filter + * enum kfd_unmap_queues_filter * - * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue. + * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue. * - * @KFD_PRERMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the + * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the * running queues list. * - * @KFD_PRERMPT_TYPE_FILTER_BY_PASID: Preempts queues that belongs to + * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to * specific process. * */ -enum kfd_preempt_type_filter { - KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE, - KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, - KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, - KFD_PREEMPT_TYPE_FILTER_BY_PASID +enum kfd_unmap_queues_filter { + KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE, + KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, + KFD_UNMAP_QUEUES_FILTER_BY_PASID }; enum kfd_preempt_type { @@ -324,6 +399,7 @@ struct queue_properties { uint32_t __iomem *doorbell_ptr; uint32_t doorbell_off; bool is_interop; + bool is_evicted; /* true -> queue is evicted */ bool is_active; /* Not relevant for user mode queues in cp scheduling */ unsigned int vmid; @@ -336,6 +412,11 @@ struct queue_properties { uint32_t eop_ring_buffer_size; uint64_t ctx_save_restore_area_address; uint32_t ctx_save_restore_area_size; + uint32_t ctl_stack_size; + uint64_t tba_addr; + uint64_t tma_addr; + /* Relevant for CU */ + uint32_t cu_mask; }; /** @@ -424,6 +505,7 @@ struct qcm_process_device { unsigned int queue_count; unsigned int vmid; bool is_debug; + unsigned evicted; /* eviction counter, 0=active */ /* * All the memory management data should be here too */ @@ -436,8 +518,22 @@ struct qcm_process_device { uint32_t gds_size; uint32_t num_gws; uint32_t num_oac; + uint32_t sh_hidden_private_base; + + /*cwsr memory*/ + int cwsr_mem_handle; + uint64_t cwsr_base; + uint64_t tba_addr; + uint64_t tma_addr; + void *cwsr_kaddr; }; +/*8 byte handle containing GPU ID in the most significant 4 bytes and + * idr_handle in the least significant 4 bytes*/ +#define MAKE_HANDLE(gpu_id, idr_handle) (((uint64_t)(gpu_id) << 32) + idr_handle) +#define GET_GPU_ID(handle) (handle >> 32) +#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) + /* Data that is per-process-per device. */ struct kfd_process_device { /* @@ -449,6 +545,8 @@ struct kfd_process_device { /* The device that owns this data. */ struct kfd_dev *dev; + /* The process that owns this kfd_process_device. 
*/ + struct kfd_process *process; /* per-process-per device QCM data structure */ struct qcm_process_device qpd; @@ -460,10 +558,23 @@ struct kfd_process_device { uint64_t gpuvm_limit; uint64_t scratch_base; uint64_t scratch_limit; + uint64_t dgpu_base; + uint64_t dgpu_limit; + uint64_t mapped_size; + uint64_t last_eviction; + bool evicted; + + uint64_t sh_hidden_private_base_vmid; /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ bool bound; + /* VM context for GPUVM allocations */ + void *vm; + + /* GPUVM allocations storage */ + struct idr alloc_idr; + /* This flag tells if we should reset all * wavefronts on process termination */ @@ -482,7 +593,7 @@ struct kfd_process { struct mm_struct *mm; - struct mutex mutex; + struct rw_semaphore lock; /* * In any process, the thread that started main() is the lead @@ -513,6 +624,8 @@ struct kfd_process { /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ struct kfd_queue **queues; + unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; + /*Is the user space process 32 bit?*/ bool is_32bit_user_mode; @@ -520,10 +633,12 @@ struct kfd_process { struct mutex event_mutex; /* All events in process hashed by ID, linked on kfd_event.events. */ DECLARE_HASHTABLE(events, 4); - struct list_head signal_event_pages; /* struct slot_page_header. - event_pages */ + struct list_head signal_event_pages; /* struct slot_page_header.event_pages */ u32 next_nonsignal_event_id; size_t signal_event_count; + size_t debug_event_count; + + struct rb_root bo_interval_tree; }; /** @@ -546,9 +661,10 @@ struct amdkfd_ioctl_desc { void kfd_process_create_wq(void); void kfd_process_destroy_wq(void); -struct kfd_process *kfd_create_process(const struct task_struct *); +struct kfd_process *kfd_create_process(struct file *filep); struct kfd_process *kfd_get_process(const struct task_struct *); struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); +struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, struct kfd_process *p); @@ -558,6 +674,29 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, struct kfd_process *p); +int kfd_reserved_mem_mmap(struct kfd_process *process, struct vm_area_struct *vma); + +/* KFD process API for creating and translating handles */ +int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, + void *mem, uint64_t start, + uint64_t length); +void *kfd_process_device_translate_handle(struct kfd_process_device *p, + int handle); +struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, + int handle); +void *kfd_process_find_bo_from_interval(struct kfd_process *p, + uint64_t start_addr, + uint64_t last_addr); +void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, + int handle); + +void run_rdma_free_callback(struct kfd_bo *buf_obj); +struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid); + +/* kfd dgpu memory */ +int kfd_map_memory_to_gpu(struct kfd_dev *dev, void *mem, + struct kfd_process *p, struct kfd_process_device *pdd); + /* Process device data iterator */ struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p); struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, @@ -600,7 +739,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu); int 
kfd_topology_remove_device(struct kfd_dev *gpu); struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); -struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); +struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd); +uint32_t kfd_get_gpu_id(struct kfd_dev *dev); +int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); +int kfd_numa_node_to_apic_id(int numa_node_id); +int kfd_get_proximity_domain(const struct pci_bus *bus); /* Interrupts */ int kfd_interrupt_init(struct kfd_dev *dev); @@ -615,11 +758,13 @@ int kgd2kfd_resume(struct kfd_dev *kfd); /* amdkfd Apertures */ int kfd_init_apertures(struct kfd_process *process); +int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, + uint64_t base, uint64_t limit); /* Queue Context Management */ struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); -int init_queue(struct queue **q, struct queue_properties properties); +int init_queue(struct queue **q, const struct queue_properties *properties); void uninit_queue(struct queue *q); void print_queue_properties(struct queue_properties *q); void print_queue(struct queue *q); @@ -630,11 +775,15 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, enum kfd_queue_type type); void kernel_queue_uninit(struct kernel_queue *kq); +int kfd_process_vm_fault(struct device_queue_manager *dqm, + unsigned int pasid); /* Process Queue Manager */ struct process_queue_node { @@ -649,18 +798,16 @@ int pqm_create_queue(struct process_queue_manager *pqm, struct kfd_dev *dev, struct file *f, struct queue_properties *properties, - unsigned int flags, - enum kfd_queue_type type, unsigned int *qid); int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, struct queue_properties *p); +int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p); struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, unsigned int qid); - -int amdkfd_fence_wait_timeout(unsigned int *fence_addr, - unsigned int fence_value, - unsigned long timeout); +int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm); +int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); /* Packet Manager */ @@ -668,7 +815,9 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, #define KFD_FENCE_COMPLETED (100) #define KFD_FENCE_INIT (10) -#define KFD_UNMAP_LATENCY (150) +#define KFD_UNMAP_LATENCY (40) + +struct packet_manager_firmware; struct packet_manager { struct device_queue_manager *dqm; @@ -676,9 +825,19 @@ struct packet_manager { struct mutex lock; bool allocated; struct kfd_mem_obj *ib_buffer_obj; + + struct packet_manager_firmware *pmf; }; -int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); +struct packet_manager_firmware { + /* Support different firmware versions for map process packet */ + int (*map_process)(struct packet_manager *pm, uint32_t *buffer, + struct qcm_process_device *qpd); + int (*get_map_process_packet_size)(void); +}; + +int 
pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, + uint16_t fw_ver); void pm_uninit(struct packet_manager *pm); int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res); @@ -687,7 +846,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, uint32_t fence_value); int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, - enum kfd_preempt_type_filter mode, + enum kfd_unmap_queues_filter mode, uint32_t filter_param, bool reset, unsigned int sdma_engine); @@ -696,6 +855,9 @@ void pm_release_ib(struct packet_manager *pm); uint64_t kfd_get_number_elems(struct kfd_dev *kfd); phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, struct kfd_process *process); +int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + unsigned int fence_value, + unsigned long timeout); /* Events */ extern const struct kfd_event_interrupt_class event_interrupt_class_cik; @@ -714,8 +876,7 @@ int kfd_wait_on_events(struct kfd_process *p, uint32_t num_events, void __user *data, bool all, uint32_t user_timeout_ms, enum kfd_event_wait_result *wait_result); -void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, - uint32_t valid_id_bits); +void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits); void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, unsigned long address, bool is_write_requested, bool is_execute_requested); @@ -723,11 +884,28 @@ void kfd_signal_hw_exception_event(unsigned int pasid); int kfd_set_event(struct kfd_process *p, uint32_t event_id); int kfd_reset_event(struct kfd_process *p, uint32_t event_id); int kfd_event_create(struct file *devkfd, struct kfd_process *p, - uint32_t event_type, bool auto_reset, uint32_t node_id, - uint32_t *event_id, uint32_t *event_trigger_data, - uint64_t *event_page_offset, uint32_t *event_slot_index); + uint32_t event_type, bool auto_reset, uint32_t node_id, + uint32_t *event_id, uint32_t *event_trigger_data, + uint64_t *event_page_offset, uint32_t *event_slot_index, + void *kern_addr); int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); +void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle); + +void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, + struct kfd_vm_fault_info *info); + +void radeon_flush_tlb(struct kfd_dev *dev, uint32_t pasid); int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); +int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem); +int kgd2kfd_restore(struct kfd_dev *kfd); +int evict_size(struct kfd_process *p, int size, int type); +int evict_bo(struct kfd_dev *dev, void *mem); +int restore(struct kfd_dev *kfd); + +#define KFD_SCRATCH_CZ_FW_VER 600 +#define KFD_SCRATCH_KV_FW_VER 413 +#define KFD_MULTI_PROC_MAPPING_HWS_SUPPORT 600 +#define KFD_CWSR_CZ_FW_VER 625 #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 4f3849ac8c07..dfd2e0d4f544 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -27,6 +27,10 @@ #include <linux/amd-iommu.h> #include <linux/notifier.h> #include <linux/compat.h> +#include <linux/mm.h> +#include <asm/tlb.h> +#include <linux/highmem.h> +#include <uapi/asm-generic/mman-common.h> struct mm_struct; @@ -40,6 +44,7 @@ struct mm_struct; */ #define INITIAL_QUEUE_ARRAY_SIZE 16 +static int evict_pdd(struct kfd_process_device *pdd); /* * List of struct kfd_process (field kfd_process). 
* Unique/indexed by mm_struct* @@ -57,8 +62,14 @@ struct kfd_process_release_work { struct kfd_process *p; }; -static struct kfd_process *find_process(const struct task_struct *thread); +#define MIN_IDR_ID 1 +#define MAX_IDR_ID 0 /*0 - for unlimited*/ + +static struct kfd_process *find_process(const struct task_struct *thread, + bool lock); static struct kfd_process *create_process(const struct task_struct *thread); +static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); + void kfd_process_create_wq(void) { @@ -74,10 +85,12 @@ void kfd_process_destroy_wq(void) } } -struct kfd_process *kfd_create_process(const struct task_struct *thread) +struct kfd_process *kfd_create_process(struct file *filep) { struct kfd_process *process; + struct task_struct *thread = current; + BUG_ON(!kfd_process_wq); if (thread->mm == NULL) @@ -98,7 +111,7 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) mutex_lock(&kfd_processes_mutex); /* A prior open of /dev/kfd could have already created the process. */ - process = find_process(thread); + process = find_process(thread, false); if (process) pr_debug("kfd: process already found\n"); @@ -109,6 +122,8 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) up_write(&thread->mm->mmap_sem); + kfd_process_init_cwsr(process, filep); + return process; } @@ -123,7 +138,7 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) if (thread->group_leader->mm != thread->mm) return ERR_PTR(-EINVAL); - process = find_process(thread); + process = find_process(thread, false); return process; } @@ -140,23 +155,164 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) return NULL; } -static struct kfd_process *find_process(const struct task_struct *thread) +static struct kfd_process *find_process(const struct task_struct *thread, + bool lock) { struct kfd_process *p; int idx; idx = srcu_read_lock(&kfd_processes_srcu); p = find_process_by_mm(thread->mm); + if (p && lock) + down_read(&p->lock); srcu_read_unlock(&kfd_processes_srcu, idx); return p; } +/* This returns with process->lock read-locked. 
*/ +struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid) +{ + struct task_struct *task = NULL; + struct kfd_process *p = NULL; + + if (!pid) + task = current; + else + task = get_pid_task(pid, PIDTYPE_PID); + + if (task) + p = find_process(task, true); + + return p; +} + +int evict_size(struct kfd_process *process, int size, int type) +{ + struct kfd_process_device *pdd, *temp_pdd = NULL; + struct kfd_process *p = process; + int temp = 0; + + down_write(&p->lock); + + if (type == EVICT_FIRST_PDD) { + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", + pdd->dev->id, p->pasid); + if (pdd->mapped_size >= size) { + evict_pdd(pdd); + return 0; + } + + } + } else if (type == EVICT_BIGGEST_PDD) { + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", + pdd->dev->id, p->pasid); + if (pdd->mapped_size >= temp) { + temp = pdd->mapped_size; + temp_pdd = pdd; + } + + } + if (temp_pdd->mapped_size > size) { + evict_pdd(temp_pdd); + return 0; + } + + } + up_write(&p->lock); + return 0; + +} + +int evict_bo(struct kfd_dev *dev, void *mem) +{ + struct kfd_process_device *pdd; + + pdd = dev->kfd2kgd->get_pdd_from_buffer_object(dev->kgd, + ((struct kgd_mem *)mem)); + + if (pdd) + evict_pdd(pdd); + + return 0; +} + +static int evict_pdd(struct kfd_process_device *pdd) +{ + void *mem; + int id; + + /*process_evict_queues(struct device_queue_manager *dqm, pdd->qpd)*/ + /* + * Remove all handles from idr and release appropriate + * local memory object + */ + idr_for_each_entry(&pdd->alloc_idr, mem, id) { + pdd->dev->kfd2kgd->unmap_memory_to_gpu( + pdd->dev->kgd, mem, pdd->vm); + } + pdd->last_eviction = jiffies; + pdd->mapped_size = 0; + pdd->evicted = true; + + /*flush_tlb_all();*/ + + return 0; +} + +int restore(struct kfd_dev *kfd) +{ + struct kfd_process *p = NULL; + /* TODO still working on how to get the process */ + struct kfd_process_device *pdd = kfd_get_process_device_data(kfd, p); + void *mem; + int id; + + /* need to run on all processes*/ + down_write(&p->lock); + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", + pdd->dev->id, p->pasid); + + /* + * Remove all handles from idr and release appropriate + * local memory object + */ + if (pdd->evicted) { + idr_for_each_entry(&pdd->alloc_idr, mem, id) { + pdd->dev->kfd2kgd->map_memory_to_gpu( + pdd->dev->kgd, + mem, pdd->vm); + pdd->last_eviction = 0; + pdd->mapped_size = 0; + } + + /*process_restore_queues + * (struct device_queue_manager *dqm, pdd->qpd)*/ + } else { + pdd->evicted = false; + } + } + up_write(&p->lock); + return 0; +} + +/* No process locking is needed in this function, because the process + * is not findable any more. We must assume that no other thread is + * using it any more, otherwise we couldn't safely free the process + * stucture in the end. 
*/ static void kfd_process_wq_release(struct work_struct *work) { struct kfd_process_release_work *my_work; - struct kfd_process_device *pdd, *temp; + struct kfd_process_device *pdd, *temp, *peer_pdd; struct kfd_process *p; + struct kfd_bo *buf_obj; + int id; my_work = (struct kfd_process_release_work *) work; @@ -165,19 +321,40 @@ static void kfd_process_wq_release(struct work_struct *work) pr_debug("Releasing process (pasid %d) in workqueue\n", p->pasid); - mutex_lock(&p->mutex); - - list_for_each_entry_safe(pdd, temp, &p->per_device_data, - per_device_list) { + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", pdd->dev->id, p->pasid); - if (pdd->reset_wavefronts) - dbgdev_wave_reset_wavefronts(pdd->dev, p); + if (pdd->dev->device_info->is_need_iommu_device) + amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); + + /* + * Remove all handles from idr and release appropriate + * local memory object + */ + idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) { + list_for_each_entry(peer_pdd, + &p->per_device_data, per_device_list) { + pdd->dev->kfd2kgd->unmap_memory_to_gpu( + peer_pdd->dev->kgd, + buf_obj->mem, peer_pdd->vm); + } - amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); - list_del(&pdd->per_device_list); + run_rdma_free_callback(buf_obj); + pdd->dev->kfd2kgd->free_memory_of_gpu( + pdd->dev->kgd, buf_obj->mem); + kfd_process_device_remove_obj_handle(pdd, id); + } + } + list_for_each_entry_safe(pdd, temp, &p->per_device_data, + per_device_list) { + radeon_flush_tlb(pdd->dev, p->pasid); + /* Destroy the GPUVM VM context */ + if (pdd->vm) + pdd->dev->kfd2kgd->destroy_process_vm( + pdd->dev->kgd, pdd->vm); + list_del(&pdd->per_device_list); kfree(pdd); } @@ -185,15 +362,11 @@ static void kfd_process_wq_release(struct work_struct *work) kfd_pasid_free(p->pasid); - mutex_unlock(&p->mutex); - - mutex_destroy(&p->mutex); - kfree(p->queues); kfree(p); - kfree(work); + kfree((void *)work); } static void kfd_process_destroy_delayed(struct rcu_head *rcu) @@ -222,6 +395,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, { struct kfd_process *p; struct kfd_process_device *pdd = NULL; + struct kfd_dev *dev = NULL; + long status = -EFAULT; /* * The kfd_process structure can not be free because the @@ -235,9 +410,31 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, mutex_unlock(&kfd_processes_mutex); synchronize_srcu(&kfd_processes_srcu); - mutex_lock(&p->mutex); + down_write(&p->lock); + + /* Iterate over all process device data structures and if the pdd is in + * debug mode,we should first force unregistration, then we will be + * able to destroy the queues */ + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + dev = pdd->dev; + mutex_lock(get_dbgmgr_mutex()); + + if ((dev != NULL) && + (dev->dbgmgr) && + (dev->dbgmgr->pasid == p->pasid)) { + + status = kfd_dbgmgr_unregister(dev->dbgmgr, p); + if (status == 0) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } + } + mutex_unlock(get_dbgmgr_mutex()); + } + + + /* now we can uninit the pqm: */ - /* In case our notifier is called before IOMMU notifier */ pqm_uninit(&p->pqm); /* Iterate over all process device data structure and check @@ -271,6 +468,94 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { .release = kfd_process_notifier_release, }; +static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep) +{ + int err; + unsigned long offset; + struct 
kfd_process_device *temp, *pdd = NULL; + void *mem = NULL; + struct kfd_dev *dev = NULL; + struct qcm_process_device *qpd = NULL; + + down_write(&p->lock); + list_for_each_entry_safe(pdd, temp, &p->per_device_data, + per_device_list) { + dev = pdd->dev; + qpd = &pdd->qpd; + if (!dev->cwsr_enabled || qpd->tba_addr) + continue; + if (qpd->cwsr_base) { + /* cwsr_base is only set for DGPU */ + + /* can't hold the process lock while + * allocating from KGD */ + up_write(&p->lock); + + err = dev->kfd2kgd->alloc_memory_of_gpu( + dev->kgd, qpd->cwsr_base, dev->cwsr_size, + pdd->vm, (struct kgd_mem **)&mem, + NULL, &qpd->cwsr_kaddr, pdd, + ALLOC_MEM_FLAGS_GTT | + ALLOC_MEM_FLAGS_NONPAGED | + ALLOC_MEM_FLAGS_EXECUTE_ACCESS | + ALLOC_MEM_FLAGS_NO_SUBSTITUTE); + if (err) + goto err_alloc_tba; + err = kfd_map_memory_to_gpu(dev, mem, p, pdd); + if (err) + goto err_map_tba; + + down_write(&p->lock); + /* Check if someone else allocated the memory + * while we weren't looking */ + if (qpd->tba_addr) { + up_write(&p->lock); + dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, + (struct kgd_mem *)mem, pdd->vm); + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem); + down_write(&p->lock); + } else { + qpd->cwsr_mem_handle = + kfd_process_device_create_obj_handle( + pdd, mem, qpd->cwsr_base, + dev->cwsr_size); + if (qpd->cwsr_mem_handle < 0) + goto err_create_handle; + + memcpy(qpd->cwsr_kaddr, kmap(dev->cwsr_pages), + PAGE_SIZE); + kunmap(dev->cwsr_pages); + qpd->tba_addr = qpd->cwsr_base; + } + } else { + offset = (kfd_get_gpu_id(dev) | + KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT; + qpd->tba_addr = (uint64_t)vm_mmap(filep, 0, + dev->cwsr_size, PROT_READ | PROT_EXEC, + MAP_SHARED, offset); + qpd->cwsr_kaddr = (void *)qpd->tba_addr; + } + if (IS_ERR_VALUE(qpd->tba_addr)) { + pr_err("Failure to set tba address. 
error -%d.\n", + (int)qpd->tba_addr); + qpd->tba_addr = 0; + qpd->cwsr_kaddr = NULL; + } else + qpd->tma_addr = qpd->tba_addr + dev->tma_offset; + pr_debug("set tba :0x%llx, tma:0x%llx for pqm.\n", + qpd->tba_addr, qpd->tma_addr); + } + +err_create_handle: + up_write(&p->lock); + return err; + +err_map_tba: + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem); +err_alloc_tba: + return err; +} + static struct kfd_process *create_process(const struct task_struct *thread) { struct kfd_process *process; @@ -281,6 +566,8 @@ static struct kfd_process *create_process(const struct task_struct *thread) if (!process) goto err_alloc_process; + process->bo_interval_tree = RB_ROOT; + process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, sizeof(process->queues[0]), GFP_KERNEL); if (!process->queues) @@ -290,7 +577,7 @@ static struct kfd_process *create_process(const struct task_struct *thread) if (process->pasid == 0) goto err_alloc_pasid; - mutex_init(&process->mutex); + init_rwsem(&process->lock); process->mm = thread->mm; @@ -362,8 +649,22 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, INIT_LIST_HEAD(&pdd->qpd.queues_list); INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); pdd->qpd.dqm = dev->dqm; + pdd->qpd.pqm = &p->pqm; + pdd->qpd.evicted = 0; pdd->reset_wavefronts = false; + pdd->process = p; list_add(&pdd->per_device_list, &p->per_device_data); + + /* Init idr used for memory handle translation */ + idr_init(&pdd->alloc_idr); + + /* Create the GPUVM context for this specific device */ + if (dev->kfd2kgd->create_process_vm(dev->kgd, &pdd->vm)) { + pr_err("Failed to create process VM object\n"); + list_del(&pdd->per_device_list); + kfree(pdd); + pdd = NULL; + } } return pdd; @@ -391,9 +692,11 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, if (pdd->bound) return pdd; - err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); - if (err < 0) - return ERR_PTR(err); + if (dev->device_info->is_need_iommu_device) { + err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); + if (err < 0) + return ERR_PTR(err); + } pdd->bound = true; @@ -405,6 +708,7 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) struct kfd_process *p; struct kfd_process_device *pdd; int idx, i; + long status = -EFAULT; BUG_ON(dev == NULL); @@ -478,7 +782,116 @@ bool kfd_has_process_device_data(struct kfd_process *p) return !(list_empty(&p->per_device_data)); } -/* This returns with process->mutex locked. */ +/* Create specific handle mapped to mem from process local memory idr + * Assumes that the process lock is held. 
*/ +int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, + void *mem, uint64_t start, + uint64_t length) +{ + int handle; + struct kfd_bo *buf_obj; + struct kfd_process *p; + + BUG_ON(pdd == NULL); + BUG_ON(mem == NULL); + + p = pdd->process; + + buf_obj = kmalloc(sizeof(*buf_obj), GFP_KERNEL); + + if (!buf_obj) + return -ENOMEM; + + buf_obj->it.start = start; + buf_obj->it.last = start + length - 1; + interval_tree_insert(&buf_obj->it, &p->bo_interval_tree); + + buf_obj->mem = mem; + buf_obj->dev = pdd->dev; + + INIT_LIST_HEAD(&buf_obj->cb_data_head); + + idr_preload(GFP_KERNEL); + + handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID, + GFP_NOWAIT); + + idr_preload_end(); + + return handle; +} + +struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, + int handle) +{ + BUG_ON(pdd == NULL); + + if (handle < 0) + return NULL; + + return (struct kfd_bo *)idr_find(&pdd->alloc_idr, handle); +} + +/* Translate specific handle from process local memory idr + * Assumes that the process lock is held. */ +void *kfd_process_device_translate_handle(struct kfd_process_device *pdd, + int handle) +{ + struct kfd_bo *buf_obj; + + buf_obj = kfd_process_device_find_bo(pdd, handle); + + return buf_obj->mem; +} + +void *kfd_process_find_bo_from_interval(struct kfd_process *p, + uint64_t start_addr, + uint64_t last_addr) +{ + struct interval_tree_node *it_node; + struct kfd_bo *buf_obj; + + it_node = interval_tree_iter_first(&p->bo_interval_tree, + start_addr, last_addr); + if (!it_node) { + pr_err("%llu - %llu does not relate to an existing buffer\n", + start_addr, last_addr); + return NULL; + } + + BUG_ON(NULL != interval_tree_iter_next(it_node, + start_addr, last_addr)); + + buf_obj = container_of(it_node, struct kfd_bo, it); + + return buf_obj; +} + +/* Remove specific handle from process local memory idr + * Assumes that the process lock is held. */ +void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, + int handle) +{ + struct kfd_bo *buf_obj; + struct kfd_process *p; + + BUG_ON(pdd == NULL); + + p = pdd->process; + + if (handle < 0) + return; + + buf_obj = kfd_process_device_find_bo(pdd, handle); + + idr_remove(&pdd->alloc_idr, handle); + + interval_tree_remove(&buf_obj->it, &p->bo_interval_tree); + + kfree(buf_obj); +} + +/* This returns with process->lock read-locked. */ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) { struct kfd_process *p; @@ -488,7 +901,7 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { if (p->pasid == pasid) { - mutex_lock(&p->mutex); + down_read(&p->lock); break; } } @@ -497,3 +910,53 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) return p; } + +/* This returns with process->lock read-locked. 
*/ +struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) +{ + struct kfd_process *p; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + p = find_process_by_mm(mm); + if (p != NULL) + down_read(&p->lock); + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return p; +} + +int kfd_reserved_mem_mmap(struct kfd_process *process, struct vm_area_struct *vma) +{ + unsigned long pfn, i; + int ret = 0; + struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); + + if (dev == NULL) + return -EINVAL; + if ((vma->vm_start & (PAGE_SIZE - 1)) || + (vma->vm_end & (PAGE_SIZE - 1))) { + pr_err("KFD only support page aligned memory map.\n"); + return -EINVAL; + } + + pr_debug("kfd reserved mem mmap been called.\n"); + /* We supported two reserved memory mmap in the future . + 1. Trap handler code and parameter (TBA and TMA , 2 pages total) + 2. Relaunch stack (control block, 1 page for Carrizo) + */ + + for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); ++i) { + pfn = page_to_pfn(&dev->cwsr_pages[i]); + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND + | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; + /* mapping the page to user process */ + ret = remap_pfn_range(vma, vma->vm_start + (i << PAGE_SHIFT), + pfn, PAGE_SIZE, vma->vm_page_prot); + if (ret) + break; + } + return ret; +} + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 7b69070f7ecc..8e2c9a7d8957 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -89,23 +89,36 @@ void pqm_uninit(struct process_queue_manager *pqm) { int retval; struct process_queue_node *pqn, *next; + struct kfd_process_device *pdd; + struct kfd_dev *dev = NULL; BUG_ON(!pqm); pr_debug("In func %s\n", __func__); list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { - retval = pqm_destroy_queue( - pqm, - (pqn->q != NULL) ? 
- pqn->q->properties.queue_id : - pqn->kq->queue->properties.queue_id); - - if (retval != 0) { - pr_err("kfd: failed to destroy queue\n"); - return; + if (pqn->q) + dev = pqn->q->device; + else if (pqn->kq) + dev = pqn->kq->dev; + else + BUG(); + + pdd = kfd_get_process_device_data(dev, pqm->process); + if (pdd) { + retval = dev->dqm->ops.process_termination + (dev->dqm, &pdd->qpd); + if (retval != 0) + pdd->reset_wavefronts = true; } } + + list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { + uninit_queue(pqn->q); + list_del(&pqn->process_queue_list); + kfree(pqn); + } + kfree(pqm->queue_slot_bitmap); pqm->queue_slot_bitmap = NULL; } @@ -129,7 +142,7 @@ static int create_cp_queue(struct process_queue_manager *pqm, q_properties->vmid = 0; q_properties->queue_id = qid; - retval = init_queue(q, *q_properties); + retval = init_queue(q, q_properties); if (retval != 0) goto err_init_queue; @@ -148,23 +161,19 @@ int pqm_create_queue(struct process_queue_manager *pqm, struct kfd_dev *dev, struct file *f, struct queue_properties *properties, - unsigned int flags, - enum kfd_queue_type type, unsigned int *qid) { int retval; struct kfd_process_device *pdd; - struct queue_properties q_properties; struct queue *q; struct process_queue_node *pqn; struct kernel_queue *kq; int num_queues = 0; struct queue *cur; + enum kfd_queue_type type = properties->type; BUG_ON(!pqm || !dev || !properties || !qid); - memset(&q_properties, 0, sizeof(struct queue_properties)); - memcpy(&q_properties, properties, sizeof(struct queue_properties)); q = NULL; kq = NULL; @@ -192,10 +201,9 @@ int pqm_create_queue(struct process_queue_manager *pqm, if (retval != 0) return retval; - if (list_empty(&pqm->queues)) { - pdd->qpd.pqm = pqm; + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); - } pqn = kzalloc(sizeof(struct process_queue_node), GFP_KERNEL); if (!pqn) { @@ -205,17 +213,34 @@ int pqm_create_queue(struct process_queue_manager *pqm, switch (type) { case KFD_QUEUE_TYPE_SDMA: + if (dev->dqm->sdma_queue_count >= CIK_SDMA_QUEUES) { + pr_err("kfd: over-subscription is not allowed for SDMA.\n"); + retval = -EPERM; + goto err_create_queue; + } + + retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; + pqn->kq = NULL; + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, + &q->properties.vmid); + pr_debug("DQM returned %d for create_queue\n", retval); + print_queue(q); + break; + case KFD_QUEUE_TYPE_COMPUTE: /* check if there is over subscription */ if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && - ((dev->dqm->processes_count >= VMID_PER_DEVICE) || + ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || (dev->dqm->queue_count >= PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE))) { pr_err("kfd: over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); retval = -EPERM; goto err_create_queue; } - retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid); + retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); if (retval != 0) goto err_create_queue; pqn->q = q; @@ -252,9 +277,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, list_add(&pqn->process_queue_list, &pqm->queues); if (q) { - *properties = q->properties; pr_debug("kfd: PQM done creating queue\n"); - print_queue_properties(properties); + print_queue_properties(&q->properties); } return retval; @@ -264,7 +288,8 @@ err_create_queue: 
err_allocate_pqn: /* check if queues list is empty unregister process from device */ clear_bit(*qid, pqm->queue_slot_bitmap); - if (list_empty(&pqm->queues)) + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); return retval; } @@ -313,9 +338,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (pqn->q) { dqm = pqn->q->device->dqm; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); - if (retval != 0) + if (retval != 0) { + if (retval == -ETIME) + pdd->reset_wavefronts = true; return retval; - + } uninit_queue(pqn->q); } @@ -323,7 +350,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) kfree(pqn); clear_bit(qid, pqm->queue_slot_bitmap); - if (list_empty(&pqm->queues)) + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) dqm->ops.unregister_process(dqm, &pdd->qpd); return retval; @@ -357,6 +385,31 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, return 0; } +int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p) +{ + int retval; + struct process_queue_node *pqn; + + BUG_ON(!pqm); + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_debug("amdkfd: No queue %d exists for update operation\n", + qid); + return -EFAULT; + } + + pqn->q->properties.cu_mask = p->cu_mask; + + retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, + pqn->q); + if (retval != 0) + return retval; + + return 0; +} + struct kernel_queue *pqm_get_kernel_queue( struct process_queue_manager *pqm, unsigned int qid) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 9a0c90b0702e..0ab197077f2d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -63,7 +63,7 @@ void print_queue(struct queue *q) pr_debug("Queue Device Address: 0x%p\n", q->device); } -int init_queue(struct queue **q, struct queue_properties properties) +int init_queue(struct queue **q, const struct queue_properties *properties) { struct queue *tmp; @@ -73,7 +73,7 @@ int init_queue(struct queue **q, struct queue_properties properties) if (!tmp) return -ENOMEM; - memcpy(&tmp->properties, &properties, sizeof(struct queue_properties)); + memcpy(&tmp->properties, properties, sizeof(struct queue_properties)); *q = tmp; return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c new file mode 100644 index 000000000000..69bdaf12a9eb --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c @@ -0,0 +1,296 @@ +/* + * Copyright 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/device.h> +#include <linux/export.h> +#include <linux/pid.h> +#include <linux/err.h> +#include <linux/slab.h> +#include "amd_rdma.h" +#include "kfd_priv.h" + + +struct rdma_cb { + struct list_head node; + struct amd_p2p_info amd_p2p_data; + void (*free_callback)(void *client_priv); + void *client_priv; +}; + +/** + * This function makes the pages underlying a range of GPU virtual memory + * accessible for DMA operations from another PCIe device + * + * \param address - The start address in the Unified Virtual Address + * space in the specified process + * \param length - The length of requested mapping + * \param pid - Pointer to structure pid to which address belongs. + * Could be NULL for current process address space. + * \param p2p_data - On return: Pointer to structure describing + * underlying pages/locations + * \param free_callback - Pointer to callback which will be called when access + * to such memory must be stopped immediately: Memory + * was freed, GECC events, etc. + * Client should immediately stop any transfer + * operations and returned as soon as possible. + * After return all resources associated with address + * will be release and no access will be allowed. + * \param client_priv - Pointer to be passed as parameter on + * 'free_callback; + * + * \return 0 if operation was successful + */ +static int get_pages(uint64_t address, uint64_t length, struct pid *pid, + struct amd_p2p_info **amd_p2p_data, + void (*free_callback)(void *client_priv), + void *client_priv) +{ + struct kfd_bo *buf_obj; + struct kgd_mem *mem; + struct sg_table *sg_table_tmp; + struct kfd_dev *dev; + uint64_t last = address + length - 1; + uint64_t offset; + struct kfd_process *p; + struct rdma_cb *rdma_cb_data; + int ret = 0; + + p = kfd_lookup_process_by_pid(pid); + if (!p) { + pr_err("could not find the process in %s.\n", + __func__); + return -EINVAL; + } + + buf_obj = kfd_process_find_bo_from_interval(p, address, last); + if (!buf_obj) { + pr_err("can not find a kfd_bo for the range\n"); + ret = -EINVAL; + goto out; + } + + rdma_cb_data = kmalloc(sizeof(*rdma_cb_data), GFP_KERNEL); + if (!rdma_cb_data) { + *amd_p2p_data = NULL; + ret = -ENOMEM; + goto out; + } + + mem = buf_obj->mem; + dev = buf_obj->dev; + offset = address - buf_obj->it.start; + + ret = dev->kfd2kgd->pin_get_sg_table_bo(dev->kgd, mem, + offset, length, &sg_table_tmp); + + if (ret) { + pr_err("pin_get_sg_table_bo failed.\n"); + *amd_p2p_data = NULL; + goto free_mem; + } + + rdma_cb_data->amd_p2p_data.va = address; + rdma_cb_data->amd_p2p_data.size = length; + rdma_cb_data->amd_p2p_data.pid = pid; + rdma_cb_data->amd_p2p_data.priv = buf_obj; + rdma_cb_data->amd_p2p_data.pages = sg_table_tmp; + + rdma_cb_data->free_callback = free_callback; + rdma_cb_data->client_priv = client_priv; + + list_add(&rdma_cb_data->node, &buf_obj->cb_data_head); + + *amd_p2p_data = &rdma_cb_data->amd_p2p_data; + + goto out; + +free_mem: + kfree(rdma_cb_data); +out: + up_read(&p->lock); + + return ret; +} + +static int put_pages_helper(struct amd_p2p_info *p2p_data) +{ + struct kfd_bo *buf_obj; + struct kfd_dev *dev; + struct sg_table *sg_table_tmp; + struct rdma_cb *rdma_cb_data; + + if (!p2p_data) { + pr_err("amd_p2p_info pointer is invalid.\n"); + 
return -EINVAL; + } + + rdma_cb_data = container_of(p2p_data, struct rdma_cb, amd_p2p_data); + + buf_obj = p2p_data->priv; + dev = buf_obj->dev; + sg_table_tmp = p2p_data->pages; + + list_del(&rdma_cb_data->node); + kfree(rdma_cb_data); + + dev->kfd2kgd->unpin_put_sg_table_bo(buf_obj->mem, sg_table_tmp); + + + return 0; +} + +void run_rdma_free_callback(struct kfd_bo *buf_obj) +{ + struct rdma_cb *tmp, *rdma_cb_data; + + list_for_each_entry_safe(rdma_cb_data, tmp, + &buf_obj->cb_data_head, node) { + if (rdma_cb_data->free_callback) + rdma_cb_data->free_callback( + rdma_cb_data->client_priv); + + put_pages_helper(&rdma_cb_data->amd_p2p_data); + } +} + +/** + * + * This function release resources previously allocated by get_pages() call. + * + * \param p_p2p_data - A pointer to pointer to amd_p2p_info entries + * allocated by get_pages() call. + * + * \return 0 if operation was successful + */ +static int put_pages(struct amd_p2p_info **p_p2p_data) +{ + struct kfd_process *p = NULL; + int ret = 0; + + if (!(*p_p2p_data)) { + pr_err("amd_p2p_info pointer is invalid.\n"); + return -EINVAL; + } + + p = kfd_lookup_process_by_pid((*p_p2p_data)->pid); + if (!p) { + pr_err("could not find the process in %s\n", + __func__); + return -EINVAL; + } + + ret = put_pages_helper(*p_p2p_data); + + if (!ret) + *p_p2p_data = NULL; + + up_read(&p->lock); + + return ret; +} + +/** + * Check if given address belongs to GPU address space. + * + * \param address - Address to check + * \param pid - Process to which given address belongs. + * Could be NULL if current one. + * + * \return 0 - This is not GPU address managed by AMD driver + * 1 - This is GPU address managed by AMD driver + */ +static int is_gpu_address(uint64_t address, struct pid *pid) +{ + struct kfd_bo *buf_obj; + struct kfd_process *p; + + p = kfd_lookup_process_by_pid(pid); + if (!p) { + pr_err("could not find the process in %s.\n", + __func__); + return 0; + } + + buf_obj = kfd_process_find_bo_from_interval(p, address, address); + + up_read(&p->lock); + if (!buf_obj) + return 0; + else + return 1; +} + +/** + * Return the single page size to be used when building scatter/gather table + * for given range. + * + * \param address - Address + * \param length - Range length + * \param pid - Process id structure. Could be NULL if current one. + * \param page_size - On return: Page size + * + * \return 0 if operation was successful + */ +static int get_page_size(uint64_t address, uint64_t length, struct pid *pid, + unsigned long *page_size) +{ + /* + * As local memory is always consecutive, we can assume the local + * memory page size to be arbitrary. + * Currently we assume the local memory page size to be the same + * as system memory, which is 4KB. + */ + *page_size = PAGE_SIZE; + + return 0; +} + + +/** + * Singleton object: rdma interface function pointers + */ +static const struct amd_rdma_interface rdma_ops = { + .get_pages = get_pages, + .put_pages = put_pages, + .is_gpu_address = is_gpu_address, + .get_page_size = get_page_size, +}; + +/** + * amdkfd_query_rdma_interface - Return interface (function pointers table) for + * rdma interface + * + * + * \param interace - OUT: Pointer to interface + * + * \return 0 if operation was successful. 
+ */ +int amdkfd_query_rdma_interface(const struct amd_rdma_interface **ops) +{ + *ops = &rdma_ops; + + return 0; +} +EXPORT_SYMBOL(amdkfd_query_rdma_interface); + + + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 884c96f50c3d..4e357eb068bf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -28,16 +28,19 @@ #include <linux/hash.h> #include <linux/cpufreq.h> #include <linux/log2.h> +#include <linux/dmi.h> +#include <linux/atomic.h> #include "kfd_priv.h" #include "kfd_crat.h" #include "kfd_topology.h" -static struct list_head topology_device_list; -static int topology_crat_parsed; +/* topology_device_list - Master list of all topology devices */ +struct list_head topology_device_list; static struct kfd_system_properties sys_props; static DECLARE_RWSEM(topology_lock); +static atomic_t topology_crat_proximity_domain; struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) { @@ -57,311 +60,61 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) return device; } -struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) +uint32_t kfd_get_gpu_id(struct kfd_dev *dev) { struct kfd_topology_device *top_dev; - struct kfd_dev *device = NULL; + uint32_t gpu_id = 0; down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu->pdev == pdev) { - device = top_dev->gpu; + if (top_dev->gpu == dev) { + gpu_id = top_dev->gpu_id; break; } up_read(&topology_lock); - return device; -} - -static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) -{ - struct acpi_table_header *crat_table; - acpi_status status; - - if (!size) - return -EINVAL; - - /* - * Fetch the CRAT table from ACPI - */ - status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); - if (status == AE_NOT_FOUND) { - pr_warn("CRAT table not found\n"); - return -ENODATA; - } else if (ACPI_FAILURE(status)) { - const char *err = acpi_format_exception(status); - - pr_err("CRAT table error: %s\n", err); - return -EINVAL; - } - - if (*size >= crat_table->length && crat_image != NULL) - memcpy(crat_image, crat_table, crat_table->length); - - *size = crat_table->length; - - return 0; -} - -static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, - struct crat_subtype_computeunit *cu) -{ - BUG_ON(!dev); - BUG_ON(!cu); - - dev->node_props.cpu_cores_count = cu->num_cpu_cores; - dev->node_props.cpu_core_id_base = cu->processor_id_low; - if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) - dev->node_props.capability |= HSA_CAP_ATS_PRESENT; - - pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, - cu->processor_id_low); -} - -static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, - struct crat_subtype_computeunit *cu) -{ - BUG_ON(!dev); - BUG_ON(!cu); - - dev->node_props.simd_id_base = cu->processor_id_low; - dev->node_props.simd_count = cu->num_simd_cores; - dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; - dev->node_props.max_waves_per_simd = cu->max_waves_simd; - dev->node_props.wave_front_size = cu->wave_front_size; - dev->node_props.mem_banks_count = cu->num_banks; - dev->node_props.array_count = cu->num_arrays; - dev->node_props.cu_per_simd_array = cu->num_cu_per_array; - dev->node_props.simd_per_cu = cu->num_simd_per_cu; - dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; - if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) - dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; - pr_info("CU GPU: simds=%d id_base=%d\n", 
cu->num_simd_cores, - cu->processor_id_low); -} - -/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ -static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) -{ - struct kfd_topology_device *dev; - int i = 0; - - BUG_ON(!cu); - - pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", - cu->proximity_domain, cu->hsa_capability); - list_for_each_entry(dev, &topology_device_list, list) { - if (cu->proximity_domain == i) { - if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) - kfd_populated_cu_info_cpu(dev, cu); - - if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) - kfd_populated_cu_info_gpu(dev, cu); - break; - } - i++; - } - - return 0; + return gpu_id; } -/* - * kfd_parse_subtype_mem is called when the topology mutex is - * already acquired - */ -static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) { - struct kfd_mem_properties *props; - struct kfd_topology_device *dev; - int i = 0; - - BUG_ON(!mem); - - pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", - mem->promixity_domain); - list_for_each_entry(dev, &topology_device_list, list) { - if (mem->promixity_domain == i) { - props = kfd_alloc_struct(props); - if (props == NULL) - return -ENOMEM; - - if (dev->node_props.cpu_cores_count == 0) - props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; - else - props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; - - if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) - props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; - if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) - props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; - - props->size_in_bytes = - ((uint64_t)mem->length_high << 32) + - mem->length_low; - props->width = mem->width; + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; - dev->mem_bank_count++; - list_add_tail(&props->list, &dev->mem_props); + down_read(&topology_lock); + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu && top_dev->gpu->pdev == pdev) { + device = top_dev->gpu; break; } - i++; - } - - return 0; -} -/* - * kfd_parse_subtype_cache is called when the topology mutex - * is already acquired - */ -static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) -{ - struct kfd_cache_properties *props; - struct kfd_topology_device *dev; - uint32_t id; - - BUG_ON(!cache); - - id = cache->processor_id_low; - - pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); - list_for_each_entry(dev, &topology_device_list, list) - if (id == dev->node_props.cpu_core_id_base || - id == dev->node_props.simd_id_base) { - props = kfd_alloc_struct(props); - if (props == NULL) - return -ENOMEM; - - props->processor_id_low = id; - props->cache_level = cache->cache_level; - props->cache_size = cache->cache_size; - props->cacheline_size = cache->cache_line_size; - props->cachelines_per_tag = cache->lines_per_tag; - props->cache_assoc = cache->associativity; - props->cache_latency = cache->cache_latency; - - if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) - props->cache_type |= HSA_CACHE_TYPE_DATA; - if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) - props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; - if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) - props->cache_type |= HSA_CACHE_TYPE_CPU; - if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) - props->cache_type |= HSA_CACHE_TYPE_HSACU; - - dev->cache_count++; - dev->node_props.caches_count++; - list_add_tail(&props->list, &dev->cache_props); - - break; - } + 
up_read(&topology_lock); - return 0; + return device; } -/* - * kfd_parse_subtype_iolink is called when the topology mutex - * is already acquired - */ -static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) +struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd) { - struct kfd_iolink_properties *props; - struct kfd_topology_device *dev; - uint32_t i = 0; - uint32_t id_from; - uint32_t id_to; - - BUG_ON(!iolink); - - id_from = iolink->proximity_domain_from; - id_to = iolink->proximity_domain_to; + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; - pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); - list_for_each_entry(dev, &topology_device_list, list) { - if (id_from == i) { - props = kfd_alloc_struct(props); - if (props == NULL) - return -ENOMEM; - - props->node_from = id_from; - props->node_to = id_to; - props->ver_maj = iolink->version_major; - props->ver_min = iolink->version_minor; - - /* - * weight factor (derived from CDIR), currently always 1 - */ - props->weight = 1; - - props->min_latency = iolink->minimum_latency; - props->max_latency = iolink->maximum_latency; - props->min_bandwidth = iolink->minimum_bandwidth_mbs; - props->max_bandwidth = iolink->maximum_bandwidth_mbs; - props->rec_transfer_size = - iolink->recommended_transfer_size; - - dev->io_link_count++; - dev->node_props.io_links_count++; - list_add_tail(&props->list, &dev->io_link_props); + down_read(&topology_lock); + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu && top_dev->gpu->kgd == kgd) { + device = top_dev->gpu; break; } - i++; - } - - return 0; -} -static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) -{ - struct crat_subtype_computeunit *cu; - struct crat_subtype_memory *mem; - struct crat_subtype_cache *cache; - struct crat_subtype_iolink *iolink; - int ret = 0; - - BUG_ON(!sub_type_hdr); - - switch (sub_type_hdr->type) { - case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: - cu = (struct crat_subtype_computeunit *)sub_type_hdr; - ret = kfd_parse_subtype_cu(cu); - break; - case CRAT_SUBTYPE_MEMORY_AFFINITY: - mem = (struct crat_subtype_memory *)sub_type_hdr; - ret = kfd_parse_subtype_mem(mem); - break; - case CRAT_SUBTYPE_CACHE_AFFINITY: - cache = (struct crat_subtype_cache *)sub_type_hdr; - ret = kfd_parse_subtype_cache(cache); - break; - case CRAT_SUBTYPE_TLB_AFFINITY: - /* - * For now, nothing to do here - */ - pr_info("Found TLB entry in CRAT table (not processing)\n"); - break; - case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: - /* - * For now, nothing to do here - */ - pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n"); - break; - case CRAT_SUBTYPE_IOLINK_AFFINITY: - iolink = (struct crat_subtype_iolink *)sub_type_hdr; - ret = kfd_parse_subtype_iolink(iolink); - break; - default: - pr_warn("Unknown subtype (%d) in CRAT\n", - sub_type_hdr->type); - } + up_read(&topology_lock); - return ret; + return device; } +/* Called with write topology_lock acquired */ static void kfd_release_topology_device(struct kfd_topology_device *dev) { struct kfd_mem_properties *mem; @@ -398,20 +151,22 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) sys_props.num_devices--; } -static void kfd_release_live_view(void) +void kfd_release_live_view(void) { struct kfd_topology_device *dev; + down_write(&topology_lock); while (topology_device_list.next != &topology_device_list) { dev = container_of(topology_device_list.next, struct kfd_topology_device, list); kfd_release_topology_device(dev); -} - + } 
+ up_write(&topology_lock); memset(&sys_props, 0, sizeof(sys_props)); } -static struct kfd_topology_device *kfd_create_topology_device(void) +struct kfd_topology_device *kfd_create_topology_device( + struct list_head *device_list) { struct kfd_topology_device *dev; @@ -425,65 +180,12 @@ static struct kfd_topology_device *kfd_create_topology_device(void) INIT_LIST_HEAD(&dev->cache_props); INIT_LIST_HEAD(&dev->io_link_props); - list_add_tail(&dev->list, &topology_device_list); + list_add_tail(&dev->list, device_list); sys_props.num_devices++; return dev; } -static int kfd_parse_crat_table(void *crat_image) -{ - struct kfd_topology_device *top_dev; - struct crat_subtype_generic *sub_type_hdr; - uint16_t node_id; - int ret; - struct crat_header *crat_table = (struct crat_header *)crat_image; - uint16_t num_nodes; - uint32_t image_len; - - if (!crat_image) - return -EINVAL; - - num_nodes = crat_table->num_domains; - image_len = crat_table->length; - - pr_info("Parsing CRAT table with %d nodes\n", num_nodes); - - for (node_id = 0; node_id < num_nodes; node_id++) { - top_dev = kfd_create_topology_device(); - if (!top_dev) { - kfd_release_live_view(); - return -ENOMEM; - } - } - - sys_props.platform_id = - (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK; - sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id); - sys_props.platform_rev = crat_table->revision; - - sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); - while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < - ((char *)crat_image) + image_len) { - if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { - ret = kfd_parse_subtype(sub_type_hdr); - if (ret != 0) { - kfd_release_live_view(); - return ret; - } - } - - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length); - } - - sys_props.generation_count++; - topology_crat_parsed = 1; - - return 0; -} - - #define sysfs_show_gen_prop(buffer, fmt, ...) \ snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) #define sysfs_show_32bit_prop(buffer, name, value) \ @@ -593,7 +295,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, char *buffer) { ssize_t ret; - uint32_t i; + uint32_t i, j; struct kfd_cache_properties *cache; /* Making sure that the buffer is an empty string */ @@ -611,12 +313,18 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); sysfs_show_32bit_prop(buffer, "type", cache->cache_type); snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); - for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++) - ret = snprintf(buffer, PAGE_SIZE, "%s%d%s", - buffer, cache->sibling_map[i], - (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ? 
- "\n" : ","); - + for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) + for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { + /* Check each bit */ + if (cache->sibling_map[i] & (1 << j)) + ret = snprintf(buffer, PAGE_SIZE, + "%s%d%s", buffer, 1, ","); + else + ret = snprintf(buffer, PAGE_SIZE, + "%s%d%s", buffer, 0, ","); + } + /* Replace the last "," with end of line */ + *(buffer + strlen(buffer) - 1) = 0xA; return ret; } @@ -635,6 +343,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; uint32_t i; uint32_t log_max_watch_addr; + struct kfd_local_mem_info local_mem_info; /* Making sure that the buffer is an empty string */ buffer[0] = 0; @@ -665,16 +374,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, sysfs_show_32bit_prop(buffer, "simd_count", dev->node_props.simd_count); - if (dev->mem_bank_count < dev->node_props.mem_banks_count) { - pr_info_once("kfd: mem_banks_count truncated from %d to %d\n", - dev->node_props.mem_banks_count, - dev->mem_bank_count); - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->mem_bank_count); - } else { - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->node_props.mem_banks_count); - } + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); sysfs_show_32bit_prop(buffer, "caches_count", dev->node_props.caches_count); @@ -723,17 +424,30 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); } + if (dev->gpu->device_info->asic_family == CHIP_TONGA) + dev->node_props.capability |= + HSA_CAP_AQL_QUEUE_DOUBLE_MAP; + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", - dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( - dev->gpu->kgd)); + dev->node_props.max_engine_clk_fcompute); - sysfs_show_64bit_prop(buffer, "local_mem_size", - (unsigned long long int) 0); + /* + * If the ASIC is CZ, set local memory size to 0 to disable + * local memory support + */ + if (dev->gpu->device_info->asic_family != CHIP_CARRIZO) { + dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, + &local_mem_info); + sysfs_show_64bit_prop(buffer, "local_mem_size", + local_mem_info.local_mem_size_private + + local_mem_info.local_mem_size_public); + } + else + sysfs_show_64bit_prop(buffer, "local_mem_size", + (unsigned long long int) 0); sysfs_show_32bit_prop(buffer, "fw_version", - dev->gpu->kfd2kgd->get_fw_version( - dev->gpu->kgd, - KGD_ENGINE_MEC1)); + dev->gpu->mec_fw_version); sysfs_show_32bit_prop(buffer, "capability", dev->node_props.capability); } @@ -928,6 +642,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, return 0; } +/* Called with write topology lock acquired */ static int kfd_build_sysfs_node_tree(void) { struct kfd_topology_device *dev; @@ -944,6 +659,7 @@ static int kfd_build_sysfs_node_tree(void) return 0; } +/* Called with write topology lock acquired */ static void kfd_remove_sysfs_node_tree(void) { struct kfd_topology_device *dev; @@ -1015,101 +731,221 @@ static void kfd_topology_release_sysfs(void) } } +/* Called with write topology_lock acquired */ +static int kfd_topology_update_device_list(struct list_head *temp_list, + struct list_head *master_list) +{ + int num = 0; + + while (!list_empty(temp_list)) { + list_move_tail(temp_list->next, master_list); + num++; + } + return num; +} + +static void kfd_debug_print_topology(void) +{ + struct kfd_topology_device *dev; + + down_read(&topology_lock); + + dev = list_last_entry(&topology_device_list, 
struct kfd_topology_device, list); + if (dev) { + if (dev->node_props.cpu_cores_count && dev->node_props.simd_count) { + pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", + dev->node_props.device_id, dev->node_props.vendor_id); + } + else if (dev->node_props.cpu_cores_count) + pr_info("Topology: Add CPU node\n"); + else if (dev->node_props.simd_count) + pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", + dev->node_props.device_id, dev->node_props.vendor_id); + } + up_read(&topology_lock); +} + +/* Helper function for intializing platform_xx members of kfd_system_properties + */ +static void kfd_update_system_properties(void) +{ + struct kfd_topology_device *dev; + + down_read(&topology_lock); + dev = list_last_entry(&topology_device_list, struct kfd_topology_device, list); + if (dev) { + sys_props.platform_id = + (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); + sys_props.platform_rev = dev->oem_revision; + } + up_read(&topology_lock); +} + +static void find_system_memory(const struct dmi_header *dm, + void *private) +{ + struct kfd_mem_properties *mem; + u16 mem_width, mem_clock; + struct kfd_topology_device *kdev = + (struct kfd_topology_device *)private; + const u8 *dmi_data = (const u8 *)(dm + 1); + + if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { + mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); + mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); + list_for_each_entry(mem, &kdev->mem_props, list) { + if (mem_width != 0xFFFF && mem_width != 0) + mem->width = mem_width; + if (mem_clock != 0) + mem->mem_clk_max = mem_clock; + } + } +} +/* kfd_add_non_crat_information - Add information that is not currently + * defined in CRAT but is necessary for KFD topology + * @dev - topology device to which addition info is added + */ +static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) +{ + /* Check if CPU only node. */ + if (kdev->gpu == NULL) { + /* Add system memory information */ + dmi_walk(find_system_memory, kdev); + } + /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ +} + int kfd_topology_init(void) { void *crat_image = NULL; size_t image_size = 0; int ret; + struct list_head temp_topology_device_list; + int cpu_only_node = 0; + struct kfd_topology_device *kdev; + int proximity_domain; + int num_nodes; + + /* topology_device_list - Master list of all topology devices + * temp_topology_device_list - temporary list created while parsing CRAT + * or VCRAT. Once parsing is complete the contents of list is moved to + * topology_device_list + */ - /* - * Initialize the head for the topology device list + /* Initialize the head for the both the lists */ INIT_LIST_HEAD(&topology_device_list); + INIT_LIST_HEAD(&temp_topology_device_list); init_rwsem(&topology_lock); - topology_crat_parsed = 0; memset(&sys_props, 0, sizeof(sys_props)); + /* Proximity domains in ACPI CRAT tables start counting at + * 0. The same should be true for virtual CRAT tables created + * at this stage. GPUs added later in kfd_topology_add_device + * use a counter. */ + proximity_domain = 0; + /* - * Get the CRAT image from the ACPI + * Get the CRAT image from the ACPI. If ACPI doesn't have one + * create a virtual CRAT. + * NOTE: The current implementation expects all AMD APUs to have + * CRAT. 
If no CRAT is available, it is assumed to be a CPU */ - ret = kfd_topology_get_crat_acpi(crat_image, &image_size); - if (ret == 0 && image_size > 0) { - pr_info("Found CRAT image with size=%zd\n", image_size); - crat_image = kmalloc(image_size, GFP_KERNEL); - if (!crat_image) { - ret = -ENOMEM; - pr_err("No memory for allocating CRAT image\n"); - goto err; - } - ret = kfd_topology_get_crat_acpi(crat_image, &image_size); - - if (ret == 0) { - down_write(&topology_lock); - ret = kfd_parse_crat_table(crat_image); - if (ret == 0) - ret = kfd_topology_update_sysfs(); - up_write(&topology_lock); - } else { - pr_err("Couldn't get CRAT table size from ACPI\n"); - } - kfree(crat_image); - } else if (ret == -ENODATA) { - ret = 0; - } else { - pr_err("Couldn't get CRAT table size from ACPI\n"); + ret = kfd_create_crat_image_acpi(&crat_image, &image_size); + if (ret != 0) { + ret = kfd_create_crat_image_virtual(&crat_image, &image_size, + COMPUTE_UNIT_CPU, NULL, + proximity_domain); + cpu_only_node = 1; + } + + if (ret == 0) + ret = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + else { + pr_err("Error getting/creating CRAT table\n"); + goto err; + } + + down_write(&topology_lock); + num_nodes = kfd_topology_update_device_list(&temp_topology_device_list, + &topology_device_list); + atomic_set(&topology_crat_proximity_domain, num_nodes-1); + ret = kfd_topology_update_sysfs(); + up_write(&topology_lock); + + if (ret == 0) { + sys_props.generation_count++; + kfd_update_system_properties(); + kfd_debug_print_topology(); + pr_info("Finished initializing topology\n"); + } + else + pr_err("Failed to update topology in sysfs ret=%d\n", ret); + + /* For nodes with GPU, this information gets added + * when GPU is detected (kfd_topology_add_device). */ + if (cpu_only_node) { + /* Add additional information to CPU only node created above */ + down_write(&topology_lock); + kdev = list_first_entry(&topology_device_list, + struct kfd_topology_device, list); + up_write(&topology_lock); + kfd_add_non_crat_information(kdev); } err: - pr_info("Finished initializing topology ret=%d\n", ret); + kfd_destroy_crat_image(crat_image); return ret; } void kfd_topology_shutdown(void) { + down_write(&topology_lock); kfd_topology_release_sysfs(); + up_write(&topology_lock); kfd_release_live_view(); } -static void kfd_debug_print_topology(void) -{ - struct kfd_topology_device *dev; - uint32_t i = 0; - - pr_info("DEBUG PRINT OF TOPOLOGY:"); - list_for_each_entry(dev, &topology_device_list, list) { - pr_info("Node: %d\n", i); - pr_info("\tGPU assigned: %s\n", (dev->gpu ? 
"yes" : "no")); - pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count); - pr_info("\tSIMD count: %d", dev->node_props.simd_count); - i++; - } -} - static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) { uint32_t hashout; uint32_t buf[7]; + uint64_t local_mem_size; int i; + struct kfd_local_mem_info local_mem_info; if (!gpu) return 0; + gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info); + + local_mem_size = local_mem_info.local_mem_size_private + + local_mem_info.local_mem_size_public; + buf[0] = gpu->pdev->devfn; buf[1] = gpu->pdev->subsystem_vendor; buf[2] = gpu->pdev->subsystem_device; buf[3] = gpu->pdev->device; buf[4] = gpu->pdev->bus->number; - buf[5] = (uint32_t)(gpu->kfd2kgd->get_vmem_size(gpu->kgd) - & 0xffffffff); - buf[6] = (uint32_t)(gpu->kfd2kgd->get_vmem_size(gpu->kgd) >> 32); + buf[5] = lower_32_bits(local_mem_size); + buf[6] = upper_32_bits(local_mem_size); for (i = 0, hashout = 0; i < 7; i++) hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); return hashout; } - +/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If + * the GPU device is not already present in the topology device list + * then return NULL. This means a new topology device has to be + * created for this GPU. + * TODO: Rather than assiging @gpu to first topology device withtout + * gpu attached, it will better to have more stringent check. + */ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) { struct kfd_topology_device *dev; @@ -1117,13 +953,14 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) BUG_ON(!gpu); + down_write(&topology_lock); list_for_each_entry(dev, &topology_device_list, list) if (dev->gpu == NULL && dev->node_props.simd_count > 0) { dev->gpu = gpu; out_dev = dev; break; } - + up_write(&topology_lock); return out_dev; } @@ -1135,70 +972,146 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) */ } +/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, + * patch this after CRAT parsing. + */ +static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) +{ + struct kfd_mem_properties *mem; + struct kfd_local_mem_info local_mem_info; + + if (dev == NULL) + return; + + /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with + * single bank of VRAM local memory. + * for dGPUs - VCRAT reports only one bank of Local Memory + * for APUs - If CRAT from ACPI reports more than one bank, then + * all the banks will report the same mem_clk_max information + */ + dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, + &local_mem_info); + + list_for_each_entry(mem, &dev->mem_props, list) + mem->mem_clk_max = local_mem_info.mem_clk_max; +} + int kfd_topology_add_device(struct kfd_dev *gpu) { uint32_t gpu_id; struct kfd_topology_device *dev; - int res; + struct kfd_cu_info cu_info; + int res = 0; + struct list_head temp_topology_device_list; + void *crat_image = NULL; + size_t image_size = 0; + int proximity_domain; BUG_ON(!gpu); + INIT_LIST_HEAD(&temp_topology_device_list); + gpu_id = kfd_generate_gpu_id(gpu); pr_debug("kfd: Adding new GPU (ID: 0x%x) to topology\n", gpu_id); - down_write(&topology_lock); - /* - * Try to assign the GPU to existing topology device (generated from - * CRAT table + proximity_domain = atomic_inc_return(& + topology_crat_proximity_domain); + + /* Check to see if this gpu device exists in the topology_device_list. 
+	 * If so, assign the gpu to that device,
+	 * else create a Virtual CRAT for this gpu device and then parse that CRAT
+	 * to create a new topology device. Once created, assign the gpu to that
+	 * topology device
 	 */
 	dev = kfd_assign_gpu(gpu);
 	if (!dev) {
-		pr_info("GPU was not found in the current topology. Extending.\n");
-		kfd_debug_print_topology();
-		dev = kfd_create_topology_device();
-		if (!dev) {
-			res = -ENOMEM;
+		res = kfd_create_crat_image_virtual(&crat_image, &image_size,
+						    COMPUTE_UNIT_GPU,
+						    gpu, proximity_domain);
+		if (res == 0)
+			res = kfd_parse_crat_table(crat_image,
+				&temp_topology_device_list, proximity_domain);
+		else {
+			pr_err("Error in VCRAT for GPU (ID: 0x%x)\n", gpu_id);
 			goto err;
 		}
-		dev->gpu = gpu;
-		/*
-		 * TODO: Make a call to retrieve topology information from the
-		 * GPU vBIOS
-		 */
+		down_write(&topology_lock);
+		kfd_topology_update_device_list(&temp_topology_device_list,
			&topology_device_list);
 
 		/*
 		 * Update the SYSFS tree, since we added another topology device
 		 */
-		if (kfd_topology_update_sysfs() < 0)
-			kfd_topology_release_sysfs();
-
+		res = kfd_topology_update_sysfs();
+		up_write(&topology_lock);
+
+		if (res == 0)
+			sys_props.generation_count++;
+		else
+			pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
						gpu_id, res);
+		dev = kfd_assign_gpu(gpu);
+		BUG_ON(!dev);
 	}
 
 	dev->gpu_id = gpu_id;
 	gpu->id = gpu_id;
+
+	/* TODO: Move the following lines to function
+	 *	kfd_add_non_crat_information */
+
+	/* Fill-in additional information that is not available in CRAT but
+	 * needed for the topology */
+
+	dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info);
+	dev->node_props.simd_arrays_per_engine = cu_info.num_shader_arrays_per_engine;
+
 	dev->node_props.vendor_id = gpu->pdev->vendor;
 	dev->node_props.device_id = gpu->pdev->device;
-	dev->node_props.location_id = (gpu->pdev->bus->number << 24) +
-			(gpu->pdev->devfn & 0xffffff);
-	/*
-	 * TODO: Retrieve max engine clock values from KGD
-	 */
-
-	if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
-		dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE;
-		pr_info("amdkfd: adding doorbell packet type capability\n");
+	dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number,
		gpu->pdev->devfn);
+	dev->node_props.max_engine_clk_fcompute =
+		dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd);
+	dev->node_props.max_engine_clk_ccompute =
+		cpufreq_quick_get_max(0) / 1000;
+
+	kfd_fill_mem_clk_max_info(dev);
+
+	switch (dev->gpu->device_info->asic_family) {
+	case CHIP_KAVERI:
+		dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 <<
			HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+		break;
+	case CHIP_CARRIZO:
+	case CHIP_TONGA:
+	case CHIP_FIJI:
+		pr_debug("amdkfd: adding doorbell packet type capability\n");
+		dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 <<
			HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+		break;
 	}
 
-	res = 0;
+	/* Fix errors in CZ CRAT.
+	 * simd_count: Carrizo CRAT reports wrong simd_count, probably because it
+	 *	doesn't consider masked out CUs
+	 * capability flag: Carrizo CRAT doesn't report IOMMU flags.
+	 */
+	if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
+		dev->node_props.simd_count =
			cu_info.simd_per_cu * cu_info.cu_active_number;
+		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
+	}
+	kfd_debug_print_topology();
 
 err:
-	up_write(&topology_lock);
-
 	if (res == 0)
 		kfd_notify_gpu_change(gpu_id, 1);
+	kfd_destroy_crat_image(crat_image);
 	return res;
 }
 
@@ -1231,22 +1144,26 @@ int kfd_topology_remove_device(struct kfd_dev *gpu)
 
 	return res;
 }
 
-/*
- * When idx is out of bounds, the function will return NULL
+/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD
+ * topology. If a GPU device is found at @idx, then a valid kfd_dev pointer is
+ * returned through @kdev
+ * Return -	0: On success (@kdev will be NULL for non GPU nodes)
+ *		-1: If end of list
  */
-struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
+int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev)
 {
 	struct kfd_topology_device *top_dev;
-	struct kfd_dev *device = NULL;
 	uint8_t device_idx = 0;
 
+	*kdev = NULL;
 	down_read(&topology_lock);
 
 	list_for_each_entry(top_dev, &topology_device_list, list) {
 		if (device_idx == idx) {
-			device = top_dev->gpu;
-			break;
+			*kdev = top_dev->gpu;
+			up_read(&topology_lock);
+			return 0;
 		}
 
 		device_idx++;
@@ -1254,6 +1171,57 @@ struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
 
 	up_read(&topology_lock);
 
-	return device;
+	return -1;
+
+}
+
+static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask)
+{
+	const struct cpuinfo_x86 *cpuinfo;
+	int first_cpu_of_numa_node;
+
+	if (cpumask == NULL || cpumask == cpu_none_mask)
+		return -1;
+	first_cpu_of_numa_node = cpumask_first(cpumask);
+	cpuinfo = &cpu_data(first_cpu_of_numa_node);
+
+	return cpuinfo->apicid;
+}
+
+/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor
+ * of the given NUMA node (numa_node_id)
+ * Return -1 on failure
+ */
+int kfd_numa_node_to_apic_id(int numa_node_id)
+{
+	if (numa_node_id == -1) {
+		pr_warn("Invalid NUMA Node. Use online CPU mask\n");
+		return kfd_cpumask_to_apic_id(cpu_online_mask);
+	}
+	return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id));
+}
+
+/* kfd_get_proximity_domain - Find the proximity_domain (node id) to which
+ * the given PCI bus belongs. The CRAT table contains only the APIC ID
+ * of the parent NUMA node, so use that as the search parameter.
+ * Return -1 on failure
+ */
+int kfd_get_proximity_domain(const struct pci_bus *bus)
+{
+	struct kfd_topology_device *dev;
+	int proximity_domain = -1;
+
+	down_read(&topology_lock);
+
+	list_for_each_entry(dev, &topology_device_list, list)
+		if (dev->node_props.cpu_cores_count &&
+			dev->node_props.cpu_core_id_base ==
			kfd_cpumask_to_apic_id(cpumask_of_pcibus(bus))) {
+			proximity_domain = dev->proximity_domain;
+			break;
+		}
+
+	up_read(&topology_lock);
+	return proximity_domain;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index c3ddb9b95ff8..ab28188b492e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -39,8 +39,16 @@
 #define HSA_CAP_WATCH_POINTS_SUPPORTED		0x00000080
 #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK	0x00000f00
 #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT	8
-#define HSA_CAP_RESERVED			0xfffff000
+#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK	0x00003000
+#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT	12
+#define HSA_CAP_RESERVED			0xffffc000
+
+#define HSA_CAP_DOORBELL_TYPE_PRE_1_0		0x0
+#define HSA_CAP_DOORBELL_TYPE_1_0		0x1
+#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK	0x00000f00
+#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT	8
 #define HSA_CAP_DOORBELL_PACKET_TYPE		0x00001000
+#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP		0x00004000
 
 struct kfd_node_properties {
 	uint32_t cpu_cores_count;
@@ -91,8 +99,6 @@ struct kfd_mem_properties {
 	struct attribute	attr;
 };
 
-#define KFD_TOPOLOGY_CPU_SIBLINGS 256
-
 #define HSA_CACHE_TYPE_DATA		0x00000001
 #define HSA_CACHE_TYPE_INSTRUCTION	0x00000002
 #define HSA_CACHE_TYPE_CPU		0x00000004
@@ -109,7 +115,7 @@ struct kfd_cache_properties {
 	uint32_t		cache_assoc;
 	uint32_t		cache_latency;
 	uint32_t		cache_type;
-	uint8_t			sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS];
+	uint8_t			sibling_map[CRAT_SIBLINGMAP_SIZE];
 	struct kobject		*kobj;
 	struct attribute	attr;
 };
@@ -135,8 +141,8 @@ struct kfd_iolink_properties {
 struct kfd_topology_device {
 	struct list_head	list;
 	uint32_t		gpu_id;
+	uint32_t		proximity_domain;
 	struct kfd_node_properties	node_props;
-	uint32_t		mem_bank_count;
 	struct list_head	mem_props;
 	uint32_t		cache_count;
 	struct list_head	cache_props;
@@ -150,6 +156,9 @@ struct kfd_topology_device {
 	struct attribute	attr_gpuid;
 	struct attribute	attr_name;
 	struct attribute	attr_props;
+	uint8_t oem_id[CRAT_OEMID_LENGTH];
+	uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH];
+	uint32_t oem_revision;
 };
 
 struct kfd_system_properties {
@@ -164,6 +173,8 @@ struct kfd_system_properties {
 	struct attribute	attr_props;
 };
 
-
+struct kfd_topology_device *kfd_create_topology_device(
+		struct list_head *device_list);
+void kfd_release_live_view(void);
 
 #endif /* __KFD_TOPOLOGY_H__ */
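
Editor's note (not part of the patch): the kfd_topology.c hunks above change kfd_topology_enum_kfd_devices() from returning a struct kfd_dev pointer to returning a status code, passing the device back through an out-parameter. Per the comment in the hunk, 0 means a topology node was found (*kdev stays NULL for CPU-only nodes) and -1 marks the end of the list. The sketch below is purely illustrative: the function name kfd_enum_example and the header it pulls the struct kfd_dev declaration from (kfd_priv.h) are assumptions, and only interfaces visible in this diff are used.

/* Hypothetical caller sketch -- not part of this diff. */
#include <linux/printk.h>

#include "kfd_priv.h"	/* assumed to declare struct kfd_dev and the enum prototype */

static void kfd_enum_example(void)
{
	struct kfd_dev *kdev;
	uint8_t idx = 0;

	/* New contract: 0 => a node exists at idx (kdev may be NULL for
	 * CPU-only nodes); -1 => end of the topology list.
	 */
	while (kfd_topology_enum_kfd_devices(idx, &kdev) == 0) {
		if (kdev)	/* GPU-backed node */
			pr_info("topology node %d is GPU 0x%x\n",
				idx, kdev->id);
		idx++;
	}
}

Relatedly, the kfd_topology.h hunk replaces the single HSA_CAP_DOORBELL_PACKET_TYPE test with a two-bit doorbell-type field; a consumer of the capability word would presumably mask with HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK, shift by HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT, and compare against HSA_CAP_DOORBELL_TYPE_1_0 or HSA_CAP_DOORBELL_TYPE_PRE_1_0.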